1 /*
2
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6 Major speed upgrades to the method implementations at the Reykjavik
7 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9 Copyright (c) Corporation for National Research Initiatives.
10
11 --------------------------------------------------------------------
12 The original string type implementation is:
13
14 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
16
17 By obtaining, using, and/or copying this software and/or its
18 associated documentation, you agree that you have read, understood,
19 and will comply with the following terms and conditions:
20
21 Permission to use, copy, modify, and distribute this software and its
22 associated documentation for any purpose and without fee is hereby
23 granted, provided that the above copyright notice appears in all
24 copies, and that both that copyright notice and this permission notice
25 appear in supporting documentation, and that the name of Secret Labs
26 AB or the author not be used in advertising or publicity pertaining to
27 distribution of the software without specific, written prior
28 permission.
29
30 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37 --------------------------------------------------------------------
38
39 */
40
41 #include "Python.h"
42 #include "pycore_abstract.h" // _PyIndex_Check()
43 #include "pycore_bytes_methods.h" // _Py_bytes_lower()
44 #include "pycore_bytesobject.h" // _PyBytes_Repeat()
45 #include "pycore_ceval.h" // _PyEval_GetBuiltin()
46 #include "pycore_codecs.h" // _PyCodec_Lookup()
47 #include "pycore_critical_section.h" // Py_*_CRITICAL_SECTION_SEQUENCE_FAST
48 #include "pycore_format.h" // F_LJUST
49 #include "pycore_initconfig.h" // _PyStatus_OK()
50 #include "pycore_interp.h" // PyInterpreterState.fs_codec
51 #include "pycore_long.h" // _PyLong_FormatWriter()
52 #include "pycore_object.h" // _PyObject_GC_TRACK(), _Py_FatalRefcountError()
53 #include "pycore_pathconfig.h" // _Py_DumpPathConfig()
54 #include "pycore_pyerrors.h" // _PyUnicodeTranslateError_Create()
55 #include "pycore_pylifecycle.h" // _Py_SetFileSystemEncoding()
56 #include "pycore_pystate.h" // _PyInterpreterState_GET()
57 #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
58 #include "pycore_unicodeobject.h" // struct _Py_unicode_state
59 #include "pycore_unicodeobject_generated.h" // _PyUnicode_InitStaticStrings()
60
61 #include "stringlib/eq.h" // unicode_eq()
62 #include <stddef.h> // ptrdiff_t
63
64 #ifdef MS_WINDOWS
65 #include <windows.h>
66 #endif
67
68 #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
69 # include "pycore_fileutils.h" // _Py_LocaleUsesNonUnicodeWchar()
70 #endif
71
72 /* Uncomment to display statistics on interned strings at exit
73 in _PyUnicode_ClearInterned(). */
74 /* #define INTERNED_STATS 1 */
75
76
77 /*[clinic input]
78 class str "PyObject *" "&PyUnicode_Type"
79 [clinic start generated code]*/
80 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
81
82 /*[python input]
83 class Py_UCS4_converter(CConverter):
84 type = 'Py_UCS4'
85 converter = 'convert_uc'
86
87 def converter_init(self):
88 if self.default is not unspecified:
89 self.c_default = ascii(self.default)
90 if len(self.c_default) > 4 or self.c_default[0] != "'":
91 self.c_default = hex(ord(self.default))
92
93 [python start generated code]*/
94 /*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
95
96 /* --- Globals ------------------------------------------------------------
97
98 NOTE: In the interpreter's initialization phase, some globals are currently
99 initialized dynamically as needed. In the process Unicode objects may
100 be created before the Unicode type is ready.
101
102 */
103
104 // Maximum code point of Unicode 6.0: 0x10ffff (1,114,111).
105 // The value must be the same in fileutils.c.
106 #define MAX_UNICODE 0x10ffff
107
108 #ifdef Py_DEBUG
109 # define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
110 #else
111 # define _PyUnicode_CHECK(op) PyUnicode_Check(op)
112 #endif
113
114 #define _PyUnicode_UTF8(op) \
115 (_PyCompactUnicodeObject_CAST(op)->utf8)
116 #define PyUnicode_UTF8(op) \
117 (assert(_PyUnicode_CHECK(op)), \
118 PyUnicode_IS_COMPACT_ASCII(op) ? \
119 ((char*)(_PyASCIIObject_CAST(op) + 1)) : \
120 _PyUnicode_UTF8(op))
121 #define _PyUnicode_UTF8_LENGTH(op) \
122 (_PyCompactUnicodeObject_CAST(op)->utf8_length)
123 #define PyUnicode_UTF8_LENGTH(op) \
124 (assert(_PyUnicode_CHECK(op)), \
125 PyUnicode_IS_COMPACT_ASCII(op) ? \
126 _PyASCIIObject_CAST(op)->length : \
127 _PyUnicode_UTF8_LENGTH(op))
128
129 #define _PyUnicode_LENGTH(op) \
130 (_PyASCIIObject_CAST(op)->length)
131 #define _PyUnicode_STATE(op) \
132 (_PyASCIIObject_CAST(op)->state)
133 #define _PyUnicode_HASH(op) \
134 (_PyASCIIObject_CAST(op)->hash)
135 #define _PyUnicode_KIND(op) \
136 (assert(_PyUnicode_CHECK(op)), \
137 _PyASCIIObject_CAST(op)->state.kind)
138 #define _PyUnicode_GET_LENGTH(op) \
139 (assert(_PyUnicode_CHECK(op)), \
140 _PyASCIIObject_CAST(op)->length)
141 #define _PyUnicode_DATA_ANY(op) \
142 (_PyUnicodeObject_CAST(op)->data.any)
143
144 #define _PyUnicode_SHARE_UTF8(op) \
145 (assert(_PyUnicode_CHECK(op)), \
146 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
147 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
148
149 /* true if the Unicode object has an allocated UTF-8 memory block
150 (not shared with other data) */
151 #define _PyUnicode_HAS_UTF8_MEMORY(op) \
152 ((!PyUnicode_IS_COMPACT_ASCII(op) \
153 && _PyUnicode_UTF8(op) \
154 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
155
156 /* Generic helper macro to convert characters of different types.
157 from_type and to_type have to be valid type names, begin and end
158 are pointers to the source characters which should be of type
159 "from_type *". to is a pointer of type "to_type *" and points to the
160 buffer where the result characters are written to. */
161 #define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
162 do { \
163 to_type *_to = (to_type *)(to); \
164 const from_type *_iter = (const from_type *)(begin);\
165 const from_type *_end = (const from_type *)(end);\
166 Py_ssize_t n = (_end) - (_iter); \
167 const from_type *_unrolled_end = \
168 _iter + _Py_SIZE_ROUND_DOWN(n, 4); \
169 while (_iter < (_unrolled_end)) { \
170 _to[0] = (to_type) _iter[0]; \
171 _to[1] = (to_type) _iter[1]; \
172 _to[2] = (to_type) _iter[2]; \
173 _to[3] = (to_type) _iter[3]; \
174 _iter += 4; _to += 4; \
175 } \
176 while (_iter < (_end)) \
177 *_to++ = (to_type) *_iter++; \
178 } while (0)
179
180 #define LATIN1 _Py_LATIN1_CHR
181
182 #ifdef MS_WINDOWS
/* On Windows, overallocating by 50% is the best factor */
184 # define OVERALLOCATE_FACTOR 2
185 #else
/* On Linux, overallocating by 25% is the best factor */
187 # define OVERALLOCATE_FACTOR 4
188 #endif
189
190 /* Forward declaration */
191 static inline int
192 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
193 static inline void
194 _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
195 static PyObject *
196 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
197 const char *errors);
198 static PyObject *
199 unicode_decode_utf8(const char *s, Py_ssize_t size,
200 _Py_error_handler error_handler, const char *errors,
201 Py_ssize_t *consumed);
202 #ifdef Py_DEBUG
203 static inline int unicode_is_finalizing(void);
204 static int unicode_is_singleton(PyObject *unicode);
205 #endif
206
207
208 // Return a reference to the immortal empty string singleton.
unicode_get_empty(void)209 static inline PyObject* unicode_get_empty(void)
210 {
211 _Py_DECLARE_STR(empty, "");
212 return &_Py_STR(empty);
213 }
214
215 /* This dictionary holds per-interpreter interned strings.
216 * See InternalDocs/string_interning.md for details.
217 */
get_interned_dict(PyInterpreterState * interp)218 static inline PyObject *get_interned_dict(PyInterpreterState *interp)
219 {
220 return _Py_INTERP_CACHED_OBJECT(interp, interned_strings);
221 }
222
223 /* This hashtable holds statically allocated interned strings.
224 * See InternalDocs/string_interning.md for details.
225 */
226 #define INTERNED_STRINGS _PyRuntime.cached_objects.interned_strings
227
228 /* Get number of all interned strings for the current interpreter. */
229 Py_ssize_t
_PyUnicode_InternedSize(void)230 _PyUnicode_InternedSize(void)
231 {
232 PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
233 return _Py_hashtable_len(INTERNED_STRINGS) + PyDict_GET_SIZE(dict);
234 }
235
236 /* Get number of immortal interned strings for the current interpreter. */
237 Py_ssize_t
_PyUnicode_InternedSize_Immortal(void)238 _PyUnicode_InternedSize_Immortal(void)
239 {
240 PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
241 PyObject *key, *value;
242 Py_ssize_t pos = 0;
243 Py_ssize_t count = 0;
244
245 // It's tempting to keep a count and avoid a loop here. But, this function
246 // is intended for refleak tests. It spends extra work to report the true
247 // value, to help detect bugs in optimizations.
248
249 while (PyDict_Next(dict, &pos, &key, &value)) {
250 assert(PyUnicode_CHECK_INTERNED(key) != SSTATE_INTERNED_IMMORTAL_STATIC);
251 if (PyUnicode_CHECK_INTERNED(key) == SSTATE_INTERNED_IMMORTAL) {
252 count++;
253 }
254 }
255 return _Py_hashtable_len(INTERNED_STRINGS) + count;
256 }
257
258 static Py_hash_t unicode_hash(PyObject *);
259 static int unicode_compare_eq(PyObject *, PyObject *);
260
261 static Py_uhash_t
hashtable_unicode_hash(const void * key)262 hashtable_unicode_hash(const void *key)
263 {
264 return unicode_hash((PyObject *)key);
265 }
266
267 static int
hashtable_unicode_compare(const void * key1,const void * key2)268 hashtable_unicode_compare(const void *key1, const void *key2)
269 {
270 PyObject *obj1 = (PyObject *)key1;
271 PyObject *obj2 = (PyObject *)key2;
272 if (obj1 != NULL && obj2 != NULL) {
273 return unicode_compare_eq(obj1, obj2);
274 }
275 else {
276 return obj1 == obj2;
277 }
278 }
279
280 /* Return true if this interpreter should share the main interpreter's
281 intern_dict. That's important for interpreters which load basic
282 single-phase init extension modules (m_size == -1). There could be interned
283 immortal strings that are shared between interpreters, due to the
284 PyDict_Update(mdict, m_copy) call in import_find_extension().
285
286 It's not safe to deallocate those strings until all interpreters that
287 potentially use them are freed. By storing them in the main interpreter, we
288 ensure they get freed after all other interpreters are freed.
289 */
290 static bool
has_shared_intern_dict(PyInterpreterState * interp)291 has_shared_intern_dict(PyInterpreterState *interp)
292 {
293 PyInterpreterState *main_interp = _PyInterpreterState_Main();
294 return interp != main_interp && interp->feature_flags & Py_RTFLAGS_USE_MAIN_OBMALLOC;
295 }
296
297 static int
init_interned_dict(PyInterpreterState * interp)298 init_interned_dict(PyInterpreterState *interp)
299 {
300 assert(get_interned_dict(interp) == NULL);
301 PyObject *interned;
302 if (has_shared_intern_dict(interp)) {
303 interned = get_interned_dict(_PyInterpreterState_Main());
304 Py_INCREF(interned);
305 }
306 else {
307 interned = PyDict_New();
308 if (interned == NULL) {
309 return -1;
310 }
311 }
312 _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = interned;
313 return 0;
314 }
315
316 static void
clear_interned_dict(PyInterpreterState * interp)317 clear_interned_dict(PyInterpreterState *interp)
318 {
319 PyObject *interned = get_interned_dict(interp);
320 if (interned != NULL) {
321 if (!has_shared_intern_dict(interp)) {
322 // only clear if the dict belongs to this interpreter
323 PyDict_Clear(interned);
324 }
325 Py_DECREF(interned);
326 _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = NULL;
327 }
328 }
329
330 static PyStatus
init_global_interned_strings(PyInterpreterState * interp)331 init_global_interned_strings(PyInterpreterState *interp)
332 {
333 assert(INTERNED_STRINGS == NULL);
334 _Py_hashtable_allocator_t hashtable_alloc = {PyMem_RawMalloc, PyMem_RawFree};
335
336 INTERNED_STRINGS = _Py_hashtable_new_full(
337 hashtable_unicode_hash,
338 hashtable_unicode_compare,
339 // Objects stored here are immortal and statically allocated,
340 // so we don't need key_destroy_func & value_destroy_func:
341 NULL,
342 NULL,
343 &hashtable_alloc
344 );
345 if (INTERNED_STRINGS == NULL) {
346 PyErr_Clear();
347 return _PyStatus_ERR("failed to create global interned dict");
348 }
349
350 /* Intern statically allocated string identifiers, deepfreeze strings,
351 * and one-byte latin-1 strings.
352 * This must be done before any module initialization so that statically
353 * allocated string identifiers are used instead of heap allocated strings.
354 * Deepfreeze uses the interned identifiers if present to save space
355 * else generates them and they are interned to speed up dict lookups.
356 */
357 _PyUnicode_InitStaticStrings(interp);
358
359 for (int i = 0; i < 256; i++) {
360 PyObject *s = LATIN1(i);
361 _PyUnicode_InternStatic(interp, &s);
362 assert(s == LATIN1(i));
363 }
364 #ifdef Py_DEBUG
365 assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
366
367 for (int i = 0; i < 256; i++) {
368 assert(_PyUnicode_CheckConsistency(LATIN1(i), 1));
369 }
370 #endif
371 return _PyStatus_OK();
372 }
373
clear_global_interned_strings(void)374 static void clear_global_interned_strings(void)
375 {
376 if (INTERNED_STRINGS != NULL) {
377 _Py_hashtable_destroy(INTERNED_STRINGS);
378 INTERNED_STRINGS = NULL;
379 }
380 }
381
382 #define _Py_RETURN_UNICODE_EMPTY() \
383 do { \
384 return unicode_get_empty(); \
385 } while (0)
386
387 static inline void
unicode_fill(int kind,void * data,Py_UCS4 value,Py_ssize_t start,Py_ssize_t length)388 unicode_fill(int kind, void *data, Py_UCS4 value,
389 Py_ssize_t start, Py_ssize_t length)
390 {
391 assert(0 <= start);
392 switch (kind) {
393 case PyUnicode_1BYTE_KIND: {
394 assert(value <= 0xff);
395 Py_UCS1 ch = (unsigned char)value;
396 Py_UCS1 *to = (Py_UCS1 *)data + start;
397 memset(to, ch, length);
398 break;
399 }
400 case PyUnicode_2BYTE_KIND: {
401 assert(value <= 0xffff);
402 Py_UCS2 ch = (Py_UCS2)value;
403 Py_UCS2 *to = (Py_UCS2 *)data + start;
404 const Py_UCS2 *end = to + length;
405 for (; to < end; ++to) *to = ch;
406 break;
407 }
408 case PyUnicode_4BYTE_KIND: {
409 assert(value <= MAX_UNICODE);
410 Py_UCS4 ch = value;
411 Py_UCS4 * to = (Py_UCS4 *)data + start;
412 const Py_UCS4 *end = to + length;
413 for (; to < end; ++to) *to = ch;
414 break;
415 }
416 default: Py_UNREACHABLE();
417 }
418 }
419
420
/* Fast detection of the most frequent whitespace characters.

   Lookup table covering the 128 ASCII code points: an entry is 1 for the
   characters listed below and 0 for every other index (elements without an
   explicit initializer are zero). */
const unsigned char _Py_ascii_whitespace[] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
    [0x7F] = 0,     /* highest index: keeps the table at 128 entries */
};
451
452 /* forward */
453 static PyObject* get_latin1_char(unsigned char ch);
454 static int unicode_modifiable(PyObject *unicode);
455
456
457 static PyObject *
458 _PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
459 static PyObject *
460 _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
461 static PyObject *
462 _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
463
464 static PyObject *
465 unicode_encode_call_errorhandler(const char *errors,
466 PyObject **errorHandler,const char *encoding, const char *reason,
467 PyObject *unicode, PyObject **exceptionObject,
468 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
469
470 static void
471 raise_encode_exception(PyObject **exceptionObject,
472 const char *encoding,
473 PyObject *unicode,
474 Py_ssize_t startpos, Py_ssize_t endpos,
475 const char *reason);
476
/* Fast detection of line breaks among the 128 ASCII code points: an entry
   is 1 for the characters listed below and 0 for every other index
   (elements without an explicit initializer are zero). */
static const unsigned char ascii_linebreak[] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x7F] = 0,     /* highest index: keeps the table at 128 entries */
};
504
505 static int convert_uc(PyObject *obj, void *addr);
506
507 struct encoding_map;
508 #include "clinic/unicodeobject.c.h"
509
510 _Py_error_handler
_Py_GetErrorHandler(const char * errors)511 _Py_GetErrorHandler(const char *errors)
512 {
513 if (errors == NULL || strcmp(errors, "strict") == 0) {
514 return _Py_ERROR_STRICT;
515 }
516 if (strcmp(errors, "surrogateescape") == 0) {
517 return _Py_ERROR_SURROGATEESCAPE;
518 }
519 if (strcmp(errors, "replace") == 0) {
520 return _Py_ERROR_REPLACE;
521 }
522 if (strcmp(errors, "ignore") == 0) {
523 return _Py_ERROR_IGNORE;
524 }
525 if (strcmp(errors, "backslashreplace") == 0) {
526 return _Py_ERROR_BACKSLASHREPLACE;
527 }
528 if (strcmp(errors, "surrogatepass") == 0) {
529 return _Py_ERROR_SURROGATEPASS;
530 }
531 if (strcmp(errors, "xmlcharrefreplace") == 0) {
532 return _Py_ERROR_XMLCHARREFREPLACE;
533 }
534 return _Py_ERROR_OTHER;
535 }
536
537
538 static _Py_error_handler
get_error_handler_wide(const wchar_t * errors)539 get_error_handler_wide(const wchar_t *errors)
540 {
541 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
542 return _Py_ERROR_STRICT;
543 }
544 if (wcscmp(errors, L"surrogateescape") == 0) {
545 return _Py_ERROR_SURROGATEESCAPE;
546 }
547 if (wcscmp(errors, L"replace") == 0) {
548 return _Py_ERROR_REPLACE;
549 }
550 if (wcscmp(errors, L"ignore") == 0) {
551 return _Py_ERROR_IGNORE;
552 }
553 if (wcscmp(errors, L"backslashreplace") == 0) {
554 return _Py_ERROR_BACKSLASHREPLACE;
555 }
556 if (wcscmp(errors, L"surrogatepass") == 0) {
557 return _Py_ERROR_SURROGATEPASS;
558 }
559 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
560 return _Py_ERROR_XMLCHARREFREPLACE;
561 }
562 return _Py_ERROR_OTHER;
563 }
564
565
566 static inline int
unicode_check_encoding_errors(const char * encoding,const char * errors)567 unicode_check_encoding_errors(const char *encoding, const char *errors)
568 {
569 if (encoding == NULL && errors == NULL) {
570 return 0;
571 }
572
573 PyInterpreterState *interp = _PyInterpreterState_GET();
574 #ifndef Py_DEBUG
575 /* In release mode, only check in development mode (-X dev) */
576 if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
577 return 0;
578 }
579 #else
580 /* Always check in debug mode */
581 #endif
582
583 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
584 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
585 if (!interp->unicode.fs_codec.encoding) {
586 return 0;
587 }
588
589 /* Disable checks during Python finalization. For example, it allows to
590 call _PyObject_Dump() during finalization for debugging purpose. */
591 if (_PyInterpreterState_GetFinalizing(interp) != NULL) {
592 return 0;
593 }
594
595 if (encoding != NULL
596 // Fast path for the most common built-in encodings. Even if the codec
597 // is cached, _PyCodec_Lookup() decodes the bytes string from UTF-8 to
598 // create a temporary Unicode string (the key in the cache).
599 && strcmp(encoding, "utf-8") != 0
600 && strcmp(encoding, "utf8") != 0
601 && strcmp(encoding, "ascii") != 0)
602 {
603 PyObject *handler = _PyCodec_Lookup(encoding);
604 if (handler == NULL) {
605 return -1;
606 }
607 Py_DECREF(handler);
608 }
609
610 if (errors != NULL
611 // Fast path for the most common built-in error handlers.
612 && strcmp(errors, "strict") != 0
613 && strcmp(errors, "ignore") != 0
614 && strcmp(errors, "replace") != 0
615 && strcmp(errors, "surrogateescape") != 0
616 && strcmp(errors, "surrogatepass") != 0)
617 {
618 PyObject *handler = PyCodec_LookupError(errors);
619 if (handler == NULL) {
620 return -1;
621 }
622 Py_DECREF(handler);
623 }
624 return 0;
625 }
626
627
628 int
_PyUnicode_CheckConsistency(PyObject * op,int check_content)629 _PyUnicode_CheckConsistency(PyObject *op, int check_content)
630 {
631 #define CHECK(expr) \
632 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
633
634 assert(op != NULL);
635 CHECK(PyUnicode_Check(op));
636
637 PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
638 int kind = ascii->state.kind;
639
640 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
641 CHECK(kind == PyUnicode_1BYTE_KIND);
642 }
643 else {
644 PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
645 void *data;
646
647 if (ascii->state.compact == 1) {
648 data = compact + 1;
649 CHECK(kind == PyUnicode_1BYTE_KIND
650 || kind == PyUnicode_2BYTE_KIND
651 || kind == PyUnicode_4BYTE_KIND);
652 CHECK(ascii->state.ascii == 0);
653 CHECK(compact->utf8 != data);
654 }
655 else {
656 PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
657
658 data = unicode->data.any;
659 CHECK(kind == PyUnicode_1BYTE_KIND
660 || kind == PyUnicode_2BYTE_KIND
661 || kind == PyUnicode_4BYTE_KIND);
662 CHECK(ascii->state.compact == 0);
663 CHECK(data != NULL);
664 if (ascii->state.ascii) {
665 CHECK(compact->utf8 == data);
666 CHECK(compact->utf8_length == ascii->length);
667 }
668 else {
669 CHECK(compact->utf8 != data);
670 }
671 }
672
673 if (compact->utf8 == NULL)
674 CHECK(compact->utf8_length == 0);
675 }
676
677 /* check that the best kind is used: O(n) operation */
678 if (check_content) {
679 Py_ssize_t i;
680 Py_UCS4 maxchar = 0;
681 const void *data;
682 Py_UCS4 ch;
683
684 data = PyUnicode_DATA(ascii);
685 for (i=0; i < ascii->length; i++)
686 {
687 ch = PyUnicode_READ(kind, data, i);
688 if (ch > maxchar)
689 maxchar = ch;
690 }
691 if (kind == PyUnicode_1BYTE_KIND) {
692 if (ascii->state.ascii == 0) {
693 CHECK(maxchar >= 128);
694 CHECK(maxchar <= 255);
695 }
696 else
697 CHECK(maxchar < 128);
698 }
699 else if (kind == PyUnicode_2BYTE_KIND) {
700 CHECK(maxchar >= 0x100);
701 CHECK(maxchar <= 0xFFFF);
702 }
703 else {
704 CHECK(maxchar >= 0x10000);
705 CHECK(maxchar <= MAX_UNICODE);
706 }
707 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
708 }
709
710 /* Check interning state */
711 #ifdef Py_DEBUG
712 // Note that we do not check `_Py_IsImmortal(op)`, since stable ABI
713 // extensions can make immortal strings mortal (but with a high enough
714 // refcount).
715 // The other way is extremely unlikely (worth a potential failed assertion
716 // in a debug build), so we do check `!_Py_IsImmortal(op)`.
717 switch (PyUnicode_CHECK_INTERNED(op)) {
718 case SSTATE_NOT_INTERNED:
719 if (ascii->state.statically_allocated) {
720 // This state is for two exceptions:
721 // - strings are currently checked before they're interned
722 // - the 256 one-latin1-character strings
723 // are static but use SSTATE_NOT_INTERNED
724 }
725 else {
726 CHECK(!_Py_IsImmortal(op));
727 }
728 break;
729 case SSTATE_INTERNED_MORTAL:
730 CHECK(!ascii->state.statically_allocated);
731 CHECK(!_Py_IsImmortal(op));
732 break;
733 case SSTATE_INTERNED_IMMORTAL:
734 CHECK(!ascii->state.statically_allocated);
735 break;
736 case SSTATE_INTERNED_IMMORTAL_STATIC:
737 CHECK(ascii->state.statically_allocated);
738 break;
739 default:
740 Py_UNREACHABLE();
741 }
742 #endif
743
744 return 1;
745
746 #undef CHECK
747 }
748
749 static PyObject*
unicode_result(PyObject * unicode)750 unicode_result(PyObject *unicode)
751 {
752 assert(_PyUnicode_CHECK(unicode));
753
754 Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
755 if (length == 0) {
756 PyObject *empty = unicode_get_empty();
757 if (unicode != empty) {
758 Py_DECREF(unicode);
759 }
760 return empty;
761 }
762
763 if (length == 1) {
764 int kind = PyUnicode_KIND(unicode);
765 if (kind == PyUnicode_1BYTE_KIND) {
766 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
767 Py_UCS1 ch = data[0];
768 PyObject *latin1_char = LATIN1(ch);
769 if (unicode != latin1_char) {
770 Py_DECREF(unicode);
771 }
772 return latin1_char;
773 }
774 }
775
776 assert(_PyUnicode_CheckConsistency(unicode, 1));
777 return unicode;
778 }
779
780 static PyObject*
unicode_result_unchanged(PyObject * unicode)781 unicode_result_unchanged(PyObject *unicode)
782 {
783 if (PyUnicode_CheckExact(unicode)) {
784 return Py_NewRef(unicode);
785 }
786 else
787 /* Subtype -- return genuine unicode string with the same value. */
788 return _PyUnicode_Copy(unicode);
789 }
790
791 /* Implementation of the "backslashreplace" error handler for 8-bit encodings:
792 ASCII, Latin1, UTF-8, etc. */
793 static char*
backslashreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)794 backslashreplace(_PyBytesWriter *writer, char *str,
795 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
796 {
797 Py_ssize_t size, i;
798 Py_UCS4 ch;
799 int kind;
800 const void *data;
801
802 kind = PyUnicode_KIND(unicode);
803 data = PyUnicode_DATA(unicode);
804
805 size = 0;
806 /* determine replacement size */
807 for (i = collstart; i < collend; ++i) {
808 Py_ssize_t incr;
809
810 ch = PyUnicode_READ(kind, data, i);
811 if (ch < 0x100)
812 incr = 2+2;
813 else if (ch < 0x10000)
814 incr = 2+4;
815 else {
816 assert(ch <= MAX_UNICODE);
817 incr = 2+8;
818 }
819 if (size > PY_SSIZE_T_MAX - incr) {
820 PyErr_SetString(PyExc_OverflowError,
821 "encoded result is too long for a Python string");
822 return NULL;
823 }
824 size += incr;
825 }
826
827 str = _PyBytesWriter_Prepare(writer, str, size);
828 if (str == NULL)
829 return NULL;
830
831 /* generate replacement */
832 for (i = collstart; i < collend; ++i) {
833 ch = PyUnicode_READ(kind, data, i);
834 *str++ = '\\';
835 if (ch >= 0x00010000) {
836 *str++ = 'U';
837 *str++ = Py_hexdigits[(ch>>28)&0xf];
838 *str++ = Py_hexdigits[(ch>>24)&0xf];
839 *str++ = Py_hexdigits[(ch>>20)&0xf];
840 *str++ = Py_hexdigits[(ch>>16)&0xf];
841 *str++ = Py_hexdigits[(ch>>12)&0xf];
842 *str++ = Py_hexdigits[(ch>>8)&0xf];
843 }
844 else if (ch >= 0x100) {
845 *str++ = 'u';
846 *str++ = Py_hexdigits[(ch>>12)&0xf];
847 *str++ = Py_hexdigits[(ch>>8)&0xf];
848 }
849 else
850 *str++ = 'x';
851 *str++ = Py_hexdigits[(ch>>4)&0xf];
852 *str++ = Py_hexdigits[ch&0xf];
853 }
854 return str;
855 }
856
857 /* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
858 ASCII, Latin1, UTF-8, etc. */
859 static char*
xmlcharrefreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)860 xmlcharrefreplace(_PyBytesWriter *writer, char *str,
861 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
862 {
863 Py_ssize_t size, i;
864 Py_UCS4 ch;
865 int kind;
866 const void *data;
867
868 kind = PyUnicode_KIND(unicode);
869 data = PyUnicode_DATA(unicode);
870
871 size = 0;
872 /* determine replacement size */
873 for (i = collstart; i < collend; ++i) {
874 Py_ssize_t incr;
875
876 ch = PyUnicode_READ(kind, data, i);
877 if (ch < 10)
878 incr = 2+1+1;
879 else if (ch < 100)
880 incr = 2+2+1;
881 else if (ch < 1000)
882 incr = 2+3+1;
883 else if (ch < 10000)
884 incr = 2+4+1;
885 else if (ch < 100000)
886 incr = 2+5+1;
887 else if (ch < 1000000)
888 incr = 2+6+1;
889 else {
890 assert(ch <= MAX_UNICODE);
891 incr = 2+7+1;
892 }
893 if (size > PY_SSIZE_T_MAX - incr) {
894 PyErr_SetString(PyExc_OverflowError,
895 "encoded result is too long for a Python string");
896 return NULL;
897 }
898 size += incr;
899 }
900
901 str = _PyBytesWriter_Prepare(writer, str, size);
902 if (str == NULL)
903 return NULL;
904
905 /* generate replacement */
906 for (i = collstart; i < collend; ++i) {
907 size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
908 if (size < 0) {
909 return NULL;
910 }
911 str += size;
912 }
913 return str;
914 }
915
916 /* --- Bloom Filters ----------------------------------------------------- */
917
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, a single bitmask is used, with the low-order bits
   of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */
921
922 /* the linebreak mask is set up by _PyUnicode_Init() below */
923
924 #if LONG_BIT >= 128
925 #define BLOOM_WIDTH 128
926 #elif LONG_BIT >= 64
927 #define BLOOM_WIDTH 64
928 #elif LONG_BIT >= 32
929 #define BLOOM_WIDTH 32
930 #else
931 #error "LONG_BIT is smaller than 32"
932 #endif
933
934 #define BLOOM_MASK unsigned long
935
936 static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
937
938 #define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
939
940 #define BLOOM_LINEBREAK(ch) \
941 ((ch) < 128U ? ascii_linebreak[(ch)] : \
942 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
943
944 static inline BLOOM_MASK
make_bloom_mask(int kind,const void * ptr,Py_ssize_t len)945 make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
946 {
947 #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
948 do { \
949 TYPE *data = (TYPE *)PTR; \
950 TYPE *end = data + LEN; \
951 Py_UCS4 ch; \
952 for (; data != end; data++) { \
953 ch = *data; \
954 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
955 } \
956 break; \
957 } while (0)
958
959 /* calculate simple bloom-style bitmask for a given unicode string */
960
961 BLOOM_MASK mask;
962
963 mask = 0;
964 switch (kind) {
965 case PyUnicode_1BYTE_KIND:
966 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
967 break;
968 case PyUnicode_2BYTE_KIND:
969 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
970 break;
971 case PyUnicode_4BYTE_KIND:
972 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
973 break;
974 default:
975 Py_UNREACHABLE();
976 }
977 return mask;
978
979 #undef BLOOM_UPDATE
980 }
981
982 static int
ensure_unicode(PyObject * obj)983 ensure_unicode(PyObject *obj)
984 {
985 if (!PyUnicode_Check(obj)) {
986 PyErr_Format(PyExc_TypeError,
987 "must be str, not %.100s",
988 Py_TYPE(obj)->tp_name);
989 return -1;
990 }
991 return 0;
992 }
993
994 /* Compilation of templated routines */
995
996 #define STRINGLIB_GET_EMPTY() unicode_get_empty()
997
998 #include "stringlib/asciilib.h"
999 #include "stringlib/fastsearch.h"
1000 #include "stringlib/partition.h"
1001 #include "stringlib/split.h"
1002 #include "stringlib/count.h"
1003 #include "stringlib/find.h"
1004 #include "stringlib/find_max_char.h"
1005 #include "stringlib/undef.h"
1006
1007 #include "stringlib/ucs1lib.h"
1008 #include "stringlib/fastsearch.h"
1009 #include "stringlib/partition.h"
1010 #include "stringlib/split.h"
1011 #include "stringlib/count.h"
1012 #include "stringlib/find.h"
1013 #include "stringlib/replace.h"
1014 #include "stringlib/find_max_char.h"
1015 #include "stringlib/undef.h"
1016
1017 #include "stringlib/ucs2lib.h"
1018 #include "stringlib/fastsearch.h"
1019 #include "stringlib/partition.h"
1020 #include "stringlib/split.h"
1021 #include "stringlib/count.h"
1022 #include "stringlib/find.h"
1023 #include "stringlib/replace.h"
1024 #include "stringlib/find_max_char.h"
1025 #include "stringlib/undef.h"
1026
1027 #include "stringlib/ucs4lib.h"
1028 #include "stringlib/fastsearch.h"
1029 #include "stringlib/partition.h"
1030 #include "stringlib/split.h"
1031 #include "stringlib/count.h"
1032 #include "stringlib/find.h"
1033 #include "stringlib/replace.h"
1034 #include "stringlib/find_max_char.h"
1035 #include "stringlib/undef.h"
1036
1037 #undef STRINGLIB_GET_EMPTY
1038
1039 /* --- Unicode Object ----------------------------------------------------- */
1040
1041 static inline Py_ssize_t
findchar(const void * s,int kind,Py_ssize_t size,Py_UCS4 ch,int direction)1042 findchar(const void *s, int kind,
1043 Py_ssize_t size, Py_UCS4 ch,
1044 int direction)
1045 {
1046 switch (kind) {
1047 case PyUnicode_1BYTE_KIND:
1048 if ((Py_UCS1) ch != ch)
1049 return -1;
1050 if (direction > 0)
1051 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1052 else
1053 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1054 case PyUnicode_2BYTE_KIND:
1055 if ((Py_UCS2) ch != ch)
1056 return -1;
1057 if (direction > 0)
1058 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1059 else
1060 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1061 case PyUnicode_4BYTE_KIND:
1062 if (direction > 0)
1063 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
1064 else
1065 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
1066 default:
1067 Py_UNREACHABLE();
1068 }
1069 }
1070
1071 #ifdef Py_DEBUG
1072 /* Fill the data of a Unicode string with invalid characters to detect bugs
1073 earlier.
1074
1075 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
1076 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
1077 invalid character in Unicode 6.0. */
1078 static void
unicode_fill_invalid(PyObject * unicode,Py_ssize_t old_length)1079 unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
1080 {
1081 int kind = PyUnicode_KIND(unicode);
1082 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
1083 Py_ssize_t length = _PyUnicode_LENGTH(unicode);
1084 if (length <= old_length)
1085 return;
1086 memset(data + old_length * kind, 0xff, (length - old_length) * kind);
1087 }
1088 #endif
1089
/* Resize a compact string to `length` characters by reallocating the object
   itself (header and character data live in one memory block).

   The string must satisfy unicode_modifiable() and be compact (asserted).
   Any UTF-8 buffer owned by the string is released before reallocating.

   Return the resized object, which may live at a different address than
   `unicode`.  On failure set MemoryError and return NULL; the original
   object is then still allocated (but has lost its UTF-8 buffer if the
   failure came from the reallocation). */
static PyObject*
resize_compact(PyObject *unicode, Py_ssize_t length)
{
    Py_ssize_t char_size;
    Py_ssize_t struct_size;
    Py_ssize_t new_size;
    PyObject *new_unicode;
#ifdef Py_DEBUG
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

    assert(unicode_modifiable(unicode));
    assert(PyUnicode_IS_COMPACT(unicode));

    /* The kind value is also the size in bytes of one character. */
    char_size = PyUnicode_KIND(unicode);
    if (PyUnicode_IS_ASCII(unicode))
        struct_size = sizeof(PyASCIIObject);
    else
        struct_size = sizeof(PyCompactUnicodeObject);

    /* Reject lengths for which header + (length + 1) characters would
       overflow Py_ssize_t; the "+ 1" is the terminating null character. */
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    new_size = (struct_size + (length + 1) * char_size);

    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
        PyMem_Free(_PyUnicode_UTF8(unicode));
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
    }
#ifdef Py_TRACE_REFS
    /* The object may move: unregister it before the reallocation. */
    _Py_ForgetReference(unicode);
#endif

    new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
    if (new_unicode == NULL) {
        /* Reallocation failed, so the old block is still in use:
           register it again before reporting the error. */
        _Py_NewReferenceNoTotal(unicode);
        PyErr_NoMemory();
        return NULL;
    }
    unicode = new_unicode;
    _Py_NewReferenceNoTotal(unicode);

    _PyUnicode_LENGTH(unicode) = length;
#ifdef Py_DEBUG
    unicode_fill_invalid(unicode, old_length);
#endif
    /* Write the terminating null character at the new end. */
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
                    length, 0);
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return unicode;
}
1143
/* Resize a non-compact string to `length` characters by reallocating its
   separately allocated data buffer; the object itself does not move.

   The string must be non-compact and have a reference count of 1 (asserted).
   When the UTF-8 pointer shares the data buffer, it is updated to the new
   buffer and length; otherwise a UTF-8 buffer owned by the string is freed.

   Return 0 on success.  On failure set MemoryError and return -1. */
static int
resize_inplace(PyObject *unicode, Py_ssize_t length)
{
    assert(!PyUnicode_IS_COMPACT(unicode));
    assert(Py_REFCNT(unicode) == 1);

    Py_ssize_t new_size;
    Py_ssize_t char_size;
    int share_utf8;
    void *data;
#ifdef Py_DEBUG
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

    data = _PyUnicode_DATA_ANY(unicode);
    /* The kind value is also the size in bytes of one character. */
    char_size = PyUnicode_KIND(unicode);
    share_utf8 = _PyUnicode_SHARE_UTF8(unicode);

    /* Reject lengths for which (length + 1) characters would overflow
       Py_ssize_t; the "+ 1" is the terminating null character. */
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
        PyErr_NoMemory();
        return -1;
    }
    new_size = (length + 1) * char_size;

    if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
    {
        PyMem_Free(_PyUnicode_UTF8(unicode));
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
    }

    data = (PyObject *)PyObject_Realloc(data, new_size);
    if (data == NULL) {
        PyErr_NoMemory();
        return -1;
    }
    _PyUnicode_DATA_ANY(unicode) = data;
    if (share_utf8) {
        _PyUnicode_UTF8(unicode) = data;
        _PyUnicode_UTF8_LENGTH(unicode) = length;
    }
    _PyUnicode_LENGTH(unicode) = length;
    /* Write the terminating null character at the new end. */
    PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
#ifdef Py_DEBUG
    unicode_fill_invalid(unicode, old_length);
#endif

    /* check for integer overflow */
    /* NOTE(review): this check runs after the buffer and length have
       already been updated, so a failure here leaves the string resized
       while reporting an error -- confirm whether it is still needed. */
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return -1;
    }
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return 0;
}
1199
1200 static PyObject*
resize_copy(PyObject * unicode,Py_ssize_t length)1201 resize_copy(PyObject *unicode, Py_ssize_t length)
1202 {
1203 Py_ssize_t copy_length;
1204 PyObject *copy;
1205
1206 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1207 if (copy == NULL)
1208 return NULL;
1209
1210 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1211 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1212 return copy;
1213 }
1214
1215 static const char*
unicode_kind_name(PyObject * unicode)1216 unicode_kind_name(PyObject *unicode)
1217 {
1218 /* don't check consistency: unicode_kind_name() is called from
1219 _PyUnicode_Dump() */
1220 if (!PyUnicode_IS_COMPACT(unicode))
1221 {
1222 switch (PyUnicode_KIND(unicode))
1223 {
1224 case PyUnicode_1BYTE_KIND:
1225 if (PyUnicode_IS_ASCII(unicode))
1226 return "legacy ascii";
1227 else
1228 return "legacy latin1";
1229 case PyUnicode_2BYTE_KIND:
1230 return "legacy UCS2";
1231 case PyUnicode_4BYTE_KIND:
1232 return "legacy UCS4";
1233 default:
1234 return "<legacy invalid kind>";
1235 }
1236 }
1237 switch (PyUnicode_KIND(unicode)) {
1238 case PyUnicode_1BYTE_KIND:
1239 if (PyUnicode_IS_ASCII(unicode))
1240 return "ascii";
1241 else
1242 return "latin1";
1243 case PyUnicode_2BYTE_KIND:
1244 return "UCS2";
1245 case PyUnicode_4BYTE_KIND:
1246 return "UCS4";
1247 default:
1248 return "<invalid compact kind>";
1249 }
1250 }
1251
1252 #ifdef Py_DEBUG
1253 /* Functions wrapping macros for use in debugger */
_PyUnicode_utf8(void * unicode_raw)1254 const char *_PyUnicode_utf8(void *unicode_raw){
1255 PyObject *unicode = _PyObject_CAST(unicode_raw);
1256 return PyUnicode_UTF8(unicode);
1257 }
1258
_PyUnicode_compact_data(void * unicode_raw)1259 const void *_PyUnicode_compact_data(void *unicode_raw) {
1260 PyObject *unicode = _PyObject_CAST(unicode_raw);
1261 return _PyUnicode_COMPACT_DATA(unicode);
1262 }
_PyUnicode_data(void * unicode_raw)1263 const void *_PyUnicode_data(void *unicode_raw) {
1264 PyObject *unicode = _PyObject_CAST(unicode_raw);
1265 printf("obj %p\n", (void*)unicode);
1266 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1267 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1268 printf("ascii op %p\n", (void*)(_PyASCIIObject_CAST(unicode) + 1));
1269 printf("compact op %p\n", (void*)(_PyCompactUnicodeObject_CAST(unicode) + 1));
1270 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1271 return PyUnicode_DATA(unicode);
1272 }
1273
1274 void
_PyUnicode_Dump(PyObject * op)1275 _PyUnicode_Dump(PyObject *op)
1276 {
1277 PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
1278 PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
1279 PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
1280 const void *data;
1281
1282 if (ascii->state.compact)
1283 {
1284 if (ascii->state.ascii)
1285 data = (ascii + 1);
1286 else
1287 data = (compact + 1);
1288 }
1289 else
1290 data = unicode->data.any;
1291 printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
1292
1293 if (!ascii->state.ascii) {
1294 printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
1295 }
1296 printf(", data=%p\n", data);
1297 }
1298 #endif
1299
1300
/* Create a new compact string with room for `size` characters whose code
   points do not exceed `maxchar`.

   `maxchar` selects the storage: below 128 gives an ASCII string, below 256
   a 1-byte string, below 65536 a 2-byte string, anything else up to
   MAX_UNICODE a 4-byte string.  Only the terminating null character is
   written; the other characters are left uninitialized (Py_DEBUG builds
   fill them with 0xff bytes).

   A `size` of 0 returns unicode_get_empty() without looking at `maxchar`.

   Return the new object.  On error return NULL with an exception set:
   SystemError if `maxchar` exceeds MAX_UNICODE or `size` is negative,
   MemoryError if the size would overflow or the allocation fails. */
PyObject *
PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
{
    /* Optimization for empty strings */
    if (size == 0) {
        return unicode_get_empty();
    }

    PyObject *obj;
    PyCompactUnicodeObject *unicode;
    void *data;
    int kind;
    int is_ascii;
    Py_ssize_t char_size;
    Py_ssize_t struct_size;

    /* Pick the storage kind, character size and header size. */
    is_ascii = 0;
    struct_size = sizeof(PyCompactUnicodeObject);
    if (maxchar < 128) {
        kind = PyUnicode_1BYTE_KIND;
        char_size = 1;
        is_ascii = 1;
        struct_size = sizeof(PyASCIIObject);
    }
    else if (maxchar < 256) {
        kind = PyUnicode_1BYTE_KIND;
        char_size = 1;
    }
    else if (maxchar < 65536) {
        kind = PyUnicode_2BYTE_KIND;
        char_size = 2;
    }
    else {
        if (maxchar > MAX_UNICODE) {
            PyErr_SetString(PyExc_SystemError,
                            "invalid maximum character passed to PyUnicode_New");
            return NULL;
        }
        kind = PyUnicode_4BYTE_KIND;
        char_size = 4;
    }

    /* Ensure we won't overflow the size. */
    if (size < 0) {
        PyErr_SetString(PyExc_SystemError,
                        "Negative size passed to PyUnicode_New");
        return NULL;
    }
    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
        return PyErr_NoMemory();

    /* Duplicated allocation code from _PyObject_New() instead of a call to
     * PyObject_New() so we are able to allocate space for the object and
     * its data buffer.
     */
    obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
    if (obj == NULL) {
        return PyErr_NoMemory();
    }
    _PyObject_Init(obj, &PyUnicode_Type);

    /* The character data starts right after the header structure. */
    unicode = (PyCompactUnicodeObject *)obj;
    if (is_ascii)
        data = ((PyASCIIObject*)obj) + 1;
    else
        data = unicode + 1;
    _PyUnicode_LENGTH(unicode) = size;
    _PyUnicode_HASH(unicode) = -1;
    _PyUnicode_STATE(unicode).interned = 0;
    _PyUnicode_STATE(unicode).kind = kind;
    _PyUnicode_STATE(unicode).compact = 1;
    _PyUnicode_STATE(unicode).ascii = is_ascii;
    _PyUnicode_STATE(unicode).statically_allocated = 0;
    /* Write the terminating null character; the utf8 fields are only
       touched for non-ASCII strings. */
    if (is_ascii) {
        ((char*)data)[size] = 0;
    }
    else if (kind == PyUnicode_1BYTE_KIND) {
        ((char*)data)[size] = 0;
        unicode->utf8 = NULL;
        unicode->utf8_length = 0;
    }
    else {
        unicode->utf8 = NULL;
        unicode->utf8_length = 0;
        if (kind == PyUnicode_2BYTE_KIND)
            ((Py_UCS2*)data)[size] = 0;
        else /* kind == PyUnicode_4BYTE_KIND */
            ((Py_UCS4*)data)[size] = 0;
    }
#ifdef Py_DEBUG
    unicode_fill_invalid((PyObject*)unicode, 0);
#endif
    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
    return obj;
}
1396
1397 #if SIZEOF_WCHAR_T == 2
1398 /* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1399 will decode surrogate pairs, the other conversions are implemented as macros
1400 for efficiency.
1401
1402 This function assumes that unicode can hold one more code point than wstr
1403 characters for a terminating null character. */
1404 static void
unicode_convert_wchar_to_ucs4(const wchar_t * begin,const wchar_t * end,PyObject * unicode)1405 unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
1406 PyObject *unicode)
1407 {
1408 const wchar_t *iter;
1409 Py_UCS4 *ucs4_out;
1410
1411 assert(unicode != NULL);
1412 assert(_PyUnicode_CHECK(unicode));
1413 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1414 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1415
1416 for (iter = begin; iter < end; ) {
1417 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1418 _PyUnicode_GET_LENGTH(unicode)));
1419 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1420 && (iter+1) < end
1421 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1422 {
1423 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1424 iter += 2;
1425 }
1426 else {
1427 *ucs4_out++ = *iter;
1428 iter++;
1429 }
1430 }
1431 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1432 _PyUnicode_GET_LENGTH(unicode)));
1433
1434 }
1435 #endif
1436
1437 static int
unicode_check_modifiable(PyObject * unicode)1438 unicode_check_modifiable(PyObject *unicode)
1439 {
1440 if (!unicode_modifiable(unicode)) {
1441 PyErr_SetString(PyExc_SystemError,
1442 "Cannot modify a string currently used");
1443 return -1;
1444 }
1445 return 0;
1446 }
1447
1448 static int
_copy_characters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many,int check_maxchar)1449 _copy_characters(PyObject *to, Py_ssize_t to_start,
1450 PyObject *from, Py_ssize_t from_start,
1451 Py_ssize_t how_many, int check_maxchar)
1452 {
1453 int from_kind, to_kind;
1454 const void *from_data;
1455 void *to_data;
1456
1457 assert(0 <= how_many);
1458 assert(0 <= from_start);
1459 assert(0 <= to_start);
1460 assert(PyUnicode_Check(from));
1461 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1462
1463 assert(PyUnicode_Check(to));
1464 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1465
1466 if (how_many == 0)
1467 return 0;
1468
1469 from_kind = PyUnicode_KIND(from);
1470 from_data = PyUnicode_DATA(from);
1471 to_kind = PyUnicode_KIND(to);
1472 to_data = PyUnicode_DATA(to);
1473
1474 #ifdef Py_DEBUG
1475 if (!check_maxchar
1476 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1477 {
1478 Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1479 Py_UCS4 ch;
1480 Py_ssize_t i;
1481 for (i=0; i < how_many; i++) {
1482 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1483 assert(ch <= to_maxchar);
1484 }
1485 }
1486 #endif
1487
1488 if (from_kind == to_kind) {
1489 if (check_maxchar
1490 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1491 {
1492 /* Writing Latin-1 characters into an ASCII string requires to
1493 check that all written characters are pure ASCII */
1494 Py_UCS4 max_char;
1495 max_char = ucs1lib_find_max_char(from_data,
1496 (const Py_UCS1*)from_data + how_many);
1497 if (max_char >= 128)
1498 return -1;
1499 }
1500 memcpy((char*)to_data + to_kind * to_start,
1501 (const char*)from_data + from_kind * from_start,
1502 to_kind * how_many);
1503 }
1504 else if (from_kind == PyUnicode_1BYTE_KIND
1505 && to_kind == PyUnicode_2BYTE_KIND)
1506 {
1507 _PyUnicode_CONVERT_BYTES(
1508 Py_UCS1, Py_UCS2,
1509 PyUnicode_1BYTE_DATA(from) + from_start,
1510 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1511 PyUnicode_2BYTE_DATA(to) + to_start
1512 );
1513 }
1514 else if (from_kind == PyUnicode_1BYTE_KIND
1515 && to_kind == PyUnicode_4BYTE_KIND)
1516 {
1517 _PyUnicode_CONVERT_BYTES(
1518 Py_UCS1, Py_UCS4,
1519 PyUnicode_1BYTE_DATA(from) + from_start,
1520 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1521 PyUnicode_4BYTE_DATA(to) + to_start
1522 );
1523 }
1524 else if (from_kind == PyUnicode_2BYTE_KIND
1525 && to_kind == PyUnicode_4BYTE_KIND)
1526 {
1527 _PyUnicode_CONVERT_BYTES(
1528 Py_UCS2, Py_UCS4,
1529 PyUnicode_2BYTE_DATA(from) + from_start,
1530 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1531 PyUnicode_4BYTE_DATA(to) + to_start
1532 );
1533 }
1534 else {
1535 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1536
1537 if (!check_maxchar) {
1538 if (from_kind == PyUnicode_2BYTE_KIND
1539 && to_kind == PyUnicode_1BYTE_KIND)
1540 {
1541 _PyUnicode_CONVERT_BYTES(
1542 Py_UCS2, Py_UCS1,
1543 PyUnicode_2BYTE_DATA(from) + from_start,
1544 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1545 PyUnicode_1BYTE_DATA(to) + to_start
1546 );
1547 }
1548 else if (from_kind == PyUnicode_4BYTE_KIND
1549 && to_kind == PyUnicode_1BYTE_KIND)
1550 {
1551 _PyUnicode_CONVERT_BYTES(
1552 Py_UCS4, Py_UCS1,
1553 PyUnicode_4BYTE_DATA(from) + from_start,
1554 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1555 PyUnicode_1BYTE_DATA(to) + to_start
1556 );
1557 }
1558 else if (from_kind == PyUnicode_4BYTE_KIND
1559 && to_kind == PyUnicode_2BYTE_KIND)
1560 {
1561 _PyUnicode_CONVERT_BYTES(
1562 Py_UCS4, Py_UCS2,
1563 PyUnicode_4BYTE_DATA(from) + from_start,
1564 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1565 PyUnicode_2BYTE_DATA(to) + to_start
1566 );
1567 }
1568 else {
1569 Py_UNREACHABLE();
1570 }
1571 }
1572 else {
1573 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1574 Py_UCS4 ch;
1575 Py_ssize_t i;
1576
1577 for (i=0; i < how_many; i++) {
1578 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1579 if (ch > to_maxchar)
1580 return -1;
1581 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1582 }
1583 }
1584 }
1585 return 0;
1586 }
1587
1588 void
_PyUnicode_FastCopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1589 _PyUnicode_FastCopyCharacters(
1590 PyObject *to, Py_ssize_t to_start,
1591 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1592 {
1593 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1594 }
1595
1596 Py_ssize_t
PyUnicode_CopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1597 PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1598 PyObject *from, Py_ssize_t from_start,
1599 Py_ssize_t how_many)
1600 {
1601 int err;
1602
1603 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1604 PyErr_BadInternalCall();
1605 return -1;
1606 }
1607
1608 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1609 PyErr_SetString(PyExc_IndexError, "string index out of range");
1610 return -1;
1611 }
1612 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1613 PyErr_SetString(PyExc_IndexError, "string index out of range");
1614 return -1;
1615 }
1616 if (how_many < 0) {
1617 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1618 return -1;
1619 }
1620 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1621 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1622 PyErr_Format(PyExc_SystemError,
1623 "Cannot write %zi characters at %zi "
1624 "in a string of %zi characters",
1625 how_many, to_start, PyUnicode_GET_LENGTH(to));
1626 return -1;
1627 }
1628
1629 if (how_many == 0)
1630 return 0;
1631
1632 if (unicode_check_modifiable(to))
1633 return -1;
1634
1635 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1636 if (err) {
1637 PyErr_Format(PyExc_SystemError,
1638 "Cannot copy %s characters "
1639 "into a string of %s characters",
1640 unicode_kind_name(from),
1641 unicode_kind_name(to));
1642 return -1;
1643 }
1644 return how_many;
1645 }
1646
1647 /* Find the maximum code point and count the number of surrogate pairs so a
1648 correct string length can be computed before converting a string to UCS4.
1649 This function counts single surrogates as a character and not as a pair.
1650
1651 Return 0 on success, or -1 on error. */
1652 static int
find_maxchar_surrogates(const wchar_t * begin,const wchar_t * end,Py_UCS4 * maxchar,Py_ssize_t * num_surrogates)1653 find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1654 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1655 {
1656 const wchar_t *iter;
1657 Py_UCS4 ch;
1658
1659 assert(num_surrogates != NULL && maxchar != NULL);
1660 *num_surrogates = 0;
1661 *maxchar = 0;
1662
1663 for (iter = begin; iter < end; ) {
1664 #if SIZEOF_WCHAR_T == 2
1665 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1666 && (iter+1) < end
1667 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1668 {
1669 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1670 ++(*num_surrogates);
1671 iter += 2;
1672 }
1673 else
1674 #endif
1675 {
1676 ch = *iter;
1677 iter++;
1678 }
1679 if (ch > *maxchar) {
1680 *maxchar = ch;
1681 if (*maxchar > MAX_UNICODE) {
1682 PyErr_Format(PyExc_ValueError,
1683 "character U+%x is not in range [U+0000; U+%x]",
1684 ch, MAX_UNICODE);
1685 return -1;
1686 }
1687 }
1688 }
1689 return 0;
1690 }
1691
/* Release a string object: remove it from the interned dict if it is a
   mortal interned string, free the UTF-8 buffer it owns and (for non-compact
   strings) its data buffer, then hand the object to tp_free.

   Strings that should never be deallocated (statically allocated ones, and
   those whose interned state is neither "not interned" nor "interned
   mortal") are made immortal again instead of being freed. */
static void
unicode_dealloc(PyObject *unicode)
{
#ifdef Py_DEBUG
    if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) {
        _Py_FatalRefcountError("deallocating an Unicode singleton");
    }
#endif
    if (_PyUnicode_STATE(unicode).statically_allocated) {
        /* This should never get called, but we also don't want to SEGV if
         * we accidentally decref an immortal string out of existence. Since
         * the string is an immortal object, just re-set the reference count.
         */
#ifdef Py_DEBUG
        Py_UNREACHABLE();
#endif
        _Py_SetImmortal(unicode);
        return;
    }
    switch (_PyUnicode_STATE(unicode).interned) {
        case SSTATE_NOT_INTERNED:
            break;
        case SSTATE_INTERNED_MORTAL:
            /* Remove the object from the intern dict.
             * Before doing so, we set the refcount to 2: the key and value
             * in the interned_dict.
             */
            assert(Py_REFCNT(unicode) == 0);
            Py_SET_REFCNT(unicode, 2);
#ifdef Py_REF_DEBUG
            /* let's be pedantic with the ref total */
            _Py_IncRefTotal(_PyThreadState_GET());
            _Py_IncRefTotal(_PyThreadState_GET());
#endif
            PyInterpreterState *interp = _PyInterpreterState_GET();
            PyObject *interned = get_interned_dict(interp);
            assert(interned != NULL);
            PyObject *popped;
            int r = PyDict_Pop(interned, unicode, &popped);
            if (r == -1) {
                PyErr_WriteUnraisable(unicode);
                // We don't know what happened to the string. It's probably
                // best to leak it:
                // - if it was popped, there are no more references to it
                //   so it can't cause trouble (except wasted memory)
                // - if it wasn't popped, it'll remain interned
                _Py_SetImmortal(unicode);
                _PyUnicode_STATE(unicode).interned = SSTATE_INTERNED_IMMORTAL;
                return;
            }
            if (r == 0) {
                // The interned string was not found in the interned_dict.
#ifdef Py_DEBUG
                Py_UNREACHABLE();
#endif
                _Py_SetImmortal(unicode);
                return;
            }
            // Successfully popped.
            assert(popped == unicode);
            // Only our `popped` reference should be left; remove it too.
            assert(Py_REFCNT(unicode) == 1);
            Py_SET_REFCNT(unicode, 0);
#ifdef Py_REF_DEBUG
            /* let's be pedantic with the ref total */
            _Py_DecRefTotal(_PyThreadState_GET());
#endif
            break;
        default:
            // As with `statically_allocated` above.
            // NOTE(review): this guard tests Py_REF_DEBUG while the
            // statically_allocated branch tests Py_DEBUG -- confirm the
            // difference is intended.
#ifdef Py_REF_DEBUG
            Py_UNREACHABLE();
#endif
            _Py_SetImmortal(unicode);
            return;
    }
    /* Free the buffers owned by the string before the object itself. */
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
        PyMem_Free(_PyUnicode_UTF8(unicode));
    }
    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
        PyMem_Free(_PyUnicode_DATA_ANY(unicode));
    }

    Py_TYPE(unicode)->tp_free(unicode);
}
1777
1778 #ifdef Py_DEBUG
1779 static int
unicode_is_singleton(PyObject * unicode)1780 unicode_is_singleton(PyObject *unicode)
1781 {
1782 if (unicode == &_Py_STR(empty)) {
1783 return 1;
1784 }
1785
1786 PyASCIIObject *ascii = _PyASCIIObject_CAST(unicode);
1787 if (ascii->length == 1) {
1788 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1789 if (ch < 256 && LATIN1(ch) == unicode) {
1790 return 1;
1791 }
1792 }
1793 return 0;
1794 }
1795 #endif
1796
1797 static int
unicode_modifiable(PyObject * unicode)1798 unicode_modifiable(PyObject *unicode)
1799 {
1800 assert(_PyUnicode_CHECK(unicode));
1801 if (Py_REFCNT(unicode) != 1)
1802 return 0;
1803 if (FT_ATOMIC_LOAD_SSIZE_RELAXED(_PyUnicode_HASH(unicode)) != -1)
1804 return 0;
1805 if (PyUnicode_CHECK_INTERNED(unicode))
1806 return 0;
1807 if (!PyUnicode_CheckExact(unicode))
1808 return 0;
1809 #ifdef Py_DEBUG
1810 /* singleton refcount is greater than 1 */
1811 assert(!unicode_is_singleton(unicode));
1812 #endif
1813 return 1;
1814 }
1815
1816 static int
unicode_resize(PyObject ** p_unicode,Py_ssize_t length)1817 unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1818 {
1819 PyObject *unicode;
1820 Py_ssize_t old_length;
1821
1822 assert(p_unicode != NULL);
1823 unicode = *p_unicode;
1824
1825 assert(unicode != NULL);
1826 assert(PyUnicode_Check(unicode));
1827 assert(0 <= length);
1828
1829 old_length = PyUnicode_GET_LENGTH(unicode);
1830 if (old_length == length)
1831 return 0;
1832
1833 if (length == 0) {
1834 PyObject *empty = unicode_get_empty();
1835 Py_SETREF(*p_unicode, empty);
1836 return 0;
1837 }
1838
1839 if (!unicode_modifiable(unicode)) {
1840 PyObject *copy = resize_copy(unicode, length);
1841 if (copy == NULL)
1842 return -1;
1843 Py_SETREF(*p_unicode, copy);
1844 return 0;
1845 }
1846
1847 if (PyUnicode_IS_COMPACT(unicode)) {
1848 PyObject *new_unicode = resize_compact(unicode, length);
1849 if (new_unicode == NULL)
1850 return -1;
1851 *p_unicode = new_unicode;
1852 return 0;
1853 }
1854 return resize_inplace(unicode, length);
1855 }
1856
1857 int
PyUnicode_Resize(PyObject ** p_unicode,Py_ssize_t length)1858 PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1859 {
1860 PyObject *unicode;
1861 if (p_unicode == NULL) {
1862 PyErr_BadInternalCall();
1863 return -1;
1864 }
1865 unicode = *p_unicode;
1866 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1867 {
1868 PyErr_BadInternalCall();
1869 return -1;
1870 }
1871 return unicode_resize(p_unicode, length);
1872 }
1873
1874 /* Copy an ASCII or latin1 char* string into a Python Unicode string.
1875
1876 WARNING: The function doesn't copy the terminating null character and
1877 doesn't check the maximum character (may write a latin1 character in an
1878 ASCII string). */
1879 static void
unicode_write_cstr(PyObject * unicode,Py_ssize_t index,const char * str,Py_ssize_t len)1880 unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1881 const char *str, Py_ssize_t len)
1882 {
1883 int kind = PyUnicode_KIND(unicode);
1884 const void *data = PyUnicode_DATA(unicode);
1885 const char *end = str + len;
1886
1887 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1888 switch (kind) {
1889 case PyUnicode_1BYTE_KIND: {
1890 #ifdef Py_DEBUG
1891 if (PyUnicode_IS_ASCII(unicode)) {
1892 Py_UCS4 maxchar = ucs1lib_find_max_char(
1893 (const Py_UCS1*)str,
1894 (const Py_UCS1*)str + len);
1895 assert(maxchar < 128);
1896 }
1897 #endif
1898 memcpy((char *) data + index, str, len);
1899 break;
1900 }
1901 case PyUnicode_2BYTE_KIND: {
1902 Py_UCS2 *start = (Py_UCS2 *)data + index;
1903 Py_UCS2 *ucs2 = start;
1904
1905 for (; str < end; ++ucs2, ++str)
1906 *ucs2 = (Py_UCS2)*str;
1907
1908 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1909 break;
1910 }
1911 case PyUnicode_4BYTE_KIND: {
1912 Py_UCS4 *start = (Py_UCS4 *)data + index;
1913 Py_UCS4 *ucs4 = start;
1914
1915 for (; str < end; ++ucs4, ++str)
1916 *ucs4 = (Py_UCS4)*str;
1917
1918 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1919 break;
1920 }
1921 default:
1922 Py_UNREACHABLE();
1923 }
1924 }
1925
1926 static PyObject*
get_latin1_char(Py_UCS1 ch)1927 get_latin1_char(Py_UCS1 ch)
1928 {
1929 PyObject *o = LATIN1(ch);
1930 return o;
1931 }
1932
1933 static PyObject*
unicode_char(Py_UCS4 ch)1934 unicode_char(Py_UCS4 ch)
1935 {
1936 PyObject *unicode;
1937
1938 assert(ch <= MAX_UNICODE);
1939
1940 if (ch < 256) {
1941 return get_latin1_char(ch);
1942 }
1943
1944 unicode = PyUnicode_New(1, ch);
1945 if (unicode == NULL)
1946 return NULL;
1947
1948 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1949 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
1950 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1951 } else {
1952 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1953 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1954 }
1955 assert(_PyUnicode_CheckConsistency(unicode, 1));
1956 return unicode;
1957 }
1958
/* Create a string from the wchar_t buffer `u` holding `size` characters.
   A `size` of -1 means that `u` is null-terminated and its length is taken
   with wcslen().

   With a 16-bit wchar_t, a high surrogate directly followed by a low
   surrogate is combined into a single code point, so the result may be
   shorter than `size`.

   Return the new string, or NULL with an exception set: a bad internal call
   error if `u` is NULL while `size` is not 0, ValueError (from
   find_maxchar_surrogates()) if a character exceeds MAX_UNICODE, or
   whatever the allocation raises. */
PyObject *
PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
{
    PyObject *unicode;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;

    if (u == NULL && size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }

    if (size == -1) {
        size = wcslen(u);
    }

    /* If the Unicode data is known at construction time, we can apply
       some optimizations which share commonly used objects. */

    /* Optimization for empty strings */
    if (size == 0)
        _Py_RETURN_UNICODE_EMPTY();

#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
       non-Unicode locales and hence needs conversion to UCS-4 first. */
    if (_Py_LocaleUsesNonUnicodeWchar()) {
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
        if (!converted) {
            return NULL;
        }
        PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
        PyMem_Free(converted);
        return unicode;
    }
#endif

    /* Single character Unicode objects in the Latin-1 range are
       shared when using this constructor */
    if (size == 1 && (Py_UCS4)*u < 256)
        return get_latin1_char((unsigned char)*u);

    /* If not empty and not single character, copy the Unicode data
       into the new object */
    if (find_maxchar_surrogates(u, u + size,
                                &maxchar, &num_surrogates) == -1)
        return NULL;

    /* Each surrogate pair takes two input units but one output character. */
    unicode = PyUnicode_New(size - num_surrogates, maxchar);
    if (!unicode)
        return NULL;

    /* Copy or convert according to the kind PyUnicode_New() selected. */
    switch (PyUnicode_KIND(unicode)) {
    case PyUnicode_1BYTE_KIND:
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                 u, u + size, PyUnicode_1BYTE_DATA(unicode));
        break;
    case PyUnicode_2BYTE_KIND:
#if Py_UNICODE_SIZE == 2
        memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
#else
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                 u, u + size, PyUnicode_2BYTE_DATA(unicode));
#endif
        break;
    case PyUnicode_4BYTE_KIND:
#if SIZEOF_WCHAR_T == 2
        /* This is the only case which has to process surrogates, thus
           a simple copy loop is not enough and we need a function. */
        unicode_convert_wchar_to_ucs4(u, u + size, unicode);
#else
        assert(num_surrogates == 0);
        memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
#endif
        break;
    default:
        Py_UNREACHABLE();
    }

    return unicode_result(unicode);
}
2040
2041 PyObject *
PyUnicode_FromStringAndSize(const char * u,Py_ssize_t size)2042 PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2043 {
2044 if (size < 0) {
2045 PyErr_SetString(PyExc_SystemError,
2046 "Negative size passed to PyUnicode_FromStringAndSize");
2047 return NULL;
2048 }
2049 if (u != NULL) {
2050 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2051 }
2052 if (size > 0) {
2053 PyErr_SetString(PyExc_SystemError,
2054 "NULL string with positive size with NULL passed to PyUnicode_FromStringAndSize");
2055 return NULL;
2056 }
2057 return unicode_get_empty();
2058 }
2059
2060 PyObject *
PyUnicode_FromString(const char * u)2061 PyUnicode_FromString(const char *u)
2062 {
2063 size_t size = strlen(u);
2064 if (size > PY_SSIZE_T_MAX) {
2065 PyErr_SetString(PyExc_OverflowError, "input too long");
2066 return NULL;
2067 }
2068 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2069 }
2070
2071
/* Return the str object cached for the identifier `id`, creating and
   caching it on first use.

   The cache is the per-interpreter array interp->unicode.ids, indexed by
   id->index.  The index itself is allocated lazily from the runtime-wide
   counter rt_ids->next_index.

   Returns a borrowed reference, or NULL on failure (decoding error, or
   MemoryError when the cache array cannot be grown).

   The whole function runs with id->mutex held; the runtime-wide
   rt_ids->mutex is additionally taken while an index is being assigned. */
PyObject *
_PyUnicode_FromId(_Py_Identifier *id)
{
    PyMutex_Lock((PyMutex *)&id->mutex);
    PyInterpreterState *interp = _PyInterpreterState_GET();
    struct _Py_unicode_ids *ids = &interp->unicode.ids;

    // A negative index means that no slot has been assigned to this
    // identifier yet.
    Py_ssize_t index = _Py_atomic_load_ssize(&id->index);
    if (index < 0) {
        struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_state.ids;

        PyMutex_Lock(&rt_ids->mutex);
        // Check again to detect concurrent access. Another thread can have
        // initialized the index while this thread waited for the lock.
        index = _Py_atomic_load_ssize(&id->index);
        if (index < 0) {
            assert(rt_ids->next_index < PY_SSIZE_T_MAX);
            // Take the next free slot number and publish it in the identifier.
            index = rt_ids->next_index;
            rt_ids->next_index++;
            _Py_atomic_store_ssize(&id->index, index);
        }
        PyMutex_Unlock(&rt_ids->mutex);
    }
    assert(index >= 0);

    PyObject *obj;
    // Fast path: the slot exists in this interpreter's array and is filled.
    if (index < ids->size) {
        obj = ids->array[index];
        if (obj) {
            // Return a borrowed reference
            goto end;
        }
    }

    // Slow path: decode the identifier's C string as UTF-8 and intern it.
    obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
                                       NULL, NULL);
    if (!obj) {
        goto end;
    }
    _PyUnicode_InternImmortal(interp, &obj);

    if (index >= ids->size) {
        // Overallocate to reduce the number of realloc
        Py_ssize_t new_size = Py_MAX(index * 2, 16);
        Py_ssize_t item_size = sizeof(ids->array[0]);
        PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
        if (new_array == NULL) {
            // NOTE(review): obj is not released on this path; presumably
            // harmless because it was just interned as immortal -- confirm.
            PyErr_NoMemory();
            obj = NULL;
            goto end;
        }
        // Zero the newly added tail so that unused slots read as NULL.
        memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
        ids->array = new_array;
        ids->size = new_size;
    }

    // The array stores a strong reference
    ids->array[index] = obj;

end:
    PyMutex_Unlock((PyMutex *)&id->mutex);
    // Return a borrowed reference
    return obj;
}
2136
2137
2138 static void
unicode_clear_identifiers(struct _Py_unicode_state * state)2139 unicode_clear_identifiers(struct _Py_unicode_state *state)
2140 {
2141 struct _Py_unicode_ids *ids = &state->ids;
2142 for (Py_ssize_t i=0; i < ids->size; i++) {
2143 Py_XDECREF(ids->array[i]);
2144 }
2145 ids->size = 0;
2146 PyMem_Free(ids->array);
2147 ids->array = NULL;
2148 // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2149 // after Py_Finalize().
2150 }
2151
2152
2153 /* Internal function, doesn't check maximum character */
2154
2155 PyObject*
_PyUnicode_FromASCII(const char * buffer,Py_ssize_t size)2156 _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2157 {
2158 const unsigned char *s = (const unsigned char *)buffer;
2159 PyObject *unicode;
2160 if (size == 1) {
2161 #ifdef Py_DEBUG
2162 assert((unsigned char)s[0] < 128);
2163 #endif
2164 return get_latin1_char(s[0]);
2165 }
2166 unicode = PyUnicode_New(size, 127);
2167 if (!unicode)
2168 return NULL;
2169 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2170 assert(_PyUnicode_CheckConsistency(unicode, 1));
2171 return unicode;
2172 }
2173
2174 static Py_UCS4
kind_maxchar_limit(int kind)2175 kind_maxchar_limit(int kind)
2176 {
2177 switch (kind) {
2178 case PyUnicode_1BYTE_KIND:
2179 return 0x80;
2180 case PyUnicode_2BYTE_KIND:
2181 return 0x100;
2182 case PyUnicode_4BYTE_KIND:
2183 return 0x10000;
2184 default:
2185 Py_UNREACHABLE();
2186 }
2187 }
2188
2189 static PyObject*
_PyUnicode_FromUCS1(const Py_UCS1 * u,Py_ssize_t size)2190 _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2191 {
2192 PyObject *res;
2193 unsigned char max_char;
2194
2195 if (size == 0) {
2196 _Py_RETURN_UNICODE_EMPTY();
2197 }
2198 assert(size > 0);
2199 if (size == 1) {
2200 return get_latin1_char(u[0]);
2201 }
2202
2203 max_char = ucs1lib_find_max_char(u, u + size);
2204 res = PyUnicode_New(size, max_char);
2205 if (!res)
2206 return NULL;
2207 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2208 assert(_PyUnicode_CheckConsistency(res, 1));
2209 return res;
2210 }
2211
2212 static PyObject*
_PyUnicode_FromUCS2(const Py_UCS2 * u,Py_ssize_t size)2213 _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2214 {
2215 PyObject *res;
2216 Py_UCS2 max_char;
2217
2218 if (size == 0)
2219 _Py_RETURN_UNICODE_EMPTY();
2220 assert(size > 0);
2221 if (size == 1)
2222 return unicode_char(u[0]);
2223
2224 max_char = ucs2lib_find_max_char(u, u + size);
2225 res = PyUnicode_New(size, max_char);
2226 if (!res)
2227 return NULL;
2228 if (max_char >= 256)
2229 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2230 else {
2231 _PyUnicode_CONVERT_BYTES(
2232 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2233 }
2234 assert(_PyUnicode_CheckConsistency(res, 1));
2235 return res;
2236 }
2237
2238 static PyObject*
_PyUnicode_FromUCS4(const Py_UCS4 * u,Py_ssize_t size)2239 _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2240 {
2241 PyObject *res;
2242 Py_UCS4 max_char;
2243
2244 if (size == 0)
2245 _Py_RETURN_UNICODE_EMPTY();
2246 assert(size > 0);
2247 if (size == 1)
2248 return unicode_char(u[0]);
2249
2250 max_char = ucs4lib_find_max_char(u, u + size);
2251 res = PyUnicode_New(size, max_char);
2252 if (!res)
2253 return NULL;
2254 if (max_char < 256)
2255 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2256 PyUnicode_1BYTE_DATA(res));
2257 else if (max_char < 0x10000)
2258 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2259 PyUnicode_2BYTE_DATA(res));
2260 else
2261 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2262 assert(_PyUnicode_CheckConsistency(res, 1));
2263 return res;
2264 }
2265
2266 PyObject*
PyUnicode_FromKindAndData(int kind,const void * buffer,Py_ssize_t size)2267 PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2268 {
2269 if (size < 0) {
2270 PyErr_SetString(PyExc_ValueError, "size must be positive");
2271 return NULL;
2272 }
2273 switch (kind) {
2274 case PyUnicode_1BYTE_KIND:
2275 return _PyUnicode_FromUCS1(buffer, size);
2276 case PyUnicode_2BYTE_KIND:
2277 return _PyUnicode_FromUCS2(buffer, size);
2278 case PyUnicode_4BYTE_KIND:
2279 return _PyUnicode_FromUCS4(buffer, size);
2280 default:
2281 PyErr_SetString(PyExc_SystemError, "invalid kind");
2282 return NULL;
2283 }
2284 }
2285
2286 Py_UCS4
_PyUnicode_FindMaxChar(PyObject * unicode,Py_ssize_t start,Py_ssize_t end)2287 _PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2288 {
2289 int kind;
2290 const void *startptr, *endptr;
2291
2292 assert(0 <= start);
2293 assert(end <= PyUnicode_GET_LENGTH(unicode));
2294 assert(start <= end);
2295
2296 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2297 return PyUnicode_MAX_CHAR_VALUE(unicode);
2298
2299 if (start == end)
2300 return 127;
2301
2302 if (PyUnicode_IS_ASCII(unicode))
2303 return 127;
2304
2305 kind = PyUnicode_KIND(unicode);
2306 startptr = PyUnicode_DATA(unicode);
2307 endptr = (char *)startptr + end * kind;
2308 startptr = (char *)startptr + start * kind;
2309 switch(kind) {
2310 case PyUnicode_1BYTE_KIND:
2311 return ucs1lib_find_max_char(startptr, endptr);
2312 case PyUnicode_2BYTE_KIND:
2313 return ucs2lib_find_max_char(startptr, endptr);
2314 case PyUnicode_4BYTE_KIND:
2315 return ucs4lib_find_max_char(startptr, endptr);
2316 default:
2317 Py_UNREACHABLE();
2318 }
2319 }
2320
2321 /* Ensure that a string uses the most efficient storage, if it is not the
2322 case: create a new string with of the right kind. Write NULL into *p_unicode
2323 on error. */
2324 static void
unicode_adjust_maxchar(PyObject ** p_unicode)2325 unicode_adjust_maxchar(PyObject **p_unicode)
2326 {
2327 PyObject *unicode, *copy;
2328 Py_UCS4 max_char;
2329 Py_ssize_t len;
2330 int kind;
2331
2332 assert(p_unicode != NULL);
2333 unicode = *p_unicode;
2334 if (PyUnicode_IS_ASCII(unicode))
2335 return;
2336
2337 len = PyUnicode_GET_LENGTH(unicode);
2338 kind = PyUnicode_KIND(unicode);
2339 if (kind == PyUnicode_1BYTE_KIND) {
2340 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2341 max_char = ucs1lib_find_max_char(u, u + len);
2342 if (max_char >= 128)
2343 return;
2344 }
2345 else if (kind == PyUnicode_2BYTE_KIND) {
2346 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2347 max_char = ucs2lib_find_max_char(u, u + len);
2348 if (max_char >= 256)
2349 return;
2350 }
2351 else if (kind == PyUnicode_4BYTE_KIND) {
2352 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2353 max_char = ucs4lib_find_max_char(u, u + len);
2354 if (max_char >= 0x10000)
2355 return;
2356 }
2357 else
2358 Py_UNREACHABLE();
2359
2360 copy = PyUnicode_New(len, max_char);
2361 if (copy != NULL)
2362 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2363 Py_DECREF(unicode);
2364 *p_unicode = copy;
2365 }
2366
2367 PyObject*
_PyUnicode_Copy(PyObject * unicode)2368 _PyUnicode_Copy(PyObject *unicode)
2369 {
2370 Py_ssize_t length;
2371 PyObject *copy;
2372
2373 if (!PyUnicode_Check(unicode)) {
2374 PyErr_BadInternalCall();
2375 return NULL;
2376 }
2377
2378 length = PyUnicode_GET_LENGTH(unicode);
2379 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2380 if (!copy)
2381 return NULL;
2382 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2383
2384 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2385 length * PyUnicode_KIND(unicode));
2386 assert(_PyUnicode_CheckConsistency(copy, 1));
2387 return copy;
2388 }
2389
2390
2391 /* Widen Unicode objects to larger buffers. Don't write terminating null
2392 character. Return NULL on error. */
2393
2394 static void*
unicode_askind(int skind,void const * data,Py_ssize_t len,int kind)2395 unicode_askind(int skind, void const *data, Py_ssize_t len, int kind)
2396 {
2397 void *result;
2398
2399 assert(skind < kind);
2400 switch (kind) {
2401 case PyUnicode_2BYTE_KIND:
2402 result = PyMem_New(Py_UCS2, len);
2403 if (!result)
2404 return PyErr_NoMemory();
2405 assert(skind == PyUnicode_1BYTE_KIND);
2406 _PyUnicode_CONVERT_BYTES(
2407 Py_UCS1, Py_UCS2,
2408 (const Py_UCS1 *)data,
2409 ((const Py_UCS1 *)data) + len,
2410 result);
2411 return result;
2412 case PyUnicode_4BYTE_KIND:
2413 result = PyMem_New(Py_UCS4, len);
2414 if (!result)
2415 return PyErr_NoMemory();
2416 if (skind == PyUnicode_2BYTE_KIND) {
2417 _PyUnicode_CONVERT_BYTES(
2418 Py_UCS2, Py_UCS4,
2419 (const Py_UCS2 *)data,
2420 ((const Py_UCS2 *)data) + len,
2421 result);
2422 }
2423 else {
2424 assert(skind == PyUnicode_1BYTE_KIND);
2425 _PyUnicode_CONVERT_BYTES(
2426 Py_UCS1, Py_UCS4,
2427 (const Py_UCS1 *)data,
2428 ((const Py_UCS1 *)data) + len,
2429 result);
2430 }
2431 return result;
2432 default:
2433 Py_UNREACHABLE();
2434 return NULL;
2435 }
2436 }
2437
2438 static Py_UCS4*
as_ucs4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2439 as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2440 int copy_null)
2441 {
2442 int kind;
2443 const void *data;
2444 Py_ssize_t len, targetlen;
2445 kind = PyUnicode_KIND(string);
2446 data = PyUnicode_DATA(string);
2447 len = PyUnicode_GET_LENGTH(string);
2448 targetlen = len;
2449 if (copy_null)
2450 targetlen++;
2451 if (!target) {
2452 target = PyMem_New(Py_UCS4, targetlen);
2453 if (!target) {
2454 PyErr_NoMemory();
2455 return NULL;
2456 }
2457 }
2458 else {
2459 if (targetsize < targetlen) {
2460 PyErr_Format(PyExc_SystemError,
2461 "string is longer than the buffer");
2462 if (copy_null && 0 < targetsize)
2463 target[0] = 0;
2464 return NULL;
2465 }
2466 }
2467 if (kind == PyUnicode_1BYTE_KIND) {
2468 const Py_UCS1 *start = (const Py_UCS1 *) data;
2469 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2470 }
2471 else if (kind == PyUnicode_2BYTE_KIND) {
2472 const Py_UCS2 *start = (const Py_UCS2 *) data;
2473 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2474 }
2475 else if (kind == PyUnicode_4BYTE_KIND) {
2476 memcpy(target, data, len * sizeof(Py_UCS4));
2477 }
2478 else {
2479 Py_UNREACHABLE();
2480 }
2481 if (copy_null)
2482 target[len] = 0;
2483 return target;
2484 }
2485
2486 Py_UCS4*
PyUnicode_AsUCS4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2487 PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2488 int copy_null)
2489 {
2490 if (target == NULL || targetsize < 0) {
2491 PyErr_BadInternalCall();
2492 return NULL;
2493 }
2494 return as_ucs4(string, target, targetsize, copy_null);
2495 }
2496
2497 Py_UCS4*
PyUnicode_AsUCS4Copy(PyObject * string)2498 PyUnicode_AsUCS4Copy(PyObject *string)
2499 {
2500 return as_ucs4(string, NULL, 0, 1);
2501 }
2502
/* Maximum number of characters required for the output of %jo, %jd or %p.
   We need at most ceil(log8(256)*sizeof(intmax_t)) digits,
   plus 1 for the sign, plus 2 for the 0x prefix (for %p),
   plus 1 for the terminating NUL.

   In the expression below, 1 + (sizeof(intmax_t)*8 - 1) / 3 is the bit
   width (taking 8 bits per byte) divided by 3 and rounded up, i.e. the
   number of octal digits; the remaining 4 of the leading 5 account for the
   sign, the prefix and the NUL. */
#define MAX_INTMAX_CHARS (5 + (sizeof(intmax_t)*8-1) / 3)
2508
2509 static int
unicode_fromformat_write_str(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t width,Py_ssize_t precision,int flags)2510 unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2511 Py_ssize_t width, Py_ssize_t precision, int flags)
2512 {
2513 Py_ssize_t length, fill, arglen;
2514 Py_UCS4 maxchar;
2515
2516 length = PyUnicode_GET_LENGTH(str);
2517 if ((precision == -1 || precision >= length)
2518 && width <= length)
2519 return _PyUnicodeWriter_WriteStr(writer, str);
2520
2521 if (precision != -1)
2522 length = Py_MIN(precision, length);
2523
2524 arglen = Py_MAX(length, width);
2525 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2526 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2527 else
2528 maxchar = writer->maxchar;
2529
2530 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2531 return -1;
2532
2533 fill = Py_MAX(width - length, 0);
2534 if (fill && !(flags & F_LJUST)) {
2535 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2536 return -1;
2537 writer->pos += fill;
2538 }
2539
2540 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2541 str, 0, length);
2542 writer->pos += length;
2543
2544 if (fill && (flags & F_LJUST)) {
2545 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2546 return -1;
2547 writer->pos += fill;
2548 }
2549
2550 return 0;
2551 }
2552
2553 static int
unicode_fromformat_write_cstr(_PyUnicodeWriter * writer,const char * str,Py_ssize_t width,Py_ssize_t precision,int flags)2554 unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2555 Py_ssize_t width, Py_ssize_t precision, int flags)
2556 {
2557 /* UTF-8 */
2558 Py_ssize_t length;
2559 PyObject *unicode;
2560 int res;
2561
2562 if (precision == -1) {
2563 length = strlen(str);
2564 }
2565 else {
2566 length = 0;
2567 while (length < precision && str[length]) {
2568 length++;
2569 }
2570 }
2571 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2572 if (unicode == NULL)
2573 return -1;
2574
2575 res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
2576 Py_DECREF(unicode);
2577 return res;
2578 }
2579
2580 static int
unicode_fromformat_write_wcstr(_PyUnicodeWriter * writer,const wchar_t * str,Py_ssize_t width,Py_ssize_t precision,int flags)2581 unicode_fromformat_write_wcstr(_PyUnicodeWriter *writer, const wchar_t *str,
2582 Py_ssize_t width, Py_ssize_t precision, int flags)
2583 {
2584 /* UTF-8 */
2585 Py_ssize_t length;
2586 PyObject *unicode;
2587 int res;
2588
2589 if (precision == -1) {
2590 length = wcslen(str);
2591 }
2592 else {
2593 length = 0;
2594 while (length < precision && str[length]) {
2595 length++;
2596 }
2597 }
2598 unicode = PyUnicode_FromWideChar(str, length);
2599 if (unicode == NULL)
2600 return -1;
2601
2602 res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
2603 Py_DECREF(unicode);
2604 return res;
2605 }
2606
/* Size modifiers recognised by unicode_fromformat_arg() ("l", "ll", "z",
   "t" and "j" respectively).  The values double as indices into the
   format tables below; index 0 is used when no modifier is present. */
#define F_LONG 1
#define F_LONGLONG 2
#define F_SIZE 3
#define F_PTRDIFF 4
#define F_INTMAX 5
/* sprintf() format strings, one table per integer conversion, indexed by
   size modifier.  `formats` serves the signed decimal conversions. */
static const char * const formats[] = {"%d", "%ld", "%lld", "%zd", "%td", "%jd"};
static const char * const formats_o[] = {"%o", "%lo", "%llo", "%zo", "%to", "%jo"};
static const char * const formats_u[] = {"%u", "%lu", "%llu", "%zu", "%tu", "%ju"};
static const char * const formats_x[] = {"%x", "%lx", "%llx", "%zx", "%tx", "%jx"};
static const char * const formats_X[] = {"%X", "%lX", "%llX", "%zX", "%tX", "%jX"};
2617
/* Process a single '%' directive of a PyUnicode_FromFormatV() format.

   `f` points at the '%' character.  The directive is parsed as
       %[flags][width][.precision][size modifier]conversion
   the matching argument(s) are fetched from *vargs and the formatted text
   is appended to `writer`.

   Returns a pointer to the first character after the directive, or NULL on
   failure.  An unsupported directive raises SystemError ("invalid format
   string: ..."), quoting the format from the '%' onwards. */
static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
                       const char *f, va_list *vargs)
{
    const char *p;
    Py_ssize_t len;
    int flags = 0;
    Py_ssize_t width;
    Py_ssize_t precision;

    /* Remember the start of the directive for the error message. */
    p = f;
    f++;
    /* "%%" writes a literal percent sign. */
    if (*f == '%') {
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return NULL;
        f++;
        return f;
    }

    /* Parse flags. Example: "%-i" => flags=F_LJUST. */
    /* Flags '+', ' ' and '#' are not particularly useful.
     * They are not worth the implementation and maintenance costs.
     * In addition, '#' should add "0" for "o" conversions for compatibility
     * with printf, but it would confuse Python users. */
    while (1) {
        switch (*f++) {
        case '-': flags |= F_LJUST; continue;
        case '0': flags |= F_ZERO; continue;
        case '#': flags |= F_ALT; continue;
        }
        /* Not a flag character: step back and stop. */
        f--;
        break;
    }

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    /* width == -1 means "not specified". */
    width = -1;
    if (*f == '*') {
        /* Width taken from an int argument; a negative value means
           left-justify with the absolute value as width. */
        width = va_arg(*vargs, int);
        if (width < 0) {
            flags |= F_LJUST;
            width = -width;
        }
        f++;
    }
    else if (Py_ISDIGIT((unsigned)*f)) {
        width = *f - '0';
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            /* Overflow check before accumulating the next digit. */
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return NULL;
            }
            width = (width * 10) + (*f - '0');
            f++;
        }
    }
    /* precision == -1 means "not specified". */
    precision = -1;
    if (*f == '.') {
        f++;
        if (*f == '*') {
            /* Precision taken from an int argument.  Any negative value is
               normalised to -2, which is distinct from the "not specified"
               marker -1. */
            precision = va_arg(*vargs, int);
            if (precision < 0) {
                precision = -2;
            }
            f++;
        }
        else if (Py_ISDIGIT((unsigned)*f)) {
            precision = (*f - '0');
            f++;
            while (Py_ISDIGIT((unsigned)*f)) {
                /* Overflow check before accumulating the next digit. */
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return NULL;
                }
                precision = (precision * 10) + (*f - '0');
                f++;
            }
        }
    }

    /* Parse the size modifier: "l", "ll", "z", "t" or "j" (0 if absent). */
    int sizemod = 0;
    if (*f == 'l') {
        if (f[1] == 'l') {
            sizemod = F_LONGLONG;
            f += 2;
        }
        else {
            sizemod = F_LONG;
            ++f;
        }
    }
    else if (*f == 'z') {
        sizemod = F_SIZE;
        ++f;
    }
    else if (*f == 't') {
        sizemod = F_PTRDIFF;
        ++f;
    }
    else if (*f == 'j') {
        sizemod = F_INTMAX;
        ++f;
    }
    /* The conversion character is the last character of the format string:
       turn off the writer's over-allocation. */
    if (f[0] != '\0' && f[1] == '\0')
        writer->overallocate = 0;

    /* Validate the combination of conversion, size modifier, width and
       precision before any argument is consumed for the conversion. */
    switch (*f) {
    case 'd': case 'i': case 'o': case 'u': case 'x': case 'X':
        break;
    case 'c': case 'p':
        /* No modifier, width or precision is accepted here. */
        if (sizemod || width >= 0 || precision >= 0) goto invalid_format;
        break;
    case 's':
    case 'V':
        /* Only "l" (wide character string) is accepted. */
        if (sizemod && sizemod != F_LONG) goto invalid_format;
        break;
    default:
        if (sizemod) goto invalid_format;
        break;
    }

    switch (*f) {
    case 'c':
    {
        /* Single character given as an int code point. */
        int ordinal = va_arg(*vargs, int);
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
            PyErr_SetString(PyExc_OverflowError,
                            "character argument not in range(0x110000)");
            return NULL;
        }
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
            return NULL;
        break;
    }

    case 'd': case 'i':
    case 'o': case 'u': case 'x': case 'X':
    {
        /* used by sprintf */
        char buffer[MAX_INTMAX_CHARS];
        /* Pick the bare sprintf format for this conversion and size
           modifier.  These formats carry no width or precision; padding is
           produced by hand further down. */
        const char *fmt = NULL;
        switch (*f) {
            case 'o': fmt = formats_o[sizemod]; break;
            case 'u': fmt = formats_u[sizemod]; break;
            case 'x': fmt = formats_x[sizemod]; break;
            case 'X': fmt = formats_X[sizemod]; break;
            default: fmt = formats[sizemod]; break;
        }
        int issigned = (*f == 'd' || *f == 'i');
        /* Fetch the argument with the C type matching the size modifier
           and signedness, and render it into buffer. */
        switch (sizemod) {
            case F_LONG:
                len = issigned ?
                    sprintf(buffer, fmt, va_arg(*vargs, long)) :
                    sprintf(buffer, fmt, va_arg(*vargs, unsigned long));
                break;
            case F_LONGLONG:
                len = issigned ?
                    sprintf(buffer, fmt, va_arg(*vargs, long long)) :
                    sprintf(buffer, fmt, va_arg(*vargs, unsigned long long));
                break;
            case F_SIZE:
                len = issigned ?
                    sprintf(buffer, fmt, va_arg(*vargs, Py_ssize_t)) :
                    sprintf(buffer, fmt, va_arg(*vargs, size_t));
                break;
            case F_PTRDIFF:
                len = sprintf(buffer, fmt, va_arg(*vargs, ptrdiff_t));
                break;
            case F_INTMAX:
                len = issigned ?
                    sprintf(buffer, fmt, va_arg(*vargs, intmax_t)) :
                    sprintf(buffer, fmt, va_arg(*vargs, uintmax_t));
                break;
            default:
                len = issigned ?
                    sprintf(buffer, fmt, va_arg(*vargs, int)) :
                    sprintf(buffer, fmt, va_arg(*vargs, unsigned int));
                break;
        }
        assert(len >= 0);

        /* From here on len counts the digits only; the sign is written
           separately. */
        int sign = (buffer[0] == '-');
        len -= sign;

        /* precision: minimum number of digits; width: minimum total field
           width including the sign. */
        precision = Py_MAX(precision, len);
        width = Py_MAX(width, precision + sign);
        if ((flags & F_ZERO) && !(flags & F_LJUST)) {
            /* '0' flag without '-': fill the whole field with zeros. */
            precision = width - sign;
        }

        Py_ssize_t spacepad = Py_MAX(width - precision - sign, 0);
        Py_ssize_t zeropad = Py_MAX(precision - len, 0);

        if (_PyUnicodeWriter_Prepare(writer, width, 127) == -1)
            return NULL;

        /* Output order: [spaces] [-] [zeros] digits [spaces]. */
        if (spacepad && !(flags & F_LJUST)) {
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
                return NULL;
            writer->pos += spacepad;
        }

        if (sign) {
            if (_PyUnicodeWriter_WriteChar(writer, '-') == -1)
                return NULL;
        }

        if (zeropad) {
            if (PyUnicode_Fill(writer->buffer, writer->pos, zeropad, '0') == -1)
                return NULL;
            writer->pos += zeropad;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, &buffer[sign], len) < 0)
            return NULL;

        if (spacepad && (flags & F_LJUST)) {
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
                return NULL;
            writer->pos += spacepad;
        }
        break;
    }

    case 'p':
    {
        /* Pointer value, always written with a lowercase "0x" prefix. */
        char number[MAX_INTMAX_CHARS];

        len = sprintf(number, "%p", va_arg(*vargs, void*));
        assert(len >= 0);

        /* %p is ill-defined: ensure leading 0x. */
        if (number[1] == 'X')
            number[1] = 'x';
        else if (number[1] != 'x') {
            /* No prefix at all: shift the text (NUL included) right by two
               and insert "0x". */
            memmove(number + 2, number,
                    strlen(number) + 1);
            number[0] = '0';
            number[1] = 'x';
            len += 2;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
            return NULL;
        break;
    }

    case 's':
    {
        if (sizemod) {
            /* "%ls": wide character string. */
            const wchar_t *s = va_arg(*vargs, const wchar_t*);
            if (unicode_fromformat_write_wcstr(writer, s, width, precision, flags) < 0)
                return NULL;
        }
        else {
            /* UTF-8 */
            const char *s = va_arg(*vargs, const char*);
            if (unicode_fromformat_write_cstr(writer, s, width, precision, flags) < 0)
                return NULL;
        }
        break;
    }

    case 'U':
    {
        /* str object, written as is. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        assert(obj && _PyUnicode_CHECK(obj));

        if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
            return NULL;
        break;
    }

    case 'V':
    {
        /* Two arguments are always consumed: a str object (may be NULL)
           followed by a fallback C string (wide with "%lV") that is used
           only when the object is NULL. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        const char *str;
        const wchar_t *wstr;
        if (sizemod) {
            wstr = va_arg(*vargs, const wchar_t*);
        }
        else {
            str = va_arg(*vargs, const char *);
        }
        if (obj) {
            assert(_PyUnicode_CHECK(obj));
            if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
                return NULL;
        }
        else if (sizemod) {
            assert(wstr != NULL);
            if (unicode_fromformat_write_wcstr(writer, wstr, width, precision, flags) < 0)
                return NULL;
        }
        else {
            assert(str != NULL);
            if (unicode_fromformat_write_cstr(writer, str, width, precision, flags) < 0)
                return NULL;
        }
        break;
    }

    case 'S':
    {
        /* Result of PyObject_Str(obj). */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *str;
        assert(obj);
        str = PyObject_Str(obj);
        if (!str)
            return NULL;
        if (unicode_fromformat_write_str(writer, str, width, precision, flags) == -1) {
            Py_DECREF(str);
            return NULL;
        }
        Py_DECREF(str);
        break;
    }

    case 'R':
    {
        /* Result of PyObject_Repr(obj). */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *repr;
        assert(obj);
        repr = PyObject_Repr(obj);
        if (!repr)
            return NULL;
        if (unicode_fromformat_write_str(writer, repr, width, precision, flags) == -1) {
            Py_DECREF(repr);
            return NULL;
        }
        Py_DECREF(repr);
        break;
    }

    case 'A':
    {
        /* Result of PyObject_ASCII(obj). */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *ascii;
        assert(obj);
        ascii = PyObject_ASCII(obj);
        if (!ascii)
            return NULL;
        if (unicode_fromformat_write_str(writer, ascii, width, precision, flags) == -1) {
            Py_DECREF(ascii);
            return NULL;
        }
        Py_DECREF(ascii);
        break;
    }

    case 'T':
    {
        /* Fully qualified name of the type of the object argument.  With
           the '#' flag, ':' is passed as the separator to
           _PyType_GetFullyQualifiedName(). */
        PyObject *obj = va_arg(*vargs, PyObject *);
        /* Hold a strong reference to the type while its name is computed. */
        PyTypeObject *type = (PyTypeObject *)Py_NewRef(Py_TYPE(obj));

        PyObject *type_name;
        if (flags & F_ALT) {
            type_name = _PyType_GetFullyQualifiedName(type, ':');
        }
        else {
            type_name = PyType_GetFullyQualifiedName(type);
        }
        Py_DECREF(type);
        if (!type_name) {
            return NULL;
        }

        if (unicode_fromformat_write_str(writer, type_name,
                                         width, precision, flags) == -1) {
            Py_DECREF(type_name);
            return NULL;
        }
        Py_DECREF(type_name);
        break;
    }

    case 'N':
    {
        /* Like 'T', but the argument is the type itself. */
        PyObject *type_raw = va_arg(*vargs, PyObject *);
        assert(type_raw != NULL);

        if (!PyType_Check(type_raw)) {
            PyErr_SetString(PyExc_TypeError, "%N argument must be a type");
            return NULL;
        }
        PyTypeObject *type = (PyTypeObject*)type_raw;

        PyObject *type_name;
        if (flags & F_ALT) {
            type_name = _PyType_GetFullyQualifiedName(type, ':');
        }
        else {
            type_name = PyType_GetFullyQualifiedName(type);
        }
        if (!type_name) {
            return NULL;
        }
        if (unicode_fromformat_write_str(writer, type_name,
                                         width, precision, flags) == -1) {
            Py_DECREF(type_name);
            return NULL;
        }
        Py_DECREF(type_name);
        break;
    }

    default:
    invalid_format:
        /* Also reached by goto from the validation switch above. */
        PyErr_Format(PyExc_SystemError, "invalid format string: %s", p);
        return NULL;
    }

    /* Skip the conversion character. */
    f++;
    return f;
}
3035
3036 PyObject *
PyUnicode_FromFormatV(const char * format,va_list vargs)3037 PyUnicode_FromFormatV(const char *format, va_list vargs)
3038 {
3039 va_list vargs2;
3040 const char *f;
3041 _PyUnicodeWriter writer;
3042
3043 _PyUnicodeWriter_Init(&writer);
3044 writer.min_length = strlen(format) + 100;
3045 writer.overallocate = 1;
3046
3047 // Copy varags to be able to pass a reference to a subfunction.
3048 va_copy(vargs2, vargs);
3049
3050 for (f = format; *f; ) {
3051 if (*f == '%') {
3052 f = unicode_fromformat_arg(&writer, f, &vargs2);
3053 if (f == NULL)
3054 goto fail;
3055 }
3056 else {
3057 const char *p;
3058 Py_ssize_t len;
3059
3060 p = f;
3061 do
3062 {
3063 if ((unsigned char)*p > 127) {
3064 PyErr_Format(PyExc_ValueError,
3065 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3066 "string, got a non-ASCII byte: 0x%02x",
3067 (unsigned char)*p);
3068 goto fail;
3069 }
3070 p++;
3071 }
3072 while (*p != '\0' && *p != '%');
3073 len = p - f;
3074
3075 if (*p == '\0')
3076 writer.overallocate = 0;
3077
3078 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
3079 goto fail;
3080
3081 f = p;
3082 }
3083 }
3084 va_end(vargs2);
3085 return _PyUnicodeWriter_Finish(&writer);
3086
3087 fail:
3088 va_end(vargs2);
3089 _PyUnicodeWriter_Dealloc(&writer);
3090 return NULL;
3091 }
3092
3093 PyObject *
PyUnicode_FromFormat(const char * format,...)3094 PyUnicode_FromFormat(const char *format, ...)
3095 {
3096 PyObject* ret;
3097 va_list vargs;
3098
3099 va_start(vargs, format);
3100 ret = PyUnicode_FromFormatV(format, vargs);
3101 va_end(vargs);
3102 return ret;
3103 }
3104
3105 static Py_ssize_t
unicode_get_widechar_size(PyObject * unicode)3106 unicode_get_widechar_size(PyObject *unicode)
3107 {
3108 Py_ssize_t res;
3109
3110 assert(unicode != NULL);
3111 assert(_PyUnicode_CHECK(unicode));
3112
3113 res = _PyUnicode_LENGTH(unicode);
3114 #if SIZEOF_WCHAR_T == 2
3115 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3116 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3117 const Py_UCS4 *end = s + res;
3118 for (; s < end; ++s) {
3119 if (*s > 0xFFFF) {
3120 ++res;
3121 }
3122 }
3123 }
3124 #endif
3125 return res;
3126 }
3127
3128 static void
unicode_copy_as_widechar(PyObject * unicode,wchar_t * w,Py_ssize_t size)3129 unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3130 {
3131 assert(unicode != NULL);
3132 assert(_PyUnicode_CHECK(unicode));
3133
3134 if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3135 memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3136 return;
3137 }
3138
3139 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3140 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3141 for (; size--; ++s, ++w) {
3142 *w = *s;
3143 }
3144 }
3145 else {
3146 #if SIZEOF_WCHAR_T == 4
3147 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3148 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3149 for (; size--; ++s, ++w) {
3150 *w = *s;
3151 }
3152 #else
3153 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3154 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3155 for (; size--; ++s, ++w) {
3156 Py_UCS4 ch = *s;
3157 if (ch > 0xFFFF) {
3158 assert(ch <= MAX_UNICODE);
3159 /* encode surrogate pair in this case */
3160 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3161 if (!size--)
3162 break;
3163 *w = Py_UNICODE_LOW_SURROGATE(ch);
3164 }
3165 else {
3166 *w = ch;
3167 }
3168 }
3169 #endif
3170 }
3171 }
3172
3173 #ifdef HAVE_WCHAR_H
3174
3175 /* Convert a Unicode object to a wide character string.
3176
3177 - If w is NULL: return the number of wide characters (including the null
3178 character) required to convert the unicode object. Ignore size argument.
3179
3180 - Otherwise: return the number of wide characters (excluding the null
3181 character) written into w. Write at most size wide characters (including
3182 the null character). */
3183 Py_ssize_t
PyUnicode_AsWideChar(PyObject * unicode,wchar_t * w,Py_ssize_t size)3184 PyUnicode_AsWideChar(PyObject *unicode,
3185 wchar_t *w,
3186 Py_ssize_t size)
3187 {
3188 Py_ssize_t res;
3189
3190 if (unicode == NULL) {
3191 PyErr_BadInternalCall();
3192 return -1;
3193 }
3194 if (!PyUnicode_Check(unicode)) {
3195 PyErr_BadArgument();
3196 return -1;
3197 }
3198
3199 res = unicode_get_widechar_size(unicode);
3200 if (w == NULL) {
3201 return res + 1;
3202 }
3203
3204 if (size > res) {
3205 size = res + 1;
3206 }
3207 else {
3208 res = size;
3209 }
3210 unicode_copy_as_widechar(unicode, w, size);
3211
3212 #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3213 /* Oracle Solaris uses non-Unicode internal wchar_t form for
3214 non-Unicode locales and hence needs conversion first. */
3215 if (_Py_LocaleUsesNonUnicodeWchar()) {
3216 if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
3217 return -1;
3218 }
3219 }
3220 #endif
3221
3222 return res;
3223 }
3224
3225 wchar_t*
PyUnicode_AsWideCharString(PyObject * unicode,Py_ssize_t * size)3226 PyUnicode_AsWideCharString(PyObject *unicode,
3227 Py_ssize_t *size)
3228 {
3229 wchar_t *buffer;
3230 Py_ssize_t buflen;
3231
3232 if (unicode == NULL) {
3233 PyErr_BadInternalCall();
3234 return NULL;
3235 }
3236 if (!PyUnicode_Check(unicode)) {
3237 PyErr_BadArgument();
3238 return NULL;
3239 }
3240
3241 buflen = unicode_get_widechar_size(unicode);
3242 buffer = (wchar_t *) PyMem_New(wchar_t, (buflen + 1));
3243 if (buffer == NULL) {
3244 PyErr_NoMemory();
3245 return NULL;
3246 }
3247 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3248
3249 #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3250 /* Oracle Solaris uses non-Unicode internal wchar_t form for
3251 non-Unicode locales and hence needs conversion first. */
3252 if (_Py_LocaleUsesNonUnicodeWchar()) {
3253 if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
3254 return NULL;
3255 }
3256 }
3257 #endif
3258
3259 if (size != NULL) {
3260 *size = buflen;
3261 }
3262 else if (wcslen(buffer) != (size_t)buflen) {
3263 PyMem_Free(buffer);
3264 PyErr_SetString(PyExc_ValueError,
3265 "embedded null character");
3266 return NULL;
3267 }
3268 return buffer;
3269 }
3270
3271 #endif /* HAVE_WCHAR_H */
3272
3273 int
_PyUnicode_WideCharString_Converter(PyObject * obj,void * ptr)3274 _PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3275 {
3276 wchar_t **p = (wchar_t **)ptr;
3277 if (obj == NULL) {
3278 PyMem_Free(*p);
3279 *p = NULL;
3280 return 1;
3281 }
3282 if (PyUnicode_Check(obj)) {
3283 *p = PyUnicode_AsWideCharString(obj, NULL);
3284 if (*p == NULL) {
3285 return 0;
3286 }
3287 return Py_CLEANUP_SUPPORTED;
3288 }
3289 PyErr_Format(PyExc_TypeError,
3290 "argument must be str, not %.50s",
3291 Py_TYPE(obj)->tp_name);
3292 return 0;
3293 }
3294
3295 int
_PyUnicode_WideCharString_Opt_Converter(PyObject * obj,void * ptr)3296 _PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3297 {
3298 wchar_t **p = (wchar_t **)ptr;
3299 if (obj == NULL) {
3300 PyMem_Free(*p);
3301 *p = NULL;
3302 return 1;
3303 }
3304 if (obj == Py_None) {
3305 *p = NULL;
3306 return 1;
3307 }
3308 if (PyUnicode_Check(obj)) {
3309 *p = PyUnicode_AsWideCharString(obj, NULL);
3310 if (*p == NULL) {
3311 return 0;
3312 }
3313 return Py_CLEANUP_SUPPORTED;
3314 }
3315 PyErr_Format(PyExc_TypeError,
3316 "argument must be str or None, not %.50s",
3317 Py_TYPE(obj)->tp_name);
3318 return 0;
3319 }
3320
3321 PyObject *
PyUnicode_FromOrdinal(int ordinal)3322 PyUnicode_FromOrdinal(int ordinal)
3323 {
3324 if (ordinal < 0 || ordinal > MAX_UNICODE) {
3325 PyErr_SetString(PyExc_ValueError,
3326 "chr() arg not in range(0x110000)");
3327 return NULL;
3328 }
3329
3330 return unicode_char((Py_UCS4)ordinal);
3331 }
3332
3333 PyObject *
PyUnicode_FromObject(PyObject * obj)3334 PyUnicode_FromObject(PyObject *obj)
3335 {
3336 /* XXX Perhaps we should make this API an alias of
3337 PyObject_Str() instead ?! */
3338 if (PyUnicode_CheckExact(obj)) {
3339 return Py_NewRef(obj);
3340 }
3341 if (PyUnicode_Check(obj)) {
3342 /* For a Unicode subtype that's not a Unicode object,
3343 return a true Unicode object with the same data. */
3344 return _PyUnicode_Copy(obj);
3345 }
3346 PyErr_Format(PyExc_TypeError,
3347 "Can't convert '%.100s' object to str implicitly",
3348 Py_TYPE(obj)->tp_name);
3349 return NULL;
3350 }
3351
3352 PyObject *
PyUnicode_FromEncodedObject(PyObject * obj,const char * encoding,const char * errors)3353 PyUnicode_FromEncodedObject(PyObject *obj,
3354 const char *encoding,
3355 const char *errors)
3356 {
3357 Py_buffer buffer;
3358 PyObject *v;
3359
3360 if (obj == NULL) {
3361 PyErr_BadInternalCall();
3362 return NULL;
3363 }
3364
3365 /* Decoding bytes objects is the most common case and should be fast */
3366 if (PyBytes_Check(obj)) {
3367 if (PyBytes_GET_SIZE(obj) == 0) {
3368 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3369 return NULL;
3370 }
3371 _Py_RETURN_UNICODE_EMPTY();
3372 }
3373 return PyUnicode_Decode(
3374 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3375 encoding, errors);
3376 }
3377
3378 if (PyUnicode_Check(obj)) {
3379 PyErr_SetString(PyExc_TypeError,
3380 "decoding str is not supported");
3381 return NULL;
3382 }
3383
3384 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3385 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3386 PyErr_Format(PyExc_TypeError,
3387 "decoding to str: need a bytes-like object, %.80s found",
3388 Py_TYPE(obj)->tp_name);
3389 return NULL;
3390 }
3391
3392 if (buffer.len == 0) {
3393 PyBuffer_Release(&buffer);
3394 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3395 return NULL;
3396 }
3397 _Py_RETURN_UNICODE_EMPTY();
3398 }
3399
3400 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
3401 PyBuffer_Release(&buffer);
3402 return v;
3403 }
3404
/* Normalize an encoding name into the buffer lower (lower_len bytes):
   similar to encodings.normalize_encoding(), but also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase.  Every run of
   other characters is replaced by a single '_', except that leading and
   trailing runs are dropped.

   Return 1 on success, or 0 on error (the normalized name does not fit in
   lower_len-1 characters). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    assert(encoding != NULL);

    char *out = lower;
    /* Last slot of the buffer: reserved for the terminating null byte. */
    char *const out_last = &lower[lower_len - 1];
    int pending_sep = 0;

    for (const char *in = encoding; *in != 0; in++) {
        char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Remember the separator; it is only emitted if a name
               character follows. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == out_last) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_last) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3453
/* Decode size bytes starting at s into a str object.

   - encoding == NULL selects the UTF-8 decoder.
   - A few common encoding names (utf-8/16/32, ascii, latin-1 and, on
     Windows, mbcs) are recognized after normalization and dispatched
     directly to the built-in decoders.
   - Any other name is decoded through the codec registry; a decoder that
     returns something other than a str raises TypeError.

   Return a new str object, or NULL with an exception set. */
PyObject *
PyUnicode_Decode(const char *s,
                 Py_ssize_t size,
                 const char *encoding,
                 const char *errors)
{
    PyObject *buffer = NULL, *unicode;
    Py_buffer info;
    /* Room for the longest shortcut name ("iso_8859_1", 10 characters) plus
       the terminating null byte.  Longer names fail normalization below and
       fall through to the codec registry. */
    char buflower[11];

    /* Runs before the empty-input shortcut, so it applies to empty input
       as well. */
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
        return NULL;
    }

    if (size == 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }

    if (encoding == NULL) {
        return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
    }

    /* Shortcuts for common default encodings */
    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
        char *lower = buflower;

        /* Fast paths */
        if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
            lower += 3;
            if (*lower == '_') {
                /* Match "utf8" and "utf_8" */
                lower++;
            }

            if (lower[0] == '8' && lower[1] == 0) {
                return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
            }
            else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
                return PyUnicode_DecodeUTF16(s, size, errors, 0);
            }
            else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
                return PyUnicode_DecodeUTF32(s, size, errors, 0);
            }
            /* Other "utf..." names fall through to the codec registry. */
        }
        else {
            if (strcmp(lower, "ascii") == 0
                || strcmp(lower, "us_ascii") == 0) {
                return PyUnicode_DecodeASCII(s, size, errors);
            }
    #ifdef MS_WINDOWS
            else if (strcmp(lower, "mbcs") == 0) {
                return PyUnicode_DecodeMBCS(s, size, errors);
            }
    #endif
            else if (strcmp(lower, "latin1") == 0
                     || strcmp(lower, "latin_1") == 0
                     || strcmp(lower, "iso_8859_1") == 0
                     || strcmp(lower, "iso8859_1") == 0) {
                return PyUnicode_DecodeLatin1(s, size, errors);
            }
        }
    }

    /* Decode via the codec registry */
    buffer = NULL;
    /* Wrap the raw input in a read-only buffer (readonly flag is 1) with no
       owning object, then expose it to the codec as a memoryview. */
    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
        goto onError;
    buffer = PyMemoryView_FromBuffer(&info);
    if (buffer == NULL)
        goto onError;
    unicode = _PyCodec_DecodeText(buffer, encoding, errors);
    if (unicode == NULL)
        goto onError;
    if (!PyUnicode_Check(unicode)) {
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
                     "use codecs.decode() to decode to arbitrary types",
                     encoding,
                     Py_TYPE(unicode)->tp_name);
        Py_DECREF(unicode);
        goto onError;
    }
    Py_DECREF(buffer);
    return unicode_result(unicode);

  onError:
    /* buffer is still NULL if the failure happened before the memoryview
       was created. */
    Py_XDECREF(buffer);
    return NULL;
}
3543
3544 PyObject *
PyUnicode_AsDecodedObject(PyObject * unicode,const char * encoding,const char * errors)3545 PyUnicode_AsDecodedObject(PyObject *unicode,
3546 const char *encoding,
3547 const char *errors)
3548 {
3549 if (!PyUnicode_Check(unicode)) {
3550 PyErr_BadArgument();
3551 return NULL;
3552 }
3553
3554 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3555 "PyUnicode_AsDecodedObject() is deprecated; "
3556 "use PyCodec_Decode() to decode from str", 1) < 0)
3557 return NULL;
3558
3559 if (encoding == NULL)
3560 encoding = PyUnicode_GetDefaultEncoding();
3561
3562 /* Decode via the codec registry */
3563 return PyCodec_Decode(unicode, encoding, errors);
3564 }
3565
3566 PyObject *
PyUnicode_AsDecodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3567 PyUnicode_AsDecodedUnicode(PyObject *unicode,
3568 const char *encoding,
3569 const char *errors)
3570 {
3571 PyObject *v;
3572
3573 if (!PyUnicode_Check(unicode)) {
3574 PyErr_BadArgument();
3575 goto onError;
3576 }
3577
3578 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3579 "PyUnicode_AsDecodedUnicode() is deprecated; "
3580 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3581 return NULL;
3582
3583 if (encoding == NULL)
3584 encoding = PyUnicode_GetDefaultEncoding();
3585
3586 /* Decode via the codec registry */
3587 v = PyCodec_Decode(unicode, encoding, errors);
3588 if (v == NULL)
3589 goto onError;
3590 if (!PyUnicode_Check(v)) {
3591 PyErr_Format(PyExc_TypeError,
3592 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3593 "use codecs.decode() to decode to arbitrary types",
3594 encoding,
3595 Py_TYPE(unicode)->tp_name);
3596 Py_DECREF(v);
3597 goto onError;
3598 }
3599 return unicode_result(v);
3600
3601 onError:
3602 return NULL;
3603 }
3604
3605 PyObject *
PyUnicode_AsEncodedObject(PyObject * unicode,const char * encoding,const char * errors)3606 PyUnicode_AsEncodedObject(PyObject *unicode,
3607 const char *encoding,
3608 const char *errors)
3609 {
3610 PyObject *v;
3611
3612 if (!PyUnicode_Check(unicode)) {
3613 PyErr_BadArgument();
3614 goto onError;
3615 }
3616
3617 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3618 "PyUnicode_AsEncodedObject() is deprecated; "
3619 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3620 "or PyCodec_Encode() for generic encoding", 1) < 0)
3621 return NULL;
3622
3623 if (encoding == NULL)
3624 encoding = PyUnicode_GetDefaultEncoding();
3625
3626 /* Encode via the codec registry */
3627 v = PyCodec_Encode(unicode, encoding, errors);
3628 if (v == NULL)
3629 goto onError;
3630 return v;
3631
3632 onError:
3633 return NULL;
3634 }
3635
3636
3637 static PyObject *
unicode_encode_locale(PyObject * unicode,_Py_error_handler error_handler,int current_locale)3638 unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3639 int current_locale)
3640 {
3641 Py_ssize_t wlen;
3642 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3643 if (wstr == NULL) {
3644 return NULL;
3645 }
3646
3647 if ((size_t)wlen != wcslen(wstr)) {
3648 PyErr_SetString(PyExc_ValueError, "embedded null character");
3649 PyMem_Free(wstr);
3650 return NULL;
3651 }
3652
3653 char *str;
3654 size_t error_pos;
3655 const char *reason;
3656 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3657 current_locale, error_handler);
3658 PyMem_Free(wstr);
3659
3660 if (res != 0) {
3661 if (res == -2) {
3662 PyObject *exc;
3663 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3664 "locale", unicode,
3665 (Py_ssize_t)error_pos,
3666 (Py_ssize_t)(error_pos+1),
3667 reason);
3668 if (exc != NULL) {
3669 PyCodec_StrictErrors(exc);
3670 Py_DECREF(exc);
3671 }
3672 }
3673 else if (res == -3) {
3674 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3675 }
3676 else {
3677 PyErr_NoMemory();
3678 }
3679 return NULL;
3680 }
3681
3682 PyObject *bytes = PyBytes_FromString(str);
3683 PyMem_RawFree(str);
3684 return bytes;
3685 }
3686
3687 PyObject *
PyUnicode_EncodeLocale(PyObject * unicode,const char * errors)3688 PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3689 {
3690 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3691 return unicode_encode_locale(unicode, error_handler, 1);
3692 }
3693
/* Encode a str object to bytes using the current interpreter's filesystem
   codec (interp->unicode.fs_codec).

   Three cases are handled, in order:
   - the codec is flagged as UTF-8: use the internal UTF-8 encoder;
   - a codec name is configured (only compiled in when
     _Py_FORCE_UTF8_FS_ENCODING is not defined): use
     PyUnicode_AsEncodedString() with that name;
   - otherwise: fall back to an encoder that does not need the codec
     machinery, with the error handler taken from
     config->filesystem_errors. */
PyObject *
PyUnicode_EncodeFSDefault(PyObject *unicode)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
    if (fs_codec->utf8) {
        return unicode_encode_utf8(unicode,
                                   fs_codec->error_handler,
                                   fs_codec->errors);
    }
#ifndef _Py_FORCE_UTF8_FS_ENCODING
    else if (fs_codec->encoding) {
        return PyUnicode_AsEncodedString(unicode,
                                         fs_codec->encoding,
                                         fs_codec->errors);
    }
#endif
    else {
        /* Before _PyUnicode_InitEncodings() is called, the Python codec
           machinery is not ready and so cannot be used:
           use wcstombs() in this case. */
        const PyConfig *config = _PyInterpreterState_GetConfig(interp);
        const wchar_t *filesystem_errors = config->filesystem_errors;
        assert(filesystem_errors != NULL);
        _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
        assert(errors != _Py_ERROR_UNKNOWN);
#ifdef _Py_FORCE_UTF8_FS_ENCODING
        /* UTF-8 is forced at build time: the locale encoder is never used. */
        return unicode_encode_utf8(unicode, errors, NULL);
#else
        /* current_locale is 0 here. */
        return unicode_encode_locale(unicode, errors, 0);
#endif
    }
}
3727
/* Encode a str object to a bytes object.

   - encoding == NULL selects the UTF-8 encoder.
   - A few common encoding names (utf-8/16/32, ascii, latin-1 and, on
     Windows, mbcs) are recognized after normalization and dispatched
     directly to the built-in encoders.
   - Any other name is encoded through the codec registry.  A bytes result
     is returned as is; a bytearray result triggers a RuntimeWarning and is
     copied into a new bytes object; any other result type raises TypeError.

   Return a new reference, or NULL with an exception set. */
PyObject *
PyUnicode_AsEncodedString(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    PyObject *v;
    /* Room for the longest shortcut name ("iso_8859_1", 10 characters) plus
       the terminating null byte.  Longer names fail normalization below and
       fall through to the codec registry. */
    char buflower[11];

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (unicode_check_encoding_errors(encoding, errors) < 0) {
        return NULL;
    }

    if (encoding == NULL) {
        return _PyUnicode_AsUTF8String(unicode, errors);
    }

    /* Shortcuts for common default encodings */
    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
        char *lower = buflower;

        /* Fast paths */
        if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
            lower += 3;
            if (*lower == '_') {
                /* Match "utf8" and "utf_8" */
                lower++;
            }

            if (lower[0] == '8' && lower[1] == 0) {
                return _PyUnicode_AsUTF8String(unicode, errors);
            }
            else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
                return _PyUnicode_EncodeUTF16(unicode, errors, 0);
            }
            else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
                return _PyUnicode_EncodeUTF32(unicode, errors, 0);
            }
            /* Other "utf..." names fall through to the codec registry. */
        }
        else {
            if (strcmp(lower, "ascii") == 0
                || strcmp(lower, "us_ascii") == 0) {
                return _PyUnicode_AsASCIIString(unicode, errors);
            }
#ifdef MS_WINDOWS
            else if (strcmp(lower, "mbcs") == 0) {
                return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
            }
#endif
            else if (strcmp(lower, "latin1") == 0 ||
                     strcmp(lower, "latin_1") == 0 ||
                     strcmp(lower, "iso_8859_1") == 0 ||
                     strcmp(lower, "iso8859_1") == 0) {
                return _PyUnicode_AsLatin1String(unicode, errors);
            }
        }
    }

    /* Encode via the codec registry */
    v = _PyCodec_EncodeText(unicode, encoding, errors);
    if (v == NULL)
        return NULL;

    /* The normal path */
    if (PyBytes_Check(v))
        return v;

    /* If the codec returns a buffer, raise a warning and convert to bytes */
    if (PyByteArray_Check(v)) {
        int error;
        PyObject *b;

        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
            "encoder %s returned bytearray instead of bytes; "
            "use codecs.encode() to encode to arbitrary types",
            encoding);
        if (error) {
            /* The warning was turned into an exception, or emitting it
               failed. */
            Py_DECREF(v);
            return NULL;
        }

        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
                                      PyByteArray_GET_SIZE(v));
        Py_DECREF(v);
        return b;
    }

    /* The type name is read before v is released. */
    PyErr_Format(PyExc_TypeError,
                 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
                 "use codecs.encode() to encode to arbitrary types",
                 encoding,
                 Py_TYPE(v)->tp_name);
    Py_DECREF(v);
    return NULL;
}
3827
3828 PyObject *
PyUnicode_AsEncodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3829 PyUnicode_AsEncodedUnicode(PyObject *unicode,
3830 const char *encoding,
3831 const char *errors)
3832 {
3833 PyObject *v;
3834
3835 if (!PyUnicode_Check(unicode)) {
3836 PyErr_BadArgument();
3837 goto onError;
3838 }
3839
3840 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3841 "PyUnicode_AsEncodedUnicode() is deprecated; "
3842 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3843 return NULL;
3844
3845 if (encoding == NULL)
3846 encoding = PyUnicode_GetDefaultEncoding();
3847
3848 /* Encode via the codec registry */
3849 v = PyCodec_Encode(unicode, encoding, errors);
3850 if (v == NULL)
3851 goto onError;
3852 if (!PyUnicode_Check(v)) {
3853 PyErr_Format(PyExc_TypeError,
3854 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3855 "use codecs.encode() to encode to arbitrary types",
3856 encoding,
3857 Py_TYPE(v)->tp_name);
3858 Py_DECREF(v);
3859 goto onError;
3860 }
3861 return v;
3862
3863 onError:
3864 return NULL;
3865 }
3866
3867 static PyObject*
unicode_decode_locale(const char * str,Py_ssize_t len,_Py_error_handler errors,int current_locale)3868 unicode_decode_locale(const char *str, Py_ssize_t len,
3869 _Py_error_handler errors, int current_locale)
3870 {
3871 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3872 PyErr_SetString(PyExc_ValueError, "embedded null byte");
3873 return NULL;
3874 }
3875
3876 wchar_t *wstr;
3877 size_t wlen;
3878 const char *reason;
3879 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3880 current_locale, errors);
3881 if (res != 0) {
3882 if (res == -2) {
3883 PyObject *exc;
3884 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3885 "locale", str, len,
3886 (Py_ssize_t)wlen,
3887 (Py_ssize_t)(wlen + 1),
3888 reason);
3889 if (exc != NULL) {
3890 PyCodec_StrictErrors(exc);
3891 Py_DECREF(exc);
3892 }
3893 }
3894 else if (res == -3) {
3895 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3896 }
3897 else {
3898 PyErr_NoMemory();
3899 }
3900 return NULL;
3901 }
3902
3903 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3904 PyMem_RawFree(wstr);
3905 return unicode;
3906 }
3907
3908 PyObject*
PyUnicode_DecodeLocaleAndSize(const char * str,Py_ssize_t len,const char * errors)3909 PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3910 const char *errors)
3911 {
3912 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3913 return unicode_decode_locale(str, len, error_handler, 1);
3914 }
3915
3916 PyObject*
PyUnicode_DecodeLocale(const char * str,const char * errors)3917 PyUnicode_DecodeLocale(const char *str, const char *errors)
3918 {
3919 Py_ssize_t size = (Py_ssize_t)strlen(str);
3920 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3921 return unicode_decode_locale(str, size, error_handler, 1);
3922 }
3923
3924
3925 PyObject*
PyUnicode_DecodeFSDefault(const char * s)3926 PyUnicode_DecodeFSDefault(const char *s) {
3927 Py_ssize_t size = (Py_ssize_t)strlen(s);
3928 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3929 }
3930
/* Decode size bytes starting at s using the current interpreter's
   filesystem codec (interp->unicode.fs_codec).

   Three cases are handled, in order:
   - the codec is flagged as UTF-8: use the internal UTF-8 decoder;
   - a codec name is configured (only compiled in when
     _Py_FORCE_UTF8_FS_ENCODING is not defined): use PyUnicode_Decode()
     with that name;
   - otherwise: fall back to a decoder that does not need the codec
     machinery, with the error handler taken from
     config->filesystem_errors. */
PyObject*
PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
    if (fs_codec->utf8) {
        return unicode_decode_utf8(s, size,
                                   fs_codec->error_handler,
                                   fs_codec->errors,
                                   NULL);
    }
#ifndef _Py_FORCE_UTF8_FS_ENCODING
    else if (fs_codec->encoding) {
        return PyUnicode_Decode(s, size,
                                fs_codec->encoding,
                                fs_codec->errors);
    }
#endif
    else {
        /* Before _PyUnicode_InitEncodings() is called, the Python codec
           machinery is not ready and so cannot be used:
           use mbstowcs() in this case. */
        const PyConfig *config = _PyInterpreterState_GetConfig(interp);
        const wchar_t *filesystem_errors = config->filesystem_errors;
        assert(filesystem_errors != NULL);
        _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
        assert(errors != _Py_ERROR_UNKNOWN);
#ifdef _Py_FORCE_UTF8_FS_ENCODING
        /* UTF-8 is forced at build time: the locale decoder is never used. */
        return unicode_decode_utf8(s, size, errors, NULL, NULL);
#else
        /* current_locale is 0 here. */
        return unicode_decode_locale(s, size, errors, 0);
#endif
    }
}
3965
3966
3967 int
PyUnicode_FSConverter(PyObject * arg,void * addr)3968 PyUnicode_FSConverter(PyObject* arg, void* addr)
3969 {
3970 PyObject *path = NULL;
3971 PyObject *output = NULL;
3972 Py_ssize_t size;
3973 const char *data;
3974 if (arg == NULL) {
3975 Py_DECREF(*(PyObject**)addr);
3976 *(PyObject**)addr = NULL;
3977 return 1;
3978 }
3979 path = PyOS_FSPath(arg);
3980 if (path == NULL) {
3981 return 0;
3982 }
3983 if (PyBytes_Check(path)) {
3984 output = path;
3985 }
3986 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3987 output = PyUnicode_EncodeFSDefault(path);
3988 Py_DECREF(path);
3989 if (!output) {
3990 return 0;
3991 }
3992 assert(PyBytes_Check(output));
3993 }
3994
3995 size = PyBytes_GET_SIZE(output);
3996 data = PyBytes_AS_STRING(output);
3997 if ((size_t)size != strlen(data)) {
3998 PyErr_SetString(PyExc_ValueError, "embedded null byte");
3999 Py_DECREF(output);
4000 return 0;
4001 }
4002 *(PyObject**)addr = output;
4003 return Py_CLEANUP_SUPPORTED;
4004 }
4005
4006
4007 int
PyUnicode_FSDecoder(PyObject * arg,void * addr)4008 PyUnicode_FSDecoder(PyObject* arg, void* addr)
4009 {
4010 if (arg == NULL) {
4011 Py_DECREF(*(PyObject**)addr);
4012 *(PyObject**)addr = NULL;
4013 return 1;
4014 }
4015
4016 PyObject *path = PyOS_FSPath(arg);
4017 if (path == NULL) {
4018 return 0;
4019 }
4020
4021 PyObject *output = NULL;
4022 if (PyUnicode_Check(path)) {
4023 output = path;
4024 }
4025 else if (PyBytes_Check(path)) {
4026 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path),
4027 PyBytes_GET_SIZE(path));
4028 Py_DECREF(path);
4029 if (!output) {
4030 return 0;
4031 }
4032 }
4033 else {
4034 PyErr_Format(PyExc_TypeError,
4035 "path should be string, bytes, or os.PathLike, not %.200s",
4036 Py_TYPE(arg)->tp_name);
4037 Py_DECREF(path);
4038 return 0;
4039 }
4040
4041 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
4042 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
4043 PyErr_SetString(PyExc_ValueError, "embedded null character");
4044 Py_DECREF(output);
4045 return 0;
4046 }
4047 *(PyObject**)addr = output;
4048 return Py_CLEANUP_SUPPORTED;
4049 }
4050
4051
4052 static int unicode_fill_utf8(PyObject *unicode);
4053
4054 const char *
PyUnicode_AsUTF8AndSize(PyObject * unicode,Py_ssize_t * psize)4055 PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
4056 {
4057 if (!PyUnicode_Check(unicode)) {
4058 PyErr_BadArgument();
4059 if (psize) {
4060 *psize = -1;
4061 }
4062 return NULL;
4063 }
4064
4065 if (PyUnicode_UTF8(unicode) == NULL) {
4066 if (unicode_fill_utf8(unicode) == -1) {
4067 if (psize) {
4068 *psize = -1;
4069 }
4070 return NULL;
4071 }
4072 }
4073
4074 if (psize) {
4075 *psize = PyUnicode_UTF8_LENGTH(unicode);
4076 }
4077 return PyUnicode_UTF8(unicode);
4078 }
4079
4080 const char *
PyUnicode_AsUTF8(PyObject * unicode)4081 PyUnicode_AsUTF8(PyObject *unicode)
4082 {
4083 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4084 }
4085
4086 const char *
_PyUnicode_AsUTF8NoNUL(PyObject * unicode)4087 _PyUnicode_AsUTF8NoNUL(PyObject *unicode)
4088 {
4089 Py_ssize_t size;
4090 const char *s = PyUnicode_AsUTF8AndSize(unicode, &size);
4091 if (s && strlen(s) != (size_t)size) {
4092 PyErr_SetString(PyExc_ValueError, "embedded null character");
4093 return NULL;
4094 }
4095 return s;
4096 }
4097
4098 /*
4099 PyUnicode_GetSize() has been deprecated since Python 3.3
4100 because it returned length of Py_UNICODE.
4101
4102 But this function is part of stable abi, because it doesn't
4103 include Py_UNICODE in signature and it was not excluded from
4104 stable ABI in PEP 384.
4105 */
4106 PyAPI_FUNC(Py_ssize_t)
PyUnicode_GetSize(PyObject * unicode)4107 PyUnicode_GetSize(PyObject *unicode)
4108 {
4109 PyErr_SetString(PyExc_RuntimeError,
4110 "PyUnicode_GetSize has been removed.");
4111 return -1;
4112 }
4113
4114 Py_ssize_t
PyUnicode_GetLength(PyObject * unicode)4115 PyUnicode_GetLength(PyObject *unicode)
4116 {
4117 if (!PyUnicode_Check(unicode)) {
4118 PyErr_BadArgument();
4119 return -1;
4120 }
4121 return PyUnicode_GET_LENGTH(unicode);
4122 }
4123
4124 Py_UCS4
PyUnicode_ReadChar(PyObject * unicode,Py_ssize_t index)4125 PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4126 {
4127 const void *data;
4128 int kind;
4129
4130 if (!PyUnicode_Check(unicode)) {
4131 PyErr_BadArgument();
4132 return (Py_UCS4)-1;
4133 }
4134 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4135 PyErr_SetString(PyExc_IndexError, "string index out of range");
4136 return (Py_UCS4)-1;
4137 }
4138 data = PyUnicode_DATA(unicode);
4139 kind = PyUnicode_KIND(unicode);
4140 return PyUnicode_READ(kind, data, index);
4141 }
4142
4143 int
PyUnicode_WriteChar(PyObject * unicode,Py_ssize_t index,Py_UCS4 ch)4144 PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4145 {
4146 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4147 PyErr_BadArgument();
4148 return -1;
4149 }
4150 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4151 PyErr_SetString(PyExc_IndexError, "string index out of range");
4152 return -1;
4153 }
4154 if (unicode_check_modifiable(unicode))
4155 return -1;
4156 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4157 PyErr_SetString(PyExc_ValueError, "character out of range");
4158 return -1;
4159 }
4160 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4161 index, ch);
4162 return 0;
4163 }
4164
/* Return the name of the default encoding: always "utf-8".
   The returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4170
4171 /* create or adjust a UnicodeDecodeError */
4172 static void
make_decode_exception(PyObject ** exceptionObject,const char * encoding,const char * input,Py_ssize_t length,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)4173 make_decode_exception(PyObject **exceptionObject,
4174 const char *encoding,
4175 const char *input, Py_ssize_t length,
4176 Py_ssize_t startpos, Py_ssize_t endpos,
4177 const char *reason)
4178 {
4179 if (*exceptionObject == NULL) {
4180 *exceptionObject = PyUnicodeDecodeError_Create(
4181 encoding, input, length, startpos, endpos, reason);
4182 }
4183 else {
4184 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4185 goto onError;
4186 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4187 goto onError;
4188 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4189 goto onError;
4190 }
4191 return;
4192
4193 onError:
4194 Py_CLEAR(*exceptionObject);
4195 }
4196
4197 #ifdef MS_WINDOWS
4198 static int
widechar_resize(wchar_t ** buf,Py_ssize_t * size,Py_ssize_t newsize)4199 widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4200 {
4201 if (newsize > *size) {
4202 wchar_t *newbuf = *buf;
4203 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4204 PyErr_NoMemory();
4205 return -1;
4206 }
4207 *buf = newbuf;
4208 }
4209 *size = newsize;
4210 return 0;
4211 }
4212
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   In/out state, all passed by pointer:
   - errorHandler: looked up from errors on first use and cached.
   - exceptionObject: created on first use, updated on later calls.
   - input, inend: reloaded from the exception's object after the callback.
   - endinpos, inptr: set to the position returned by the callback.
   - buf, bufsize, outpos: wchar_t output buffer, its size and the current
     write position; the buffer is grown when needed.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
{
    /* PyArg_ParseTuple() format; the text after "Un;" doubles as the error
       message used when the result is not a tuple. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    Py_ssize_t repwlen;

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[3] skips the "Un;" format prefix. */
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    /* repunicode is not released separately below; restuple is what gets
       released. */
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    /* With a NULL buffer, PyUnicode_AsWideChar() returns the required length
       including the null character; the decrement removes the null. */
    repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
    if (repwlen < 0)
        goto onError;
    repwlen--;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    /* Each addition is checked against PY_SSIZE_T_MAX before it is done. */
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    outsize = *bufsize;
    if (requiredsize > outsize) {
        /* Grow to at least twice the current size when doubling cannot
           overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (widechar_resize(buf, bufsize, requiredsize) < 0) {
            goto onError;
        }
    }
    /* Copy exactly repwlen wide characters at the current output position. */
    PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");
    /* falls through to the common cleanup */

  onError:
    Py_XDECREF(restuple);
    return -1;
}
4322 #endif /* MS_WINDOWS */
4323
/* Call the decoding error handler and append its replacement text to
   a _PyUnicodeWriter.

   errors, errorHandler: name of the error handler and a cache slot for the
       looked-up callable; the lookup is only done while *errorHandler is
       NULL.
   encoding, reason, *startinpos, *endinpos: passed to
       make_decode_exception() together with the current input buffer.
   input, inend: in/out pointers to the start and end of the input buffer.
       They are reloaded from the exception's object after the callback has
       run, so they may point to a different buffer on return.
   exceptionObject: in/out slot for the exception instance handed to the
       handler.
   inptr, endinpos: on success, set to the position at which the handler
       asked decoding to resume.
   writer: receives the replacement string.

   Return 0 on success, -1 with an exception set on failure. */
static int
unicode_decode_call_errorhandler_writer(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
{
    /* PyArg_ParseTuple() format; the text after "Un;" doubles as the
       TypeError message used below via &argparse[3]. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t insize;
    Py_ssize_t newpos;
    Py_ssize_t replen;
    Py_ssize_t remain;
    PyObject *inputobj = NULL;
    int need_to_grow = 0;
    const char *new_inptr;

    /* Look the handler up once and cache it in the caller's slot. */
    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* Skip the "Un;" prefix to get the bare message. */
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    /* Number of input bytes that were still pending after the error span,
       measured against the buffer as it was before the callback. */
    remain = *inend - *input - *endinpos;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative resume position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    replen = PyUnicode_GET_LENGTH(repunicode);
    if (replen > 1) {
        /* Only the characters beyond the first add to min_length. */
        writer->min_length += replen - 1;
        need_to_grow = 1;
    }
    new_inptr = *input + newpos;
    if (*inend - new_inptr > remain) {
        /* We don't know the decoding algorithm here so we make the worst
           assumption that one byte decodes to one unicode character.
           If unfortunately one byte could decode to more unicode characters,
           the decoder may write out-of-bound then.  Is it possible for the
           algorithms using this function? */
        writer->min_length += *inend - new_inptr - remain;
        need_to_grow = 1;
    }
    if (need_to_grow) {
        writer->overallocate = 1;
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
            goto onError;
    }
    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
        goto onError;

    /* Tell the caller where to resume decoding. */
    *endinpos = newpos;
    *inptr = new_inptr;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  onError:
    Py_XDECREF(restuple);
    return -1;
}
4423
4424 /* --- UTF-7 Codec -------------------------------------------------------- */
4425
4426 /* See RFC2152 for details. We encode conservatively and decode liberally. */
4427
4428 /* Three simple macros defining base-64. */
4429
/* IS_BASE64(c): true when c is one of the 64 base-64 alphabet characters
 * (letters, digits, '+' and '/'). */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' || \
     ((c) >= '0' && (c) <= '9') || \
     ((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z'))

/* FROM_BASE64(c): the 6-bit value of c, which must already be known to be
 * a base-64 character; any other input falls into the final branch and
 * yields 63. */

#define FROM_BASE64(c) \
    (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
     (c) == '+' ? 62 : 63)

/* TO_BASE64(n): the base-64 character for the bottom 6 bits of n. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
     "abcdefghijklmnopqrstuvwxyz" \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT(c): this byte encountered in a UTF-7 string should be
 * decoded as itself.  Decoding is permissive: every ASCII byte decodes
 * to itself except '+', which begins a base64 string. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
4458
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only, so it
 * is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is range-checked (0 < c < 128) before it is used as a table index,
 * so NUL and non-ASCII code points are never encoded directly. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
4504
4505 PyObject *
PyUnicode_DecodeUTF7(const char * s,Py_ssize_t size,const char * errors)4506 PyUnicode_DecodeUTF7(const char *s,
4507 Py_ssize_t size,
4508 const char *errors)
4509 {
4510 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4511 }
4512
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * s, size:  the UTF-7 encoded input bytes.
 * errors:   error handler name, passed on to
 *           unicode_decode_call_errorhandler_writer().
 * consumed: if non-NULL, receives the number of input bytes consumed and
 *           an unterminated shift sequence at the end of the input is not
 *           an error (it is left unconsumed).  If NULL, the input is
 *           treated as complete.
 *
 * Returns a new str object, or NULL with an exception set. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;          /* input offset of the current '+' or bad byte */
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;                /* non-zero while inside a base-64 section */
    Py_ssize_t shiftOutStart;       /* writer.pos when the current shift began */
    unsigned int base64bits = 0;    /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0;
    Py_UCS4 surrogate = 0;          /* pending high surrogate, or 0 */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
      restart:
        /* Also entered by goto from the end-of-input handling below, when
           the error handler moved the read position back into the input. */
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* Unpaired high surrogate: emit it as is and
                               handle outCh normally below. */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* A pending high surrogate is only flushed when the
                   terminating byte is one that decodes directly. */
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else if (s < e && !IS_BASE64(*s)) {
                s++;
                errmsg = "ill-formed sequence";
                goto utf7Error;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* Shared error exit: the offending span is [startinpos, s).  The
           handler may replace the input buffer and move s. */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* The handler may have asked to resume before the end of the
               input; if so, re-enter the decoding loop. */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            /* Unfinished shift sequence: report everything from its '+'
               onwards as unconsumed and drop its output. */
            *consumed = startinpos;
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                /* NOTE(review): presumably this copies the kept prefix
                   instead of truncating the writer because characters
                   written during the discarded shift section may have
                   raised writer.maxchar -- confirm. */
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4716
4717
4718 PyObject *
_PyUnicode_EncodeUTF7(PyObject * str,int base64SetO,int base64WhiteSpace,const char * errors)4719 _PyUnicode_EncodeUTF7(PyObject *str,
4720 int base64SetO,
4721 int base64WhiteSpace,
4722 const char *errors)
4723 {
4724 int kind;
4725 const void *data;
4726 Py_ssize_t len;
4727 PyObject *v;
4728 int inShift = 0;
4729 Py_ssize_t i;
4730 unsigned int base64bits = 0;
4731 unsigned long base64buffer = 0;
4732 char * out;
4733 const char * start;
4734
4735 kind = PyUnicode_KIND(str);
4736 data = PyUnicode_DATA(str);
4737 len = PyUnicode_GET_LENGTH(str);
4738
4739 if (len == 0)
4740 return PyBytes_FromStringAndSize(NULL, 0);
4741
4742 /* It might be possible to tighten this worst case */
4743 if (len > PY_SSIZE_T_MAX / 8)
4744 return PyErr_NoMemory();
4745 v = PyBytes_FromStringAndSize(NULL, len * 8);
4746 if (v == NULL)
4747 return NULL;
4748
4749 start = out = PyBytes_AS_STRING(v);
4750 for (i = 0; i < len; ++i) {
4751 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4752
4753 if (inShift) {
4754 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4755 /* shifting out */
4756 if (base64bits) { /* output remaining bits */
4757 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4758 base64buffer = 0;
4759 base64bits = 0;
4760 }
4761 inShift = 0;
4762 /* Characters not in the BASE64 set implicitly unshift the sequence
4763 so no '-' is required, except if the character is itself a '-' */
4764 if (IS_BASE64(ch) || ch == '-') {
4765 *out++ = '-';
4766 }
4767 *out++ = (char) ch;
4768 }
4769 else {
4770 goto encode_char;
4771 }
4772 }
4773 else { /* not in a shift sequence */
4774 if (ch == '+') {
4775 *out++ = '+';
4776 *out++ = '-';
4777 }
4778 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4779 *out++ = (char) ch;
4780 }
4781 else {
4782 *out++ = '+';
4783 inShift = 1;
4784 goto encode_char;
4785 }
4786 }
4787 continue;
4788 encode_char:
4789 if (ch >= 0x10000) {
4790 assert(ch <= MAX_UNICODE);
4791
4792 /* code first surrogate */
4793 base64bits += 16;
4794 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
4795 while (base64bits >= 6) {
4796 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4797 base64bits -= 6;
4798 }
4799 /* prepare second surrogate */
4800 ch = Py_UNICODE_LOW_SURROGATE(ch);
4801 }
4802 base64bits += 16;
4803 base64buffer = (base64buffer << 16) | ch;
4804 while (base64bits >= 6) {
4805 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4806 base64bits -= 6;
4807 }
4808 }
4809 if (base64bits)
4810 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4811 if (inShift)
4812 *out++ = '-';
4813 if (_PyBytes_Resize(&v, out - start) < 0)
4814 return NULL;
4815 return v;
4816 }
4817
4818 #undef IS_BASE64
4819 #undef FROM_BASE64
4820 #undef TO_BASE64
4821 #undef DECODE_DIRECT
4822 #undef ENCODE_DIRECT
4823
4824 /* --- UTF-8 Codec -------------------------------------------------------- */
4825
4826 PyObject *
PyUnicode_DecodeUTF8(const char * s,Py_ssize_t size,const char * errors)4827 PyUnicode_DecodeUTF8(const char *s,
4828 Py_ssize_t size,
4829 const char *errors)
4830 {
4831 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4832 }
4833
4834 #include "stringlib/asciilib.h"
4835 #include "stringlib/codecs.h"
4836 #include "stringlib/undef.h"
4837
4838 #include "stringlib/ucs1lib.h"
4839 #include "stringlib/codecs.h"
4840 #include "stringlib/undef.h"
4841
4842 #include "stringlib/ucs2lib.h"
4843 #include "stringlib/codecs.h"
4844 #include "stringlib/undef.h"
4845
4846 #include "stringlib/ucs4lib.h"
4847 #include "stringlib/codecs.h"
4848 #include "stringlib/undef.h"
4849
/* Mask to quickly check whether a C 'size_t' contains a
   non-ASCII, UTF8-encoded char.  The mask has the top bit (0x80) of every
   byte set, so `word & ASCII_CHAR_MASK` is non-zero exactly when at least
   one byte of the word is >= 0x80.  Only 4- and 8-byte size_t are
   supported; anything else is a compile-time error. */
#if (SIZEOF_SIZE_T == 8)
# define ASCII_CHAR_MASK 0x8080808080808080ULL
#elif (SIZEOF_SIZE_T == 4)
# define ASCII_CHAR_MASK 0x80808080U
#else
# error C 'size_t' size should be either 4 or 8!
#endif
4859
4860 static Py_ssize_t
ascii_decode(const char * start,const char * end,Py_UCS1 * dest)4861 ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
4862 {
4863 const char *p = start;
4864
4865 #if SIZEOF_SIZE_T <= SIZEOF_VOID_P
4866 assert(_Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T));
4867 if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
4868 /* Fast path, see in STRINGLIB(utf8_decode) for
4869 an explanation. */
4870 /* Help allocation */
4871 const char *_p = p;
4872 Py_UCS1 * q = dest;
4873 while (_p + SIZEOF_SIZE_T <= end) {
4874 size_t value = *(const size_t *) _p;
4875 if (value & ASCII_CHAR_MASK)
4876 break;
4877 *((size_t *)q) = value;
4878 _p += SIZEOF_SIZE_T;
4879 q += SIZEOF_SIZE_T;
4880 }
4881 p = _p;
4882 while (p < end) {
4883 if ((unsigned char)*p & 0x80)
4884 break;
4885 *q++ = *p++;
4886 }
4887 return p - start;
4888 }
4889 #endif
4890 while (p < end) {
4891 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4892 for an explanation. */
4893 if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
4894 /* Help allocation */
4895 const char *_p = p;
4896 while (_p + SIZEOF_SIZE_T <= end) {
4897 size_t value = *(const size_t *) _p;
4898 if (value & ASCII_CHAR_MASK)
4899 break;
4900 _p += SIZEOF_SIZE_T;
4901 }
4902 p = _p;
4903 if (_p == end)
4904 break;
4905 }
4906 if ((unsigned char)*p & 0x80)
4907 break;
4908 ++p;
4909 }
4910 memcpy(dest, start, p - start);
4911 return p - start;
4912 }
4913
/* Decode `size` bytes of UTF-8 starting at `s` into a new str object.

   error_handler, errors: the error handler to use.  When error_handler is
       _Py_ERROR_UNKNOWN it is resolved from `errors` with
       _Py_GetErrorHandler() the first time an error is hit.  ignore,
       replace and surrogateescape are handled inline; every other handler
       goes through unicode_decode_call_errorhandler_writer().
   consumed: if non-NULL, receives the number of input bytes consumed, and
       an incomplete sequence at the end of the input stops decoding
       instead of being reported as an error.

   Returns a new reference, or NULL with an exception set. */
static PyObject *
unicode_decode_utf8(const char *s, Py_ssize_t size,
                    _Py_error_handler error_handler, const char *errors,
                    Py_ssize_t *consumed)
{
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    if (size == 1 && (unsigned char)s[0] < 128) {
        if (consumed) {
            *consumed = 1;
        }
        return get_latin1_char((unsigned char)s[0]);
    }

    const char *starts = s;
    const char *end = s + size;

    // fast path: try ASCII string.
    PyObject *u = PyUnicode_New(size, 127);
    if (u == NULL) {
        return NULL;
    }
    s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
    if (s == end) {
        if (consumed) {
            *consumed = size;
        }
        return u;
    }

    // Use _PyUnicodeWriter after fast path is failed.
    // The writer takes over `u`; the ASCII prefix already decoded into it
    // is kept by starting the write position after that prefix.
    _PyUnicodeWriter writer;
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
    writer.pos = s - starts;

    Py_ssize_t startinpos, endinpos;
    const char *errmsg = "";
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;

    while (s < end) {
        Py_UCS4 ch;
        int kind = writer.kind;

        /* Use the decoder variant matching the writer's current storage
           kind. */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (PyUnicode_IS_ASCII(writer.buffer))
                ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
            else
                ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else if (kind == PyUnicode_2BYTE_KIND) {
            ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else {
            assert(kind == PyUnicode_4BYTE_KIND);
            ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
        }

        /* Values 0..4 are status codes; anything else is a decoded
           character that still has to be written (see the default case). */
        switch (ch) {
        case 0:
            /* Either all input was decoded, or it ends in the middle of a
               sequence. */
            if (s == end || consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = s - starts;
            endinpos = end - starts;
            break;
        case 1:
            errmsg = "invalid start byte";
            startinpos = s - starts;
            endinpos = startinpos + 1;
            break;
        case 2:
            if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
                && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
            {
                /* Truncated surrogate code in range D800-DFFF */
                goto End;
            }
            /* fall through */
        case 3:
        case 4:
            errmsg = "invalid continuation byte";
            startinpos = s - starts;
            /* the error span covers ch - 1 bytes */
            endinpos = startinpos + ch - 1;
            break;
        default:
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
            continue;
        }

        if (error_handler == _Py_ERROR_UNKNOWN)
            error_handler = _Py_GetErrorHandler(errors);

        switch (error_handler) {
        case _Py_ERROR_IGNORE:
            /* skip the offending bytes */
            s += (endinpos - startinpos);
            break;

        case _Py_ERROR_REPLACE:
            /* one U+FFFD for the whole offending span */
            if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
                goto onError;
            s += (endinpos - startinpos);
            break;

        case _Py_ERROR_SURROGATEESCAPE:
        {
            Py_ssize_t i;

            /* Each offending byte b is written as the code point
               0xDC00 + b, which needs at least 2-byte storage.
               NOTE(review): no capacity check is made here; presumably
               the initial buffer of `size` characters always has room
               for one character per input byte -- confirm. */
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
                goto onError;
            for (i=startinpos; i<endinpos; i++) {
                ch = (Py_UCS4)(unsigned char)(starts[i]);
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
                                ch + 0xdc00);
                writer.pos++;
            }
            s += (endinpos - startinpos);
            break;
        }

        default:
            /* Generic path: call the error handler object, which may also
               update starts, end and s. */
            if (unicode_decode_call_errorhandler_writer(
                    errors, &error_handler_obj,
                    "utf-8", errmsg,
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
        }
    }

End:
    if (consumed)
        *consumed = s - starts;

    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

onError:
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
5062
5063
5064 PyObject *
PyUnicode_DecodeUTF8Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)5065 PyUnicode_DecodeUTF8Stateful(const char *s,
5066 Py_ssize_t size,
5067 const char *errors,
5068 Py_ssize_t *consumed)
5069 {
5070 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5071 }
5072
5073
/* UTF-8 decoder to wchar_t.  `errors` selects the error handling: only
   _Py_ERROR_STRICT, _Py_ERROR_SURROGATEESCAPE and _Py_ERROR_SURROGATEPASS
   are supported.

   On success, return 0, write a pointer to a newly allocated wide character
   string into *wstr (use PyMem_RawFree() to free the memory) and write the
   output length (in number of wchar_t units) into *wlen (if wlen is set).
   *wstr is only written on success.

   On memory allocation failure (or if size is too large to allocate the
   output buffer), return -1.

   On decoding error (if the surrogateescape handler is not used), return -2.
   If wlen is non-NULL, write the start of the illegal byte sequence into
   *wlen. If reason is not NULL, write the decoding error message into
   *reason.

   If `errors` is not one of the supported handlers, return -3. */
int
_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
                 const char **reason, _Py_error_handler errors)
{
    const char *orig_s = s;
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    int surrogateescape = 0;
    int surrogatepass = 0;
    switch (errors)
    {
    case _Py_ERROR_STRICT:
        break;
    case _Py_ERROR_SURROGATEESCAPE:
        surrogateescape = 1;
        break;
    case _Py_ERROR_SURROGATEPASS:
        surrogatepass = 1;
        break;
    default:
        return -3;
    }

    /* Note: size will always be longer than the resulting Unicode
       character count */
    /* Guard the (size + 1) * sizeof(wchar_t) computation below against
       overflow. */
    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1 < size) {
        return -1;
    }

    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode) {
        return -1;
    }

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        /* Values up to 0xFF are treated as status codes below; a larger
           value is a decoded character that was not stored by the call. */
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            Py_UNREACHABLE();
#else
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* write a surrogate pair */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* status 0 with the whole input read: done */
            if (!ch && s == e) {
                break;
            }

            if (surrogateescape) {
                /* escape one undecodable byte b as 0xDC00 + b and go on */
                unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
            }
            else {
                /* Is it a valid three-byte code? */
                if (surrogatepass
                    && (e - s) >= 3
                    && (s[0] & 0xf0) == 0xe0
                    && (s[1] & 0xc0) == 0x80
                    && (s[2] & 0xc0) == 0x80)
                {
                    ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
                    s += 3;
                    unicode[outpos++] = ch;
                }
                else {
                    /* Decoding error: release the buffer and report the
                       reason and the offset of the bad sequence. */
                    PyMem_RawFree(unicode );
                    if (reason != NULL) {
                        switch (ch) {
                        case 0:
                            *reason = "unexpected end of data";
                            break;
                        case 1:
                            *reason = "invalid start byte";
                            break;
                        /* 2, 3, 4 */
                        default:
                            *reason = "invalid continuation byte";
                            break;
                        }
                    }
                    if (wlen != NULL) {
                        *wlen = s - orig_s;
                    }
                    return -2;
                }
            }
        }
    }
    unicode[outpos] = L'\0';
    if (wlen) {
        *wlen = outpos;
    }
    *wstr = unicode;
    return 0;
}
5193
5194
5195 wchar_t*
_Py_DecodeUTF8_surrogateescape(const char * arg,Py_ssize_t arglen,size_t * wlen)5196 _Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5197 size_t *wlen)
5198 {
5199 wchar_t *wstr;
5200 int res = _Py_DecodeUTF8Ex(arg, arglen,
5201 &wstr, wlen,
5202 NULL, _Py_ERROR_SURROGATEESCAPE);
5203 if (res != 0) {
5204 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5205 assert(res != -3);
5206 if (wlen) {
5207 *wlen = (size_t)res;
5208 }
5209 return NULL;
5210 }
5211 return wstr;
5212 }
5213
5214
5215 /* UTF-8 encoder using the surrogateescape error handler .
5216
5217 On success, return 0 and write the newly allocated character string (use
5218 PyMem_Free() to free the memory) into *str.
5219
5220 On encoding failure, return -2 and write the position of the invalid
5221 surrogate character into *error_pos (if error_pos is set) and the decoding
5222 error message into *reason (if reason is set).
5223
5224 On memory allocation failure, return -1. */
5225 int
_Py_EncodeUTF8Ex(const wchar_t * text,char ** str,size_t * error_pos,const char ** reason,int raw_malloc,_Py_error_handler errors)5226 _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5227 const char **reason, int raw_malloc, _Py_error_handler errors)
5228 {
5229 const Py_ssize_t max_char_size = 4;
5230 Py_ssize_t len = wcslen(text);
5231
5232 assert(len >= 0);
5233
5234 int surrogateescape = 0;
5235 int surrogatepass = 0;
5236 switch (errors)
5237 {
5238 case _Py_ERROR_STRICT:
5239 break;
5240 case _Py_ERROR_SURROGATEESCAPE:
5241 surrogateescape = 1;
5242 break;
5243 case _Py_ERROR_SURROGATEPASS:
5244 surrogatepass = 1;
5245 break;
5246 default:
5247 return -3;
5248 }
5249
5250 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5251 return -1;
5252 }
5253 char *bytes;
5254 if (raw_malloc) {
5255 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5256 }
5257 else {
5258 bytes = PyMem_Malloc((len + 1) * max_char_size);
5259 }
5260 if (bytes == NULL) {
5261 return -1;
5262 }
5263
5264 char *p = bytes;
5265 Py_ssize_t i;
5266 for (i = 0; i < len; ) {
5267 Py_ssize_t ch_pos = i;
5268 Py_UCS4 ch = text[i];
5269 i++;
5270 #if Py_UNICODE_SIZE == 2
5271 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5272 && i < len
5273 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5274 {
5275 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5276 i++;
5277 }
5278 #endif
5279
5280 if (ch < 0x80) {
5281 /* Encode ASCII */
5282 *p++ = (char) ch;
5283
5284 }
5285 else if (ch < 0x0800) {
5286 /* Encode Latin-1 */
5287 *p++ = (char)(0xc0 | (ch >> 6));
5288 *p++ = (char)(0x80 | (ch & 0x3f));
5289 }
5290 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
5291 /* surrogateescape error handler */
5292 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5293 if (error_pos != NULL) {
5294 *error_pos = (size_t)ch_pos;
5295 }
5296 if (reason != NULL) {
5297 *reason = "encoding error";
5298 }
5299 if (raw_malloc) {
5300 PyMem_RawFree(bytes);
5301 }
5302 else {
5303 PyMem_Free(bytes);
5304 }
5305 return -2;
5306 }
5307 *p++ = (char)(ch & 0xff);
5308 }
5309 else if (ch < 0x10000) {
5310 *p++ = (char)(0xe0 | (ch >> 12));
5311 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5312 *p++ = (char)(0x80 | (ch & 0x3f));
5313 }
5314 else { /* ch >= 0x10000 */
5315 assert(ch <= MAX_UNICODE);
5316 /* Encode UCS4 Unicode ordinals */
5317 *p++ = (char)(0xf0 | (ch >> 18));
5318 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5319 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5320 *p++ = (char)(0x80 | (ch & 0x3f));
5321 }
5322 }
5323 *p++ = '\0';
5324
5325 size_t final_size = (p - bytes);
5326 char *bytes2;
5327 if (raw_malloc) {
5328 bytes2 = PyMem_RawRealloc(bytes, final_size);
5329 }
5330 else {
5331 bytes2 = PyMem_Realloc(bytes, final_size);
5332 }
5333 if (bytes2 == NULL) {
5334 if (error_pos != NULL) {
5335 *error_pos = (size_t)-1;
5336 }
5337 if (raw_malloc) {
5338 PyMem_RawFree(bytes);
5339 }
5340 else {
5341 PyMem_Free(bytes);
5342 }
5343 return -1;
5344 }
5345 *str = bytes2;
5346 return 0;
5347 }
5348
5349
5350 /* Primary internal function which creates utf8 encoded bytes objects.
5351
5352 Allocation strategy: if the string is short, convert into a stack buffer
5353 and allocate exactly as much space needed at the end. Else allocate the
5354 maximum possible needed (4 result bytes per Unicode character), and return
5355 the excess memory at the end.
5356 */
5357 static PyObject *
unicode_encode_utf8(PyObject * unicode,_Py_error_handler error_handler,const char * errors)5358 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5359 const char *errors)
5360 {
5361 if (!PyUnicode_Check(unicode)) {
5362 PyErr_BadArgument();
5363 return NULL;
5364 }
5365
5366 if (PyUnicode_UTF8(unicode))
5367 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5368 PyUnicode_UTF8_LENGTH(unicode));
5369
5370 int kind = PyUnicode_KIND(unicode);
5371 const void *data = PyUnicode_DATA(unicode);
5372 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5373
5374 _PyBytesWriter writer;
5375 char *end;
5376
5377 switch (kind) {
5378 default:
5379 Py_UNREACHABLE();
5380 case PyUnicode_1BYTE_KIND:
5381 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5382 assert(!PyUnicode_IS_ASCII(unicode));
5383 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5384 break;
5385 case PyUnicode_2BYTE_KIND:
5386 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5387 break;
5388 case PyUnicode_4BYTE_KIND:
5389 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5390 break;
5391 }
5392
5393 if (end == NULL) {
5394 _PyBytesWriter_Dealloc(&writer);
5395 return NULL;
5396 }
5397 return _PyBytesWriter_Finish(&writer, end);
5398 }
5399
5400 static int
unicode_fill_utf8(PyObject * unicode)5401 unicode_fill_utf8(PyObject *unicode)
5402 {
5403 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5404 assert(!PyUnicode_IS_ASCII(unicode));
5405
5406 int kind = PyUnicode_KIND(unicode);
5407 const void *data = PyUnicode_DATA(unicode);
5408 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5409
5410 _PyBytesWriter writer;
5411 char *end;
5412
5413 switch (kind) {
5414 default:
5415 Py_UNREACHABLE();
5416 case PyUnicode_1BYTE_KIND:
5417 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5418 _Py_ERROR_STRICT, NULL);
5419 break;
5420 case PyUnicode_2BYTE_KIND:
5421 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5422 _Py_ERROR_STRICT, NULL);
5423 break;
5424 case PyUnicode_4BYTE_KIND:
5425 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5426 _Py_ERROR_STRICT, NULL);
5427 break;
5428 }
5429 if (end == NULL) {
5430 _PyBytesWriter_Dealloc(&writer);
5431 return -1;
5432 }
5433
5434 const char *start = writer.use_small_buffer ? writer.small_buffer :
5435 PyBytes_AS_STRING(writer.buffer);
5436 Py_ssize_t len = end - start;
5437
5438 char *cache = PyMem_Malloc(len + 1);
5439 if (cache == NULL) {
5440 _PyBytesWriter_Dealloc(&writer);
5441 PyErr_NoMemory();
5442 return -1;
5443 }
5444 _PyUnicode_UTF8(unicode) = cache;
5445 _PyUnicode_UTF8_LENGTH(unicode) = len;
5446 memcpy(cache, start, len);
5447 cache[len] = '\0';
5448 _PyBytesWriter_Dealloc(&writer);
5449 return 0;
5450 }
5451
5452 PyObject *
_PyUnicode_AsUTF8String(PyObject * unicode,const char * errors)5453 _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5454 {
5455 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5456 }
5457
5458
5459 PyObject *
PyUnicode_AsUTF8String(PyObject * unicode)5460 PyUnicode_AsUTF8String(PyObject *unicode)
5461 {
5462 return _PyUnicode_AsUTF8String(unicode, NULL);
5463 }
5464
5465 /* --- UTF-32 Codec ------------------------------------------------------- */
5466
5467 PyObject *
PyUnicode_DecodeUTF32(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5468 PyUnicode_DecodeUTF32(const char *s,
5469 Py_ssize_t size,
5470 const char *errors,
5471 int *byteorder)
5472 {
5473 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5474 }
5475
/* Decode `size` bytes starting at `s` as UTF-32.

   errors:    name of the error handler, forwarded to
              unicode_decode_call_errorhandler_writer().
   byteorder: optional in/out.  On entry *byteorder selects the byte order
              (negative: little endian, positive: big endian, 0: look for a
              BOM and otherwise use the platform's order).  When it was 0
              and at least 4 bytes are available, the order chosen after the
              BOM check is written back.
   consumed:  optional out.  When non-NULL, a trailing group of fewer than
              4 bytes is not an error; decoding stops there and *consumed
              receives the number of input bytes processed.

   Returns the decoded string, or NULL on error. */
PyObject *
PyUnicode_DecodeUTF32Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const unsigned char *q, *e;     /* current read position / end of input */
    int le, bo = 0;       /* assume native ordering by default */
    const char *encoding;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    q = (const unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0 && size >= 4) {
        /* The first four bytes are assembled as a little-endian value, so
           0x0000FEFF means the BOM was stored little endian and 0xFFFE0000
           means it was stored big endian. */
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
        if (bom == 0x0000FEFF) {
            bo = -1;
            q += 4;
        }
        else if (bom == 0xFFFE0000) {
            bo = 1;
            q += 4;
        }
        if (byteorder)
            *byteorder = bo;
    }

    /* Nothing left after the optional BOM: the result is the empty string. */
    if (q == e) {
        if (consumed)
            *consumed = size;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* With bo == 0 (no BOM found) fall back to the platform's byte order. */
#ifdef WORDS_BIGENDIAN
    le = bo < 0;
#else
    le = bo <= 0;
#endif
    encoding = le ? "utf-32-le" : "utf-32-be";

    /* One output character per 4 input bytes (rounded up). */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = (e - q + 3) / 4;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    while (1) {
        Py_UCS4 ch = 0;
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);

        if (e - q >= 4) {
            /* Fast path: write code points straight into the writer's
               buffer for as long as they fit its current maximum character
               and are not surrogates.  The loop exits with `ch` holding the
               offending value and `q` still pointing at it. */
            int kind = writer.kind;
            void *data = writer.data;
            const unsigned char *last = e - 4;
            Py_ssize_t pos = writer.pos;
            if (le) {
                do {
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
                    if (ch > maxch)
                        break;
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            else {
                do {
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
                    if (ch > maxch)
                        break;
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            writer.pos = pos;
        }

        /* Work out why the fast path stopped (or never ran). */
        if (Py_UNICODE_IS_SURROGATE(ch)) {
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }
        else if (ch <= maxch) {
            /* The last value read was written successfully (or nothing was
               read): fewer than 4 bytes remain.  That is the normal end of
               input, or acceptable leftover in stateful mode. */
            if (q == e || consumed)
                break;
            /* remaining bytes at the end? (size should be divisible by 4) */
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
        }
        else {
            /* Valid code point that is too wide for the current buffer:
               let the writer handle it, then resume the fast path. */
            if (ch < 0x110000) {
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                    goto onError;
                q += 4;
                continue;
            }
            errmsg = "code point not in range(0x110000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }

        /* The remaining input chars are ignored if the callback
           chooses to skip the input */
        /* starts, e and q are passed by address, so the helper is able to
           update them. */
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                encoding, errmsg,
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
                &writer))
            goto onError;
    }

    if (consumed)
        *consumed = (const char *)q-starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
5620
/* Encode the string `str` as UTF-32 and return the result as a bytes
   object, or NULL on error.

   errors:    name of the error handler used for code points the per-kind
              encoders stop at (reported as "surrogates not allowed").
   byteorder: 0 writes a BOM (0xFEFF as a native 32-bit unit) followed by
              data in the platform's byte order; negative/positive values
              select little/big endian output without a BOM.

   A replacement returned by the error handler must be either a bytes
   object whose size is a multiple of 4 or an ASCII-only string; anything
   else raises the encode exception. */
PyObject *
_PyUnicode_EncodeUTF32(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    int kind;
    const void *data;
    Py_ssize_t len;
    PyObject *v;
    uint32_t *out;
    /* True when the requested order equals the host's order (byteorder 0
       counts as native on either kind of host). */
#if PY_LITTLE_ENDIAN
    int native_ordering = byteorder <= 0;
#else
    int native_ordering = byteorder >= 0;
#endif
    const char *encoding;
    Py_ssize_t nsize, pos;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);

    /* Guard the (len + BOM) * 4 byte count against Py_ssize_t overflow. */
    if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
        return PyErr_NoMemory();
    nsize = len + (byteorder == 0);
    v = PyBytes_FromStringAndSize(NULL, nsize * 4);
    if (v == NULL)
        return NULL;

    /* output buffer is 4-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
    out = (uint32_t *)PyBytes_AS_STRING(v);
    if (byteorder == 0)
        *out++ = 0xFEFF;
    if (len == 0)
        goto done;

    /* Encoding name reported to the error handler. */
    if (byteorder == -1)
        encoding = "utf-32-le";
    else if (byteorder == 1)
        encoding = "utf-32-be";
    else
        encoding = "utf-32";

    /* The 1-byte case is encoded in one call with no error handling and
       no resizing. */
    if (kind == PyUnicode_1BYTE_KIND) {
        ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
        goto done;
    }

    pos = 0;
    while (pos < len) {
        Py_ssize_t newpos, repsize, moreunits;

        /* Encode as far as possible; the helper's return value is the
           number of characters it consumed. */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len)
            break;

        /* The helper stopped early at position `pos`: ask the error
           handler what to emit for that single character. */
        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (!rep)
            goto error;

        if (PyBytes_Check(rep)) {
            repsize = PyBytes_GET_SIZE(rep);
            /* A bytes replacement must consist of whole 4-byte units. */
            if (repsize & 3) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
            moreunits = repsize / 4;
        }
        else {
            assert(PyUnicode_Check(rep));
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
            if (!PyUnicode_IS_ASCII(rep)) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
        }
        /* Units needed beyond what was reserved: the replacement's units
           minus the source characters skipped by moving from pos to
           newpos (negative adds units when the handler moved backwards). */
        moreunits += pos - newpos;
        pos = newpos;

        /* four bytes are reserved for each surrogate */
        if (moreunits > 0) {
            /* Remember the write offset: resizing may move the buffer. */
            Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
            if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
                /* integer overflow */
                PyErr_NoMemory();
                goto error;
            }
            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * moreunits) < 0)
                goto error;
            out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
        }

        if (PyBytes_Check(rep)) {
            /* Bytes replacements are copied verbatim. */
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 4;
        } else /* rep is unicode */ {
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    /* Cut back to size actually needed. This is necessary for, for example,
       encoding of a string containing isolated surrogates and the 'ignore'
       handler is used. */
    nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
    /* The resize result is not checked here; v is returned as it is left
       by the call. */
    if (nsize != PyBytes_GET_SIZE(v))
      _PyBytes_Resize(&v, nsize);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
  done:
    /* Also reached directly by the early exits above, before the error
       handler could have been invoked. */
    return v;
  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}
5765
5766 PyObject *
PyUnicode_AsUTF32String(PyObject * unicode)5767 PyUnicode_AsUTF32String(PyObject *unicode)
5768 {
5769 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5770 }
5771
5772 /* --- UTF-16 Codec ------------------------------------------------------- */
5773
5774 PyObject *
PyUnicode_DecodeUTF16(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5775 PyUnicode_DecodeUTF16(const char *s,
5776 Py_ssize_t size,
5777 const char *errors,
5778 int *byteorder)
5779 {
5780 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5781 }
5782
/* Decode `size` bytes starting at `s` as UTF-16.

   errors:    name of the error handler, forwarded to
              unicode_decode_call_errorhandler_writer().
   byteorder: optional in/out.  On entry *byteorder selects the byte order
              (negative: little endian, positive: big endian, 0: look for a
              BOM and otherwise use the platform's order).  When it was 0
              and at least 2 bytes are available, the order chosen after the
              BOM check is written back.
   consumed:  optional out.  When non-NULL, incomplete data at the end of
              the input is not an error; decoding stops before it and
              *consumed receives the number of input bytes processed.

   Returns the decoded string, or NULL on error. */
PyObject *
PyUnicode_DecodeUTF16Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const unsigned char *q, *e;     /* current read position / end of input */
    int bo = 0;       /* assume native ordering by default */
    int native_ordering;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    const char *encoding;

    q = (const unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0 && size >= 2) {
        /* The first two bytes are assembled as a little-endian value, so
           0xFEFF means the BOM was stored little endian and 0xFFFE means
           it was stored big endian. */
        const Py_UCS4 bom = (q[1] << 8) | q[0];
        if (bom == 0xFEFF) {
            q += 2;
            bo = -1;
        }
        else if (bom == 0xFFFE) {
            q += 2;
            bo = 1;
        }
        if (byteorder)
            *byteorder = bo;
    }

    /* Nothing left after the optional BOM: the result is the empty string. */
    if (q == e) {
        if (consumed)
            *consumed = size;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* With bo == 0 (no BOM found) the platform's byte order is used. */
#if PY_LITTLE_ENDIAN
    native_ordering = bo <= 0;
    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
#else
    native_ordering = bo >= 0;
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
#endif

    /* Note: size will always be longer than the resulting Unicode
       character count normally.  Error handler will take care of
       resizing when needed. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = (e - q + 1) / 2;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    while (1) {
        Py_UCS4 ch = 0;
        if (e - q >= 2) {
            /* Run the decode helper that matches the writer's current
               storage kind; it advances q and writer.pos itself.  Its
               return value is interpreted by the switch below. */
            int kind = writer.kind;
            if (kind == PyUnicode_1BYTE_KIND) {
                if (PyUnicode_IS_ASCII(writer.buffer))
                    ch = asciilib_utf16_decode(&q, e,
                            (Py_UCS1*)writer.data, &writer.pos,
                            native_ordering);
                else
                    ch = ucs1lib_utf16_decode(&q, e,
                            (Py_UCS1*)writer.data, &writer.pos,
                            native_ordering);
            } else if (kind == PyUnicode_2BYTE_KIND) {
                ch = ucs2lib_utf16_decode(&q, e,
                        (Py_UCS2*)writer.data, &writer.pos,
                        native_ordering);
            } else {
                assert(kind == PyUnicode_4BYTE_KIND);
                ch = ucs4lib_utf16_decode(&q, e,
                        (Py_UCS4*)writer.data, &writer.pos,
                        native_ordering);
            }
        }

        /* Values 0..3 are handled as status codes; any other value is a
           decoded character that still has to be appended via the writer. */
        switch (ch)
        {
        case 0:
            /* remaining byte at the end? (size should be even) */
            if (q == e || consumed)
                goto End;
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            break;
            /* The remaining input chars are ignored if the callback
               chooses to skip the input */
        case 1:
            /* Step back over the last 2-byte unit so that it is either
               left unconsumed (stateful mode) or covered by the error
               range. */
            q -= 2;
            if (consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            break;
        case 2:
            /* The error range is the 2-byte unit just before q. */
            errmsg = "illegal encoding";
            startinpos = ((const char *)q) - 2 - starts;
            endinpos = startinpos + 2;
            break;
        case 3:
            /* The error range is the first of the two units before q. */
            errmsg = "illegal UTF-16 surrogate";
            startinpos = ((const char *)q) - 4 - starts;
            endinpos = startinpos + 2;
            break;
        default:
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
            continue;
        }

        /* starts, e and q are passed by address, so the helper is able to
           update them. */
        if (unicode_decode_call_errorhandler_writer(
                errors,
                &errorHandler,
                encoding, errmsg,
                &starts,
                (const char **)&e,
                &startinpos,
                &endinpos,
                &exc,
                (const char **)&q,
                &writer))
            goto onError;
    }

End:
    if (consumed)
        *consumed = (const char *)q-starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
5937
/* Encode the string `str` as UTF-16 and return the result as a bytes
   object, or NULL on error.

   errors:    name of the error handler used for code points the per-kind
              encoders stop at (reported as "surrogates not allowed").
   byteorder: 0 writes a BOM (0xFEFF as a native 16-bit unit) followed by
              data in the platform's byte order; negative/positive values
              select little/big endian output without a BOM.

   A replacement returned by the error handler must be either a bytes
   object of even size or an ASCII-only string; anything else raises the
   encode exception. */
PyObject *
_PyUnicode_EncodeUTF16(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    int kind;
    const void *data;
    Py_ssize_t len;
    PyObject *v;
    unsigned short *out;
    Py_ssize_t pairs;
    /* True when the requested order equals the host's order (byteorder 0
       counts as native on either kind of host). */
#if PY_BIG_ENDIAN
    int native_ordering = byteorder >= 0;
#else
    int native_ordering = byteorder <= 0;
#endif
    const char *encoding;
    Py_ssize_t nsize, pos;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);

    /* Count the characters at or above 0x10000: each of them gets one
       extra 16-bit unit in the size estimate below.  Only 4-byte strings
       can contain such characters. */
    pairs = 0;
    if (kind == PyUnicode_4BYTE_KIND) {
        const Py_UCS4 *in = (const Py_UCS4 *)data;
        const Py_UCS4 *end = in + len;
        while (in < end) {
            if (*in++ >= 0x10000) {
                pairs++;
            }
        }
    }
    /* Guard the (len + pairs + BOM) * 2 byte count against overflow. */
    if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
        return PyErr_NoMemory();
    }
    nsize = len + pairs + (byteorder == 0);
    v = PyBytes_FromStringAndSize(NULL, nsize * 2);
    if (v == NULL) {
        return NULL;
    }

    /* output buffer is 2-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
    out = (unsigned short *)PyBytes_AS_STRING(v);
    if (byteorder == 0) {
        *out++ = 0xFEFF;
    }
    if (len == 0) {
        goto done;
    }

    /* The 1-byte case is encoded in one call with no error handling and
       no resizing. */
    if (kind == PyUnicode_1BYTE_KIND) {
        ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
        goto done;
    }

    /* Encoding name reported to the error handler. */
    if (byteorder < 0) {
        encoding = "utf-16-le";
    }
    else if (byteorder > 0) {
        encoding = "utf-16-be";
    }
    else {
        encoding = "utf-16";
    }

    pos = 0;
    while (pos < len) {
        Py_ssize_t newpos, repsize, moreunits;

        /* Encode as far as possible; the helper's return value is the
           number of characters it consumed. */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len)
            break;

        /* The helper stopped early at position `pos`: ask the error
           handler what to emit for that single character. */
        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (!rep)
            goto error;

        if (PyBytes_Check(rep)) {
            repsize = PyBytes_GET_SIZE(rep);
            /* A bytes replacement must consist of whole 2-byte units. */
            if (repsize & 1) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
            moreunits = repsize / 2;
        }
        else {
            assert(PyUnicode_Check(rep));
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
            if (!PyUnicode_IS_ASCII(rep)) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
        }
        /* Units needed beyond what was reserved: the replacement's units
           minus the source characters skipped by moving from pos to
           newpos (negative adds units when the handler moved backwards). */
        moreunits += pos - newpos;
        pos = newpos;

        /* two bytes are reserved for each surrogate */
        if (moreunits > 0) {
            /* Remember the write offset: resizing may move the buffer. */
            Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
            if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
                /* integer overflow */
                PyErr_NoMemory();
                goto error;
            }
            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * moreunits) < 0)
                goto error;
            out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
        }

        if (PyBytes_Check(rep)) {
            /* Bytes replacements are copied verbatim. */
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 2;
        } else /* rep is unicode */ {
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    /* Cut back to size actually needed. This is necessary for, for example,
    encoding of a string containing isolated surrogates and the 'ignore' handler
    is used. */
    nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
    /* The resize result is not checked here; v is returned as it is left
       by the call. */
    if (nsize != PyBytes_GET_SIZE(v))
      _PyBytes_Resize(&v, nsize);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
  done:
    /* Also reached directly by the early exits above, before the error
       handler could have been invoked. */
    return v;
  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
/* NOTE(review): STORECHAR is not defined inside this function; this #undef
   looks like a leftover -- confirm before removing. */
#undef STORECHAR
}
6101
6102 PyObject *
PyUnicode_AsUTF16String(PyObject * unicode)6103 PyUnicode_AsUTF16String(PyObject *unicode)
6104 {
6105 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
6106 }
6107
6108 _PyUnicode_Name_CAPI *
_PyUnicode_GetNameCAPI(void)6109 _PyUnicode_GetNameCAPI(void)
6110 {
6111 PyInterpreterState *interp = _PyInterpreterState_GET();
6112 _PyUnicode_Name_CAPI *ucnhash_capi;
6113
6114 ucnhash_capi = _Py_atomic_load_ptr(&interp->unicode.ucnhash_capi);
6115 if (ucnhash_capi == NULL) {
6116 ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6117 PyUnicodeData_CAPSULE_NAME, 1);
6118
6119 // It's fine if we overwite the value here. It's always the same value.
6120 _Py_atomic_store_ptr(&interp->unicode.ucnhash_capi, ucnhash_capi);
6121 }
6122 return ucnhash_capi;
6123 }
6124
6125 /* --- Unicode Escape Codec ----------------------------------------------- */
6126
/* Decode `size` bytes starting at `s` using the "unicodeescape" codec.

   errors:   name of the error handler, forwarded to
             unicode_decode_call_errorhandler_writer().
   consumed: optional out.  When non-NULL, an escape sequence cut off by the
             end of the input is not an error: decoding stops and *consumed
             is set to the offset of its backslash.  It is also set to 0 for
             empty input.  On every other path this function leaves
             *consumed untouched.
   first_invalid_escape:
             required out.  Set to NULL on entry, then to the position of
             the first unrecognised escape character (or the first digit of
             the first octal escape above 0377) if one is seen.  The pointer
             refers into the input being decoded.

   Returns the decoded string, or NULL on error. */
PyObject *
_PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
                               Py_ssize_t size,
                               const char *errors,
                               Py_ssize_t *consumed,
                               const char **first_invalid_escape)
{
    const char *starts = s;
    _PyUnicodeWriter writer;
    const char *end;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    _PyUnicode_Name_CAPI *ucnhash_capi;

    // so we can remember if we've seen an invalid escape char or not
    *first_invalid_escape = NULL;

    if (size == 0) {
        if (consumed) {
            *consumed = 0;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }
    /* Escaped strings will always be longer than the resulting
       Unicode string, so we start with size here and then reduce the
       length after conversion to the true value.
       (but if the error callback returns a long replacement string
       we'll have to allocate more space) */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
        goto onError;
    }

    end = s + size;
    while (s < end) {
        unsigned char c = (unsigned char) *s++;
        Py_UCS4 ch;
        int count;
        const char *message;

/* Store an ASCII character directly; relies on the writer having been
   prepared with room for it. */
#define WRITE_ASCII_CHAR(ch)                                                  \
            do {                                                              \
                assert(ch <= 127);                                            \
                assert(writer.pos < writer.size);                             \
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
            } while(0)

/* Store any character: directly when it fits the writer's current maximum
   character, otherwise through the writer.  Jumps to onError on failure. */
#define WRITE_CHAR(ch)                                                        \
            do {                                                              \
                if (ch <= writer.maxchar) {                                   \
                    assert(writer.pos < writer.size);                         \
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
                }                                                             \
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
                    goto onError;                                             \
                }                                                             \
            } while(0)

        /* Non-escape characters are interpreted as Unicode ordinals */
        if (c != '\\') {
            WRITE_CHAR(c);
            continue;
        }

        /* Offset of the backslash that starts this escape. */
        Py_ssize_t startinpos = s - starts - 1;
        /* \ - Escapes */
        if (s >= end) {
            message = "\\ at end of string";
            goto incomplete;
        }
        c = (unsigned char) *s++;

        assert(writer.pos < writer.size);
        switch (c) {

            /* \x escapes */
        /* backslash followed by a newline produces no output */
        case '\n': continue;
        case '\\': WRITE_ASCII_CHAR('\\'); continue;
        case '\'': WRITE_ASCII_CHAR('\''); continue;
        case '\"': WRITE_ASCII_CHAR('\"'); continue;
        case 'b': WRITE_ASCII_CHAR('\b'); continue;
        /* FF */
        case 'f': WRITE_ASCII_CHAR('\014'); continue;
        case 't': WRITE_ASCII_CHAR('\t'); continue;
        case 'n': WRITE_ASCII_CHAR('\n'); continue;
        case 'r': WRITE_ASCII_CHAR('\r'); continue;
        /* VT */
        case 'v': WRITE_ASCII_CHAR('\013'); continue;
        /* BEL, not classic C */
        case 'a': WRITE_ASCII_CHAR('\007'); continue;

            /* \OOO (octal) escapes */
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
            /* One to three octal digits. */
            ch = c - '0';
            if (s < end && '0' <= *s && *s <= '7') {
                ch = (ch<<3) + *s++ - '0';
                if (s < end && '0' <= *s && *s <= '7') {
                    ch = (ch<<3) + *s++ - '0';
                }
            }
            /* Values above 0377 are still decoded, but recorded as the
               first invalid escape if none was recorded yet. */
            if (ch > 0377) {
                if (*first_invalid_escape == NULL) {
                    *first_invalid_escape = s-3; /* Back up 3 chars, since we've
                                                    already incremented s. */
                }
            }
            WRITE_CHAR(ch);
            continue;

            /* hex escapes */
            /* \xXX */
        case 'x':
            count = 2;
            message = "truncated \\xXX escape";
            goto hexescape;

            /* \uXXXX */
        case 'u':
            count = 4;
            message = "truncated \\uXXXX escape";
            goto hexescape;

            /* \UXXXXXXXX */
        case 'U':
            count = 8;
            message = "truncated \\UXXXXXXXX escape";
        hexescape:
            /* Read exactly `count` hex digits.  Running out of input goes
               to `incomplete`; a non-hex character goes to `error` with the
               "truncated" message chosen above. */
            for (ch = 0; count; ++s, --count) {
                if (s >= end) {
                    goto incomplete;
                }
                c = (unsigned char)*s;
                ch <<= 4;
                if (c >= '0' && c <= '9') {
                    ch += c - '0';
                }
                else if (c >= 'a' && c <= 'f') {
                    ch += c - ('a' - 10);
                }
                else if (c >= 'A' && c <= 'F') {
                    ch += c - ('A' - 10);
                }
                else {
                    goto error;
                }
            }

            /* when we get here, ch is a 32-bit unicode character */
            if (ch > MAX_UNICODE) {
                message = "illegal Unicode character";
                goto error;
            }

            WRITE_CHAR(ch);
            continue;

            /* \N{name} */
        case 'N':
            ucnhash_capi = _PyUnicode_GetNameCAPI();
            if (ucnhash_capi == NULL) {
                PyErr_SetString(
                        PyExc_UnicodeError,
                        "\\N escapes not supported (can't load unicodedata module)"
                );
                goto onError;
            }

            message = "malformed \\N character escape";
            if (s >= end) {
                goto incomplete;
            }
            if (*s == '{') {
                const char *start = ++s;
                size_t namelen;
                /* look for the closing brace */
                while (s < end && *s != '}')
                    s++;
                if (s >= end) {
                    goto incomplete;
                }
                namelen = s - start;
                /* An empty name ("\N{}") falls through to the error below
                   with the "malformed" message. */
                if (namelen) {
                    /* found a name.  look it up in the unicode database */
                    s++;
                    ch = 0xffffffff; /* in case 'getcode' messes up */
                    /* getcode() takes an int length, so longer names are
                       treated as unknown. */
                    if (namelen <= INT_MAX &&
                        ucnhash_capi->getcode(start, (int)namelen,
                                              &ch, 0)) {
                        assert(ch <= MAX_UNICODE);
                        WRITE_CHAR(ch);
                        continue;
                    }
                    message = "unknown Unicode character name";
                }
            }
            goto error;

        default:
            /* Unrecognised escape: record it, then emit the backslash and
               the character unchanged. */
            if (*first_invalid_escape == NULL) {
                *first_invalid_escape = s-1; /* Back up one char, since we've
                                                already incremented s. */
            }
            WRITE_ASCII_CHAR('\\');
            WRITE_CHAR(c);
            continue;
        }

      incomplete:
        /* Input ended inside an escape.  In stateful mode report where the
           escape started and stop; otherwise fall through and treat it as
           an error. */
        if (consumed) {
            *consumed = startinpos;
            break;
        }
      error:;
        Py_ssize_t endinpos = s-starts;
        /* Ask for enough room for what is already written plus the
           remaining input. */
        writer.min_length = end - s + writer.pos;
        /* starts, end and s are passed by address, so the helper is able
           to update them. */
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "unicodeescape", message,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer)) {
            goto onError;
        }
        assert(end - s <= writer.size - writer.pos);

#undef WRITE_ASCII_CHAR
#undef WRITE_CHAR
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
6367
6368 PyObject *
_PyUnicode_DecodeUnicodeEscapeStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)6369 _PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
6370 Py_ssize_t size,
6371 const char *errors,
6372 Py_ssize_t *consumed)
6373 {
6374 const char *first_invalid_escape;
6375 PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors,
6376 consumed,
6377 &first_invalid_escape);
6378 if (result == NULL)
6379 return NULL;
6380 if (first_invalid_escape != NULL) {
6381 unsigned char c = *first_invalid_escape;
6382 if ('4' <= c && c <= '7') {
6383 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6384 "invalid octal escape sequence '\\%.3s'",
6385 first_invalid_escape) < 0)
6386 {
6387 Py_DECREF(result);
6388 return NULL;
6389 }
6390 }
6391 else {
6392 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6393 "invalid escape sequence '\\%c'",
6394 c) < 0)
6395 {
6396 Py_DECREF(result);
6397 return NULL;
6398 }
6399 }
6400 }
6401 return result;
6402 }
6403
6404 PyObject *
PyUnicode_DecodeUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6405 PyUnicode_DecodeUnicodeEscape(const char *s,
6406 Py_ssize_t size,
6407 const char *errors)
6408 {
6409 return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
6410 }
6411
6412 /* Return a Unicode-Escape string version of the Unicode object. */
6413
6414 PyObject *
PyUnicode_AsUnicodeEscapeString(PyObject * unicode)6415 PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6416 {
6417 Py_ssize_t i, len;
6418 PyObject *repr;
6419 char *p;
6420 int kind;
6421 const void *data;
6422 Py_ssize_t expandsize;
6423
6424 /* Initial allocation is based on the longest-possible character
6425 escape.
6426
6427 For UCS1 strings it's '\xxx', 4 bytes per source character.
6428 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6429 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
6430 */
6431
6432 if (!PyUnicode_Check(unicode)) {
6433 PyErr_BadArgument();
6434 return NULL;
6435 }
6436
6437 len = PyUnicode_GET_LENGTH(unicode);
6438 if (len == 0) {
6439 return PyBytes_FromStringAndSize(NULL, 0);
6440 }
6441
6442 kind = PyUnicode_KIND(unicode);
6443 data = PyUnicode_DATA(unicode);
6444 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6445 bytes, and 1 byte characters 4. */
6446 expandsize = kind * 2 + 2;
6447 if (len > PY_SSIZE_T_MAX / expandsize) {
6448 return PyErr_NoMemory();
6449 }
6450 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6451 if (repr == NULL) {
6452 return NULL;
6453 }
6454
6455 p = PyBytes_AS_STRING(repr);
6456 for (i = 0; i < len; i++) {
6457 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6458
6459 /* U+0000-U+00ff range */
6460 if (ch < 0x100) {
6461 if (ch >= ' ' && ch < 127) {
6462 if (ch != '\\') {
6463 /* Copy printable US ASCII as-is */
6464 *p++ = (char) ch;
6465 }
6466 /* Escape backslashes */
6467 else {
6468 *p++ = '\\';
6469 *p++ = '\\';
6470 }
6471 }
6472
6473 /* Map special whitespace to '\t', \n', '\r' */
6474 else if (ch == '\t') {
6475 *p++ = '\\';
6476 *p++ = 't';
6477 }
6478 else if (ch == '\n') {
6479 *p++ = '\\';
6480 *p++ = 'n';
6481 }
6482 else if (ch == '\r') {
6483 *p++ = '\\';
6484 *p++ = 'r';
6485 }
6486
6487 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6488 else {
6489 *p++ = '\\';
6490 *p++ = 'x';
6491 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6492 *p++ = Py_hexdigits[ch & 0x000F];
6493 }
6494 }
6495 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6496 else if (ch < 0x10000) {
6497 *p++ = '\\';
6498 *p++ = 'u';
6499 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6500 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6501 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6502 *p++ = Py_hexdigits[ch & 0x000F];
6503 }
6504 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6505 else {
6506
6507 /* Make sure that the first two digits are zero */
6508 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6509 *p++ = '\\';
6510 *p++ = 'U';
6511 *p++ = '0';
6512 *p++ = '0';
6513 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6514 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6515 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6516 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6517 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6518 *p++ = Py_hexdigits[ch & 0x0000000F];
6519 }
6520 }
6521
6522 assert(p - PyBytes_AS_STRING(repr) > 0);
6523 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6524 return NULL;
6525 }
6526 return repr;
6527 }
6528
6529 /* --- Raw Unicode Escape Codec ------------------------------------------- */
6530
/* Decode `size` bytes starting at `s` using the "rawunicodeescape" codec:
   only \uXXXX and \UXXXXXXXX are interpreted; a backslash followed by any
   other character is copied through together with that character.

   errors:   name of the error handler, forwarded to
             unicode_decode_call_errorhandler_writer().
   consumed: optional out.  When non-NULL, a trailing lone backslash or an
             escape cut off by the end of the input is not decoded: decoding
             stops and *consumed is set to the offset of its backslash.  It
             is also set to 0 for empty input.  On every other path this
             function leaves *consumed untouched.

   Returns the decoded string, or NULL on error. */
PyObject *
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
                                          Py_ssize_t size,
                                          const char *errors,
                                          Py_ssize_t *consumed)
{
    const char *starts = s;
    _PyUnicodeWriter writer;
    const char *end;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed) {
            *consumed = 0;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Escaped strings will always be longer than the resulting
       Unicode string, so we start with size here and then reduce the
       length after conversion to the true value. (But decoding error
       handler might have to resize the string) */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
        goto onError;
    }

    end = s + size;
    while (s < end) {
        unsigned char c = (unsigned char) *s++;
        Py_UCS4 ch;
        int count;
        const char *message;

/* Store a character: directly when it fits the writer's current maximum
   character, otherwise through the writer.  Jumps to onError on failure. */
#define WRITE_CHAR(ch)                                                        \
            do {                                                              \
                if (ch <= writer.maxchar) {                                   \
                    assert(writer.pos < writer.size);                         \
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
                }                                                             \
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
                    goto onError;                                             \
                }                                                             \
            } while(0)

        /* Non-escape characters are interpreted as Unicode ordinals */
        /* A backslash that is the very last byte is also copied literally
           when not decoding statefully. */
        if (c != '\\' || (s >= end && !consumed)) {
            WRITE_CHAR(c);
            continue;
        }

        /* Offset of the backslash that starts this escape. */
        Py_ssize_t startinpos = s - starts - 1;
        /* \ - Escapes */
        if (s >= end) {
            assert(consumed);
            // Set message to silent compiler warning.
            // Actually it is never used.
            message = "\\ at end of string";
            goto incomplete;
        }

        c = (unsigned char) *s++;
        if (c == 'u') {
            count = 4;
            message = "truncated \\uXXXX escape";
        }
        else if (c == 'U') {
            count = 8;
            message = "truncated \\UXXXXXXXX escape";
        }
        else {
            /* Not an escape this codec interprets: emit the backslash and
               the character unchanged. */
            assert(writer.pos < writer.size);
            PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
            WRITE_CHAR(c);
            continue;
        }

        /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
        /* Running out of input goes to `incomplete`; a non-hex character
           goes to `error` with the "truncated" message chosen above. */
        for (ch = 0; count; ++s, --count) {
            if (s >= end) {
                goto incomplete;
            }
            c = (unsigned char)*s;
            ch <<= 4;
            if (c >= '0' && c <= '9') {
                ch += c - '0';
            }
            else if (c >= 'a' && c <= 'f') {
                ch += c - ('a' - 10);
            }
            else if (c >= 'A' && c <= 'F') {
                ch += c - ('A' - 10);
            }
            else {
                goto error;
            }
        }
        if (ch > MAX_UNICODE) {
            message = "\\Uxxxxxxxx out of range";
            goto error;
        }
        WRITE_CHAR(ch);
        continue;

      incomplete:
        /* Input ended inside an escape.  In stateful mode report where the
           escape started and stop; otherwise fall through and treat it as
           an error. */
        if (consumed) {
            *consumed = startinpos;
            break;
        }
      error:;
        Py_ssize_t endinpos = s-starts;
        /* Ask for enough room for what is already written plus the
           remaining input. */
        writer.min_length = end - s + writer.pos;
        /* starts, end and s are passed by address, so the helper is able
           to update them. */
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "rawunicodeescape", message,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer)) {
            goto onError;
        }
        assert(end - s <= writer.size - writer.pos);

#undef WRITE_CHAR
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
6666
6667 PyObject *
PyUnicode_DecodeRawUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6668 PyUnicode_DecodeRawUnicodeEscape(const char *s,
6669 Py_ssize_t size,
6670 const char *errors)
6671 {
6672 return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
6673 }
6674
6675
6676 PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject * unicode)6677 PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6678 {
6679 PyObject *repr;
6680 char *p;
6681 Py_ssize_t expandsize, pos;
6682 int kind;
6683 const void *data;
6684 Py_ssize_t len;
6685
6686 if (!PyUnicode_Check(unicode)) {
6687 PyErr_BadArgument();
6688 return NULL;
6689 }
6690 kind = PyUnicode_KIND(unicode);
6691 data = PyUnicode_DATA(unicode);
6692 len = PyUnicode_GET_LENGTH(unicode);
6693 if (kind == PyUnicode_1BYTE_KIND) {
6694 return PyBytes_FromStringAndSize(data, len);
6695 }
6696
6697 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6698 bytes, and 1 byte characters 4. */
6699 expandsize = kind * 2 + 2;
6700
6701 if (len > PY_SSIZE_T_MAX / expandsize) {
6702 return PyErr_NoMemory();
6703 }
6704 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6705 if (repr == NULL) {
6706 return NULL;
6707 }
6708 if (len == 0) {
6709 return repr;
6710 }
6711
6712 p = PyBytes_AS_STRING(repr);
6713 for (pos = 0; pos < len; pos++) {
6714 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6715
6716 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6717 if (ch < 0x100) {
6718 *p++ = (char) ch;
6719 }
6720 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6721 else if (ch < 0x10000) {
6722 *p++ = '\\';
6723 *p++ = 'u';
6724 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6725 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6726 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6727 *p++ = Py_hexdigits[ch & 15];
6728 }
6729 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6730 else {
6731 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6732 *p++ = '\\';
6733 *p++ = 'U';
6734 *p++ = '0';
6735 *p++ = '0';
6736 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6737 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6738 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6739 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6740 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6741 *p++ = Py_hexdigits[ch & 15];
6742 }
6743 }
6744
6745 assert(p > PyBytes_AS_STRING(repr));
6746 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6747 return NULL;
6748 }
6749 return repr;
6750 }
6751
6752 /* --- Latin-1 Codec ------------------------------------------------------ */
6753
6754 PyObject *
PyUnicode_DecodeLatin1(const char * s,Py_ssize_t size,const char * errors)6755 PyUnicode_DecodeLatin1(const char *s,
6756 Py_ssize_t size,
6757 const char *errors)
6758 {
6759 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6760 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
6761 }
6762
6763 /* create or adjust a UnicodeEncodeError */
6764 static void
make_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6765 make_encode_exception(PyObject **exceptionObject,
6766 const char *encoding,
6767 PyObject *unicode,
6768 Py_ssize_t startpos, Py_ssize_t endpos,
6769 const char *reason)
6770 {
6771 if (*exceptionObject == NULL) {
6772 *exceptionObject = PyObject_CallFunction(
6773 PyExc_UnicodeEncodeError, "sOnns",
6774 encoding, unicode, startpos, endpos, reason);
6775 }
6776 else {
6777 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6778 goto onError;
6779 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6780 goto onError;
6781 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6782 goto onError;
6783 return;
6784 onError:
6785 Py_CLEAR(*exceptionObject);
6786 }
6787 }
6788
6789 /* raises a UnicodeEncodeError */
6790 static void
raise_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6791 raise_encode_exception(PyObject **exceptionObject,
6792 const char *encoding,
6793 PyObject *unicode,
6794 Py_ssize_t startpos, Py_ssize_t endpos,
6795 const char *reason)
6796 {
6797 make_encode_exception(exceptionObject,
6798 encoding, unicode, startpos, endpos, reason);
6799 if (*exceptionObject != NULL)
6800 PyCodec_StrictErrors(*exceptionObject);
6801 }
6802
6803 /* error handling callback helper:
6804 build arguments, call the callback and check the arguments,
6805 put the result into newpos and return the replacement string, which
6806 has to be freed by the caller */
6807 static PyObject *
unicode_encode_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)6808 unicode_encode_call_errorhandler(const char *errors,
6809 PyObject **errorHandler,
6810 const char *encoding, const char *reason,
6811 PyObject *unicode, PyObject **exceptionObject,
6812 Py_ssize_t startpos, Py_ssize_t endpos,
6813 Py_ssize_t *newpos)
6814 {
6815 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6816 Py_ssize_t len;
6817 PyObject *restuple;
6818 PyObject *resunicode;
6819
6820 if (*errorHandler == NULL) {
6821 *errorHandler = PyCodec_LookupError(errors);
6822 if (*errorHandler == NULL)
6823 return NULL;
6824 }
6825
6826 len = PyUnicode_GET_LENGTH(unicode);
6827
6828 make_encode_exception(exceptionObject,
6829 encoding, unicode, startpos, endpos, reason);
6830 if (*exceptionObject == NULL)
6831 return NULL;
6832
6833 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
6834 if (restuple == NULL)
6835 return NULL;
6836 if (!PyTuple_Check(restuple)) {
6837 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6838 Py_DECREF(restuple);
6839 return NULL;
6840 }
6841 if (!PyArg_ParseTuple(restuple, argparse,
6842 &resunicode, newpos)) {
6843 Py_DECREF(restuple);
6844 return NULL;
6845 }
6846 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6847 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6848 Py_DECREF(restuple);
6849 return NULL;
6850 }
6851 if (*newpos<0)
6852 *newpos = len + *newpos;
6853 if (*newpos<0 || *newpos>len) {
6854 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6855 Py_DECREF(restuple);
6856 return NULL;
6857 }
6858 Py_INCREF(resunicode);
6859 Py_DECREF(restuple);
6860 return resunicode;
6861 }
6862
/* Encode `unicode` to bytes, writing every code point below `limit` as the
   byte of the same value.

   `limit` selects the codec reported in errors: 256 means "latin-1", any
   other value is reported as "ascii" (range(128)).

   Consecutive code points >= limit are collected into one run and handled
   according to `errors`.  The handlers with a dedicated _Py_ERROR_* code in
   the switch below are processed inline; everything else goes through
   unicode_encode_call_errorhandler().

   Returns a new bytes object, or NULL on error. */
static PyObject *
unicode_encode_ucs1(PyObject *unicode,
                    const char *errors,
                    const Py_UCS4 limit)
{
    /* input state */
    Py_ssize_t pos=0, size;
    int kind;
    const void *data;
    /* pointer into the output */
    char *str;
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
    PyObject *rep = NULL;
    /* output object */
    _PyBytesWriter writer;

    size = PyUnicode_GET_LENGTH(unicode);
    kind = PyUnicode_KIND(unicode);
    data = PyUnicode_DATA(unicode);
    /* allocate enough for a simple encoding without
       replacements, if we need more, we'll resize */
    if (size == 0)
        return PyBytes_FromStringAndSize(NULL, 0);

    _PyBytesWriter_Init(&writer);
    str = _PyBytesWriter_Alloc(&writer, size);
    if (str == NULL)
        return NULL;

    while (pos < size) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        /* can we encode this? */
        if (ch < limit) {
            /* no overflow check, because we know that the space is enough */
            *str++ = (char)ch;
            ++pos;
        }
        else {
            Py_ssize_t newpos, i;
            /* startpos for collecting unencodable chars */
            Py_ssize_t collstart = pos;
            Py_ssize_t collend = collstart + 1;
            /* find all unencodable characters */

            while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
                ++collend;

            /* Only overallocate the buffer if it's not the last write */
            writer.overallocate = (collend < size);

            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
            if (error_handler == _Py_ERROR_UNKNOWN)
                error_handler = _Py_GetErrorHandler(errors);

            switch (error_handler) {
            case _Py_ERROR_STRICT:
                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
                goto onError;

            case _Py_ERROR_REPLACE:
                /* one '?' per unencodable character: this fits into the
                   bytes that were preallocated for the run */
                memset(str, '?', collend - collstart);
                str += (collend - collstart);
                /* fall through */
            case _Py_ERROR_IGNORE:
                pos = collend;
                break;

            case _Py_ERROR_BACKSLASHREPLACE:
                /* subtract preallocated bytes */
                writer.min_size -= (collend - collstart);
                str = backslashreplace(&writer, str,
                                       unicode, collstart, collend);
                if (str == NULL)
                    goto onError;
                pos = collend;
                break;

            case _Py_ERROR_XMLCHARREFREPLACE:
                /* subtract preallocated bytes */
                writer.min_size -= (collend - collstart);
                str = xmlcharrefreplace(&writer, str,
                                        unicode, collstart, collend);
                if (str == NULL)
                    goto onError;
                pos = collend;
                break;

            case _Py_ERROR_SURROGATEESCAPE:
                /* U+DC80..U+DCFF are written back as the bytes 0x80..0xFF;
                   stop at the first character outside that range */
                for (i = collstart; i < collend; ++i) {
                    ch = PyUnicode_READ(kind, data, i);
                    if (ch < 0xdc80 || 0xdcff < ch) {
                        /* Not a UTF-8b surrogate */
                        break;
                    }
                    *str++ = (char)(ch - 0xdc00);
                    ++pos;
                }
                if (i >= collend)
                    break;
                /* the rest of the run, starting at the first character that
                   could not be escaped, is left to the generic handler */
                collstart = pos;
                assert(collstart != collend);
                /* fall through */

            default:
                rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
                                                       encoding, reason, unicode, &exc,
                                                       collstart, collend, &newpos);
                if (rep == NULL)
                    goto onError;

                if (newpos < collstart) {
                    /* the handler moved the position backwards: characters
                       in [newpos, collstart) will be encoded again, so make
                       room for them */
                    writer.overallocate = 1;
                    str = _PyBytesWriter_Prepare(&writer, str,
                                                 collstart - newpos);
                    if (str == NULL)
                        goto onError;
                }
                else {
                    /* subtract preallocated bytes */
                    writer.min_size -= newpos - collstart;
                    /* Only overallocate the buffer if it's not the last write */
                    writer.overallocate = (newpos < size);
                }

                if (PyBytes_Check(rep)) {
                    /* Directly copy bytes result to output. */
                    str = _PyBytesWriter_WriteBytes(&writer, str,
                                                    PyBytes_AS_STRING(rep),
                                                    PyBytes_GET_SIZE(rep));
                }
                else {
                    assert(PyUnicode_Check(rep));

                    /* a str replacement must itself be encodable: 1-byte
                       kind for latin-1, pure ASCII otherwise */
                    if (limit == 256 ?
                        PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
                        !PyUnicode_IS_ASCII(rep))
                    {
                        /* Not all characters are smaller than limit */
                        raise_encode_exception(&exc, encoding, unicode,
                                               collstart, collend, reason);
                        goto onError;
                    }
                    assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
                    str = _PyBytesWriter_WriteBytes(&writer, str,
                                                    PyUnicode_DATA(rep),
                                                    PyUnicode_GET_LENGTH(rep));
                }
                if (str == NULL)
                    goto onError;

                pos = newpos;
                Py_CLEAR(rep);
            }

            /* If overallocation was disabled, ensure that it was the last
               write. Otherwise, we missed an optimization */
            assert(writer.overallocate || pos == size);
        }
    }

    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return _PyBytesWriter_Finish(&writer, str);

  onError:
    Py_XDECREF(rep);
    _PyBytesWriter_Dealloc(&writer);
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return NULL;
}
7039
7040 PyObject *
_PyUnicode_AsLatin1String(PyObject * unicode,const char * errors)7041 _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
7042 {
7043 if (!PyUnicode_Check(unicode)) {
7044 PyErr_BadArgument();
7045 return NULL;
7046 }
7047 /* Fast path: if it is a one-byte string, construct
7048 bytes object directly. */
7049 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7050 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7051 PyUnicode_GET_LENGTH(unicode));
7052 /* Non-Latin-1 characters present. Defer to above function to
7053 raise the exception. */
7054 return unicode_encode_ucs1(unicode, errors, 256);
7055 }
7056
7057 PyObject*
PyUnicode_AsLatin1String(PyObject * unicode)7058 PyUnicode_AsLatin1String(PyObject *unicode)
7059 {
7060 return _PyUnicode_AsLatin1String(unicode, NULL);
7061 }
7062
7063 /* --- 7-bit ASCII Codec -------------------------------------------------- */
7064
7065 PyObject *
PyUnicode_DecodeASCII(const char * s,Py_ssize_t size,const char * errors)7066 PyUnicode_DecodeASCII(const char *s,
7067 Py_ssize_t size,
7068 const char *errors)
7069 {
7070 const char *starts = s;
7071 const char *e = s + size;
7072 PyObject *error_handler_obj = NULL;
7073 PyObject *exc = NULL;
7074 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7075
7076 if (size == 0)
7077 _Py_RETURN_UNICODE_EMPTY();
7078
7079 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
7080 if (size == 1 && (unsigned char)s[0] < 128) {
7081 return get_latin1_char((unsigned char)s[0]);
7082 }
7083
7084 // Shortcut for simple case
7085 PyObject *u = PyUnicode_New(size, 127);
7086 if (u == NULL) {
7087 return NULL;
7088 }
7089 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
7090 if (outpos == size) {
7091 return u;
7092 }
7093
7094 _PyUnicodeWriter writer;
7095 _PyUnicodeWriter_InitWithBuffer(&writer, u);
7096 writer.pos = outpos;
7097
7098 s += outpos;
7099 int kind = writer.kind;
7100 void *data = writer.data;
7101 Py_ssize_t startinpos, endinpos;
7102
7103 while (s < e) {
7104 unsigned char c = (unsigned char)*s;
7105 if (c < 128) {
7106 PyUnicode_WRITE(kind, data, writer.pos, c);
7107 writer.pos++;
7108 ++s;
7109 continue;
7110 }
7111
7112 /* byte outsize range 0x00..0x7f: call the error handler */
7113
7114 if (error_handler == _Py_ERROR_UNKNOWN)
7115 error_handler = _Py_GetErrorHandler(errors);
7116
7117 switch (error_handler)
7118 {
7119 case _Py_ERROR_REPLACE:
7120 case _Py_ERROR_SURROGATEESCAPE:
7121 /* Fast-path: the error handler only writes one character,
7122 but we may switch to UCS2 at the first write */
7123 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7124 goto onError;
7125 kind = writer.kind;
7126 data = writer.data;
7127
7128 if (error_handler == _Py_ERROR_REPLACE)
7129 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7130 else
7131 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7132 writer.pos++;
7133 ++s;
7134 break;
7135
7136 case _Py_ERROR_IGNORE:
7137 ++s;
7138 break;
7139
7140 default:
7141 startinpos = s-starts;
7142 endinpos = startinpos + 1;
7143 if (unicode_decode_call_errorhandler_writer(
7144 errors, &error_handler_obj,
7145 "ascii", "ordinal not in range(128)",
7146 &starts, &e, &startinpos, &endinpos, &exc, &s,
7147 &writer))
7148 goto onError;
7149 kind = writer.kind;
7150 data = writer.data;
7151 }
7152 }
7153 Py_XDECREF(error_handler_obj);
7154 Py_XDECREF(exc);
7155 return _PyUnicodeWriter_Finish(&writer);
7156
7157 onError:
7158 _PyUnicodeWriter_Dealloc(&writer);
7159 Py_XDECREF(error_handler_obj);
7160 Py_XDECREF(exc);
7161 return NULL;
7162 }
7163
7164 PyObject *
_PyUnicode_AsASCIIString(PyObject * unicode,const char * errors)7165 _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
7166 {
7167 if (!PyUnicode_Check(unicode)) {
7168 PyErr_BadArgument();
7169 return NULL;
7170 }
7171 /* Fast path: if it is an ASCII-only string, construct bytes object
7172 directly. Else defer to above function to raise the exception. */
7173 if (PyUnicode_IS_ASCII(unicode))
7174 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7175 PyUnicode_GET_LENGTH(unicode));
7176 return unicode_encode_ucs1(unicode, errors, 128);
7177 }
7178
7179 PyObject *
PyUnicode_AsASCIIString(PyObject * unicode)7180 PyUnicode_AsASCIIString(PyObject *unicode)
7181 {
7182 return _PyUnicode_AsASCIIString(unicode, NULL);
7183 }
7184
7185 #ifdef MS_WINDOWS
7186
7187 /* --- MBCS codecs for Windows -------------------------------------------- */
7188
/* When int is narrower than size_t, an input can be longer than the int
   length taken by the conversion functions below; NEED_RETRY enables
   processing such input in chunks of DECODING_CHUNK_SIZE. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif

/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
#define DECODING_CHUNK_SIZE (INT_MAX/4)

/* Fallback definition for headers that do not provide this flag. */
#ifndef WC_ERR_INVALID_CHARS
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7202
7203 static const char*
code_page_name(UINT code_page,PyObject ** obj)7204 code_page_name(UINT code_page, PyObject **obj)
7205 {
7206 *obj = NULL;
7207 if (code_page == CP_ACP)
7208 return "mbcs";
7209 if (code_page == CP_UTF7)
7210 return "CP_UTF7";
7211 if (code_page == CP_UTF8)
7212 return "CP_UTF8";
7213
7214 *obj = PyBytes_FromFormat("cp%u", code_page);
7215 if (*obj == NULL)
7216 return NULL;
7217 return PyBytes_AS_STRING(*obj);
7218 }
7219
7220 static DWORD
decode_code_page_flags(UINT code_page)7221 decode_code_page_flags(UINT code_page)
7222 {
7223 if (code_page == CP_UTF7) {
7224 /* The CP_UTF7 decoder only supports flags=0 */
7225 return 0;
7226 }
7227 else
7228 return MB_ERR_INVALID_CHARS;
7229 }
7230
7231 /*
7232 * Decode a byte string from a Windows code page into unicode object in strict
7233 * mode.
7234 *
7235 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7236 * OSError and returns -1 on other error.
7237 */
7238 static int
decode_code_page_strict(UINT code_page,wchar_t ** buf,Py_ssize_t * bufsize,const char * in,int insize)7239 decode_code_page_strict(UINT code_page,
7240 wchar_t **buf,
7241 Py_ssize_t *bufsize,
7242 const char *in,
7243 int insize)
7244 {
7245 DWORD flags = MB_ERR_INVALID_CHARS;
7246 wchar_t *out;
7247 DWORD outsize;
7248
7249 /* First get the size of the result */
7250 assert(insize > 0);
7251 while ((outsize = MultiByteToWideChar(code_page, flags,
7252 in, insize, NULL, 0)) <= 0)
7253 {
7254 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7255 goto error;
7256 }
7257 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7258 flags = 0;
7259 }
7260
7261 /* Extend a wchar_t* buffer */
7262 Py_ssize_t n = *bufsize; /* Get the current length */
7263 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7264 return -1;
7265 }
7266 out = *buf + n;
7267
7268 /* Do the conversion */
7269 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7270 if (outsize <= 0)
7271 goto error;
7272 return insize;
7273
7274 error:
7275 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7276 return -2;
7277 PyErr_SetFromWindowsErr(0);
7278 return -1;
7279 }
7280
/*
 * Decode a byte string from a code page into unicode object with an error
 * handler.
 *
 * The input is decoded one character at a time and appended to the wchar_t
 * buffer *buf (current length *bufsize, in wchar_t units); on success
 * *bufsize is set to the new length.
 *
 * If `errors` is NULL or "strict" and `final` is true, nothing is decoded:
 * a UnicodeDecodeError is raised right away.  If `final` is false, an
 * undecodable sequence that reaches the end of the input is left unconsumed
 * instead of being reported.
 *
 * Returns consumed size if succeed, or raise an OSError or
 * UnicodeDecodeError exception and returns -1 on error.
 */
static int
decode_code_page_errors(UINT code_page,
                        wchar_t **buf,
                        Py_ssize_t *bufsize,
                        const char *in, const int size,
                        const char *errors, int final)
{
    const char *startin = in;
    const char *endin = in + size;
    DWORD flags = MB_ERR_INVALID_CHARS;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "No mapping for the Unicode character exists "
                         "in the target code page.";
    /* each step cannot decode more than 1 character, but a character can be
       represented as a surrogate pair */
    wchar_t buffer[2], *out;
    int insize;
    Py_ssize_t outsize;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    const char *encoding;
    DWORD err;
    int ret = -1;

    assert(size > 0);

    /* encoding may point into encoding_obj, which is released at the
       "error" label */
    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
           UnicodeDecodeError. */
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_CLEAR(exc);
        }
        goto error;
    }

    /* Extend a wchar_t* buffer */
    Py_ssize_t n = *bufsize;   /* Get the current length */
    /* worst case: every input byte yields Py_ARRAY_LENGTH(buffer) wchar_t */
    if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
        PyErr_NoMemory();
        goto error;
    }
    if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
        goto error;
    }
    out = *buf + n;

    /* Decode the byte string character per character */
    while (in < endin)
    {
        /* Decode a character */
        /* try 1 byte first, then longer prefixes until one of them decodes */
        insize = 1;
        do
        {
            outsize = MultiByteToWideChar(code_page, flags,
                                          in, insize,
                                          buffer, Py_ARRAY_LENGTH(buffer));
            if (outsize > 0)
                break;
            err = GetLastError();
            if (err == ERROR_INVALID_FLAGS && flags) {
                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
                flags = 0;
                continue;
            }
            if (err != ERROR_NO_UNICODE_TRANSLATION
                && err != ERROR_INSUFFICIENT_BUFFER)
            {
                PyErr_SetFromWindowsErr(err);
                goto error;
            }
            insize++;
        }
        /* 4=maximum length of a UTF-8 sequence */
        while (insize <= 4 && (in + insize) <= endin);

        if (outsize <= 0) {
            Py_ssize_t startinpos, endinpos, outpos;

            /* last character in partial decode? */
            if (in + insize >= endin && !final)
                break;

            /* report the first byte only; the handler updates `in` (and may
               replace the buffer, hence the position is passed as an index) */
            startinpos = in - startin;
            endinpos = startinpos + 1;
            outpos = out - *buf;
            if (unicode_decode_call_errorhandler_wchar(
                    errors, &errorHandler,
                    encoding, reason,
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
                    buf, bufsize, &outpos))
            {
                goto error;
            }
            out = *buf + outpos;
        }
        else {
            in += insize;
            memcpy(out, buffer, outsize * sizeof(wchar_t));
            out += outsize;
        }
    }

    /* Shrink the buffer */
    assert(out - *buf <= *bufsize);
    *bufsize = out - *buf;
    /* (in - startin) <= size and size is an int */
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);

error:
    /* common exit: reached on success as well, with ret >= 0 */
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
7410
7411 static PyObject *
decode_code_page_stateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7412 decode_code_page_stateful(int code_page,
7413 const char *s, Py_ssize_t size,
7414 const char *errors, Py_ssize_t *consumed)
7415 {
7416 wchar_t *buf = NULL;
7417 Py_ssize_t bufsize = 0;
7418 int chunk_size, final, converted, done;
7419
7420 if (code_page < 0) {
7421 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7422 return NULL;
7423 }
7424 if (size < 0) {
7425 PyErr_BadInternalCall();
7426 return NULL;
7427 }
7428
7429 if (consumed)
7430 *consumed = 0;
7431
7432 do
7433 {
7434 #ifdef NEED_RETRY
7435 if (size > DECODING_CHUNK_SIZE) {
7436 chunk_size = DECODING_CHUNK_SIZE;
7437 final = 0;
7438 done = 0;
7439 }
7440 else
7441 #endif
7442 {
7443 chunk_size = (int)size;
7444 final = (consumed == NULL);
7445 done = 1;
7446 }
7447
7448 if (chunk_size == 0 && done) {
7449 if (buf != NULL)
7450 break;
7451 _Py_RETURN_UNICODE_EMPTY();
7452 }
7453
7454 converted = decode_code_page_strict(code_page, &buf, &bufsize,
7455 s, chunk_size);
7456 if (converted == -2)
7457 converted = decode_code_page_errors(code_page, &buf, &bufsize,
7458 s, chunk_size,
7459 errors, final);
7460 assert(converted != 0 || done);
7461
7462 if (converted < 0) {
7463 PyMem_Free(buf);
7464 return NULL;
7465 }
7466
7467 if (consumed)
7468 *consumed += converted;
7469
7470 s += converted;
7471 size -= converted;
7472 } while (!done);
7473
7474 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7475 PyMem_Free(buf);
7476 return v;
7477 }
7478
7479 PyObject *
PyUnicode_DecodeCodePageStateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7480 PyUnicode_DecodeCodePageStateful(int code_page,
7481 const char *s,
7482 Py_ssize_t size,
7483 const char *errors,
7484 Py_ssize_t *consumed)
7485 {
7486 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7487 }
7488
7489 PyObject *
PyUnicode_DecodeMBCSStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7490 PyUnicode_DecodeMBCSStateful(const char *s,
7491 Py_ssize_t size,
7492 const char *errors,
7493 Py_ssize_t *consumed)
7494 {
7495 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7496 }
7497
7498 PyObject *
PyUnicode_DecodeMBCS(const char * s,Py_ssize_t size,const char * errors)7499 PyUnicode_DecodeMBCS(const char *s,
7500 Py_ssize_t size,
7501 const char *errors)
7502 {
7503 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7504 }
7505
7506 static DWORD
encode_code_page_flags(UINT code_page,const char * errors)7507 encode_code_page_flags(UINT code_page, const char *errors)
7508 {
7509 if (code_page == CP_UTF8) {
7510 return WC_ERR_INVALID_CHARS;
7511 }
7512 else if (code_page == CP_UTF7) {
7513 /* CP_UTF7 only supports flags=0 */
7514 return 0;
7515 }
7516 else {
7517 if (errors != NULL && strcmp(errors, "replace") == 0)
7518 return 0;
7519 else
7520 return WC_NO_BEST_FIT_CHARS;
7521 }
7522 }
7523
7524 /*
7525 * Encode a Unicode string to a Windows code page into a byte string in strict
7526 * mode.
7527 *
7528 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7529 * an OSError and returns -1 on other error.
7530 */
7531 static int
encode_code_page_strict(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t offset,int len,const char * errors)7532 encode_code_page_strict(UINT code_page, PyObject **outbytes,
7533 PyObject *unicode, Py_ssize_t offset, int len,
7534 const char* errors)
7535 {
7536 BOOL usedDefaultChar = FALSE;
7537 BOOL *pusedDefaultChar = &usedDefaultChar;
7538 int outsize;
7539 wchar_t *p;
7540 Py_ssize_t size;
7541 const DWORD flags = encode_code_page_flags(code_page, NULL);
7542 char *out;
7543 /* Create a substring so that we can get the UTF-16 representation
7544 of just the slice under consideration. */
7545 PyObject *substring;
7546 int ret = -1;
7547
7548 assert(len > 0);
7549
7550 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7551 pusedDefaultChar = &usedDefaultChar;
7552 else
7553 pusedDefaultChar = NULL;
7554
7555 substring = PyUnicode_Substring(unicode, offset, offset+len);
7556 if (substring == NULL)
7557 return -1;
7558 p = PyUnicode_AsWideCharString(substring, &size);
7559 Py_CLEAR(substring);
7560 if (p == NULL) {
7561 return -1;
7562 }
7563 assert(size <= INT_MAX);
7564
7565 /* First get the size of the result */
7566 outsize = WideCharToMultiByte(code_page, flags,
7567 p, (int)size,
7568 NULL, 0,
7569 NULL, pusedDefaultChar);
7570 if (outsize <= 0)
7571 goto error;
7572 /* If we used a default char, then we failed! */
7573 if (pusedDefaultChar && *pusedDefaultChar) {
7574 ret = -2;
7575 goto done;
7576 }
7577
7578 if (*outbytes == NULL) {
7579 /* Create string object */
7580 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7581 if (*outbytes == NULL) {
7582 goto done;
7583 }
7584 out = PyBytes_AS_STRING(*outbytes);
7585 }
7586 else {
7587 /* Extend string object */
7588 const Py_ssize_t n = PyBytes_Size(*outbytes);
7589 if (outsize > PY_SSIZE_T_MAX - n) {
7590 PyErr_NoMemory();
7591 goto done;
7592 }
7593 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7594 goto done;
7595 }
7596 out = PyBytes_AS_STRING(*outbytes) + n;
7597 }
7598
7599 /* Do the conversion */
7600 outsize = WideCharToMultiByte(code_page, flags,
7601 p, (int)size,
7602 out, outsize,
7603 NULL, pusedDefaultChar);
7604 if (outsize <= 0)
7605 goto error;
7606 if (pusedDefaultChar && *pusedDefaultChar) {
7607 ret = -2;
7608 goto done;
7609 }
7610 ret = 0;
7611
7612 done:
7613 PyMem_Free(p);
7614 return ret;
7615
7616 error:
7617 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
7618 ret = -2;
7619 goto done;
7620 }
7621 PyErr_SetFromWindowsErr(0);
7622 goto done;
7623 }
7624
/*
 * Encode a Unicode string to a Windows code page into a byte string using an
 * error handler.
 *
 * The characters unicode[unicode_offset : unicode_offset + insize] are
 * converted one at a time with WideCharToMultiByte().  A character that
 * cannot be converted is passed to the error handler named by `errors`; the
 * handler's replacement (bytes, or a str restricted to ASCII) is copied to
 * the output instead.  The encoded data is stored in a new bytes object if
 * *outbytes is NULL, otherwise it is appended to *outbytes.
 *
 * Returns 0 on success.  On failure an exception is set and -1 is returned;
 * with errors == NULL or "strict" the function always fails after handing a
 * UnicodeEncodeError to PyCodec_StrictErrors().
 */
static int
encode_code_page_errors(UINT code_page, PyObject **outbytes,
                        PyObject *unicode, Py_ssize_t unicode_offset,
                        Py_ssize_t insize, const char* errors)
{
    const DWORD flags = encode_code_page_flags(code_page, errors);
    Py_ssize_t pos = unicode_offset;
    Py_ssize_t endin = unicode_offset + insize;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "invalid character";
    /* 4=maximum length of a UTF-8 sequence */
    char buffer[4];
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
    Py_ssize_t outsize;
    char *out;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    const char *encoding;
    Py_ssize_t newpos, newoutsize;
    PyObject *rep;
    int ret = -1;

    assert(insize > 0);

    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if (errors == NULL || strcmp(errors, "strict") == 0) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
           then we raise a UnicodeEncodeError. */
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_DECREF(exc);
        }
        Py_XDECREF(encoding_obj);
        return -1;
    }

    /* The "used default char" out-parameter is only requested for code pages
       other than UTF-8 and UTF-7. */
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
        pusedDefaultChar = &usedDefaultChar;
    else
        pusedDefaultChar = NULL;

    /* Reserve sizeof(buffer) output bytes per input character, guarding the
       multiplication against overflow. */
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
        PyErr_NoMemory();
        goto error;
    }
    outsize = insize * Py_ARRAY_LENGTH(buffer);

    if (*outbytes == NULL) {
        /* Create string object */
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
        if (*outbytes == NULL)
            goto error;
        out = PyBytes_AS_STRING(*outbytes);
    }
    else {
        /* Extend string object */
        Py_ssize_t n = PyBytes_Size(*outbytes);
        if (n > PY_SSIZE_T_MAX - outsize) {
            PyErr_NoMemory();
            goto error;
        }
        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
            goto error;
        out = PyBytes_AS_STRING(*outbytes) + n;
    }

    /* Encode the string character per character */
    while (pos < endin)
    {
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
        wchar_t chars[2];
        int charsize;
        /* Characters above the BMP are passed as a surrogate pair. */
        if (ch < 0x10000) {
            chars[0] = (wchar_t)ch;
            charsize = 1;
        }
        else {
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
            charsize = 2;
        }

        outsize = WideCharToMultiByte(code_page, flags,
                                      chars, charsize,
                                      buffer, Py_ARRAY_LENGTH(buffer),
                                      NULL, pusedDefaultChar);
        if (outsize > 0) {
            /* Accept the conversion unless the default character was
               substituted; in that case fall through to the error handler. */
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
            {
                pos++;
                memcpy(out, buffer, outsize);
                out += outsize;
                continue;
            }
        }
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
            PyErr_SetFromWindowsErr(0);
            goto error;
        }

        /* The character is not encodable: ask the error handler. */
        rep = unicode_encode_call_errorhandler(
                  errors, &errorHandler, encoding, reason,
                  unicode, &exc,
                  pos, pos + 1, &newpos);
        if (rep == NULL)
            goto error;

        /* Extra room needed beyond what was reserved: the replacement size
           plus the distance between pos and the position chosen by the
           handler.  The buffer is only grown when this is positive. */
        Py_ssize_t morebytes = pos - newpos;
        if (PyBytes_Check(rep)) {
            outsize = PyBytes_GET_SIZE(rep);
            morebytes += outsize;
            if (morebytes > 0) {
                /* Resizing may move the buffer: remember the offset. */
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
            out += outsize;
        }
        else {
            Py_ssize_t i;
            int kind;
            const void *data;

            outsize = PyUnicode_GET_LENGTH(rep);
            morebytes += outsize;
            if (morebytes > 0) {
                /* Resizing may move the buffer: remember the offset. */
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            kind = PyUnicode_KIND(rep);
            data = PyUnicode_DATA(rep);
            /* A str replacement is copied byte per character, so it must be
               pure ASCII. */
            for (i=0; i < outsize; i++) {
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
                if (ch > 127) {
                    raise_encode_exception(&exc,
                        encoding, unicode,
                        pos, pos + 1,
                        "unable to encode error handler result to ASCII");
                    Py_DECREF(rep);
                    goto error;
                }
                *out = (unsigned char)ch;
                out++;
            }
        }
        /* Resume at the position requested by the error handler. */
        pos = newpos;
        Py_DECREF(rep);
    }
    /* write a NUL byte */
    *out = 0;
    /* Trim the over-allocated buffer to the bytes actually written. */
    outsize = out - PyBytes_AS_STRING(*outbytes);
    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
    if (_PyBytes_Resize(outbytes, outsize) < 0)
        goto error;
    ret = 0;

    /* Shared exit path: also reached on success, with ret == 0. */
error:
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
7810
7811 static PyObject *
encode_code_page(int code_page,PyObject * unicode,const char * errors)7812 encode_code_page(int code_page,
7813 PyObject *unicode,
7814 const char *errors)
7815 {
7816 Py_ssize_t len;
7817 PyObject *outbytes = NULL;
7818 Py_ssize_t offset;
7819 int chunk_len, ret, done;
7820
7821 if (!PyUnicode_Check(unicode)) {
7822 PyErr_BadArgument();
7823 return NULL;
7824 }
7825
7826 len = PyUnicode_GET_LENGTH(unicode);
7827
7828 if (code_page < 0) {
7829 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7830 return NULL;
7831 }
7832
7833 if (len == 0)
7834 return PyBytes_FromStringAndSize(NULL, 0);
7835
7836 offset = 0;
7837 do
7838 {
7839 #ifdef NEED_RETRY
7840 if (len > DECODING_CHUNK_SIZE) {
7841 chunk_len = DECODING_CHUNK_SIZE;
7842 done = 0;
7843 }
7844 else
7845 #endif
7846 {
7847 chunk_len = (int)len;
7848 done = 1;
7849 }
7850
7851 ret = encode_code_page_strict(code_page, &outbytes,
7852 unicode, offset, chunk_len,
7853 errors);
7854 if (ret == -2)
7855 ret = encode_code_page_errors(code_page, &outbytes,
7856 unicode, offset,
7857 chunk_len, errors);
7858 if (ret < 0) {
7859 Py_XDECREF(outbytes);
7860 return NULL;
7861 }
7862
7863 offset += chunk_len;
7864 len -= chunk_len;
7865 } while (!done);
7866
7867 return outbytes;
7868 }
7869
/* Encode `unicode` to bytes using the Windows code page `code_page` and the
   error handler named by `errors`.  This is a non-static wrapper that
   forwards all arguments unchanged to encode_code_page() and returns its
   result. */
PyObject *
PyUnicode_EncodeCodePage(int code_page,
                         PyObject *unicode,
                         const char *errors)
{
    return encode_code_page(code_page, unicode, errors);
}
7877
/* Encode `unicode` with the CP_ACP code page, passing NULL as the error
   handler name.  Returns whatever PyUnicode_EncodeCodePage() returns. */
PyObject *
PyUnicode_AsMBCSString(PyObject *unicode)
{
    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
}
7883
7884 #undef NEED_RETRY
7885
7886 #endif /* MS_WINDOWS */
7887
7888 /* --- Character Mapping Codec -------------------------------------------- */
7889
/* Decode the `size` bytes at `s` through a decoding table given as a str:
   byte value b maps to the character mapping[b].  A byte whose value is
   beyond the end of `mapping`, or whose table entry is U+FFFE, is treated
   as undefined and reported to the error handler named by `errors`.
   Output is appended to `writer`.

   Returns 0 on success, -1 with an exception set on error. */
static int
charmap_decode_string(const char *s,
                      Py_ssize_t size,
                      PyObject *mapping,
                      const char *errors,
                      _PyUnicodeWriter *writer)
{
    const char *starts = s;
    const char *e;
    Py_ssize_t startinpos, endinpos;
    PyObject *errorHandler = NULL, *exc = NULL;
    Py_ssize_t maplen;
    int mapkind;
    const void *mapdata;
    Py_UCS4 x;
    unsigned char ch;

    maplen = PyUnicode_GET_LENGTH(mapping);
    mapdata = PyUnicode_DATA(mapping);
    mapkind = PyUnicode_KIND(mapping);

    e = s + size;

    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
         * is disabled in encoding aliases, latin1 is preferred because
         * its implementation is faster. */
        /* Every byte has a table entry here and a 1-byte-kind entry cannot
           be 0xFFFE, so this loop never has to call the error handler. */
        const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
        Py_UCS4 maxchar = writer->maxchar;

        assert (writer->kind == PyUnicode_1BYTE_KIND);
        while (s < e) {
            ch = *s;
            x = mapdata_ucs1[ch];
            if (x > maxchar) {
                /* Raise the writer's maxchar to 0xff, then refresh the
                   cached maxchar and data pointer. */
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
                    goto onError;
                maxchar = writer->maxchar;
                outdata = (Py_UCS1 *)writer->data;
            }
            outdata[writer->pos] = x;
            writer->pos++;
            ++s;
        }
        return 0;
    }

    while (s < e) {
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
            /* Tight inner loops for a full 2-byte-kind table.  They write
               directly into the writer buffer and jump to Error as soon as
               a character needs the generic handling below. */
            int outkind = writer->kind;
            const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
            if (outkind == PyUnicode_1BYTE_KIND) {
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
                Py_UCS4 maxchar = writer->maxchar;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    /* Does not fit the current buffer (this also covers
                       0xFFFE): let the generic code deal with it. */
                    if (x > maxchar)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
            else if (outkind == PyUnicode_2BYTE_KIND) {
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    if (x == 0xFFFE)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
        }
        /* Generic path: one byte per iteration. */
        ch = *s;

        if (ch < maplen)
            x = PyUnicode_READ(mapkind, mapdata, ch);
        else
            x = 0xfffe; /* invalid value */
        /* Also entered from the fast loops above with x already set. */
Error:
        if (x == 0xfffe)
        {
            /* undefined mapping */
            startinpos = s-starts;
            endinpos = startinpos+1;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "charmap", "character maps to <undefined>",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    writer)) {
                goto onError;
            }
            continue;
        }

        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
            goto onError;
        ++s;
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return 0;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return -1;
}
8005
8006 static int
charmap_decode_mapping(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors,_PyUnicodeWriter * writer)8007 charmap_decode_mapping(const char *s,
8008 Py_ssize_t size,
8009 PyObject *mapping,
8010 const char *errors,
8011 _PyUnicodeWriter *writer)
8012 {
8013 const char *starts = s;
8014 const char *e;
8015 Py_ssize_t startinpos, endinpos;
8016 PyObject *errorHandler = NULL, *exc = NULL;
8017 unsigned char ch;
8018 PyObject *key, *item = NULL;
8019
8020 e = s + size;
8021
8022 while (s < e) {
8023 ch = *s;
8024
8025 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8026 key = PyLong_FromLong((long)ch);
8027 if (key == NULL)
8028 goto onError;
8029
8030 item = PyObject_GetItem(mapping, key);
8031 Py_DECREF(key);
8032 if (item == NULL) {
8033 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8034 /* No mapping found means: mapping is undefined. */
8035 PyErr_Clear();
8036 goto Undefined;
8037 } else
8038 goto onError;
8039 }
8040
8041 /* Apply mapping */
8042 if (item == Py_None)
8043 goto Undefined;
8044 if (PyLong_Check(item)) {
8045 long value = PyLong_AS_LONG(item);
8046 if (value == 0xFFFE)
8047 goto Undefined;
8048 if (value < 0 || value > MAX_UNICODE) {
8049 PyErr_Format(PyExc_TypeError,
8050 "character mapping must be in range(0x%x)",
8051 (unsigned long)MAX_UNICODE + 1);
8052 goto onError;
8053 }
8054
8055 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8056 goto onError;
8057 }
8058 else if (PyUnicode_Check(item)) {
8059 if (PyUnicode_GET_LENGTH(item) == 1) {
8060 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8061 if (value == 0xFFFE)
8062 goto Undefined;
8063 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8064 goto onError;
8065 }
8066 else {
8067 writer->overallocate = 1;
8068 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8069 goto onError;
8070 }
8071 }
8072 else {
8073 /* wrong return value */
8074 PyErr_SetString(PyExc_TypeError,
8075 "character mapping must return integer, None or str");
8076 goto onError;
8077 }
8078 Py_CLEAR(item);
8079 ++s;
8080 continue;
8081
8082 Undefined:
8083 /* undefined mapping */
8084 Py_CLEAR(item);
8085 startinpos = s-starts;
8086 endinpos = startinpos+1;
8087 if (unicode_decode_call_errorhandler_writer(
8088 errors, &errorHandler,
8089 "charmap", "character maps to <undefined>",
8090 &starts, &e, &startinpos, &endinpos, &exc, &s,
8091 writer)) {
8092 goto onError;
8093 }
8094 }
8095 Py_XDECREF(errorHandler);
8096 Py_XDECREF(exc);
8097 return 0;
8098
8099 onError:
8100 Py_XDECREF(item);
8101 Py_XDECREF(errorHandler);
8102 Py_XDECREF(exc);
8103 return -1;
8104 }
8105
8106 PyObject *
PyUnicode_DecodeCharmap(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors)8107 PyUnicode_DecodeCharmap(const char *s,
8108 Py_ssize_t size,
8109 PyObject *mapping,
8110 const char *errors)
8111 {
8112 _PyUnicodeWriter writer;
8113
8114 /* Default to Latin-1 */
8115 if (mapping == NULL)
8116 return PyUnicode_DecodeLatin1(s, size, errors);
8117
8118 if (size == 0)
8119 _Py_RETURN_UNICODE_EMPTY();
8120 _PyUnicodeWriter_Init(&writer);
8121 writer.min_length = size;
8122 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8123 goto onError;
8124
8125 if (PyUnicode_CheckExact(mapping)) {
8126 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8127 goto onError;
8128 }
8129 else {
8130 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8131 goto onError;
8132 }
8133 return _PyUnicodeWriter_Finish(&writer);
8134
8135 onError:
8136 _PyUnicodeWriter_Dealloc(&writer);
8137 return NULL;
8138 }
8139
8140 /* Charmap encoding: the lookup table */
8141
8142 /*[clinic input]
8143 class EncodingMap "struct encoding_map *" "&EncodingMapType"
8144 [clinic start generated code]*/
8145 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=14e46bbb6c522d22]*/
8146
/* Compact lookup table used by the charmap encoder: a three-level trie
   mapping code points up to 0xFFFF to byte values.  Instances are created
   by PyUnicode_BuildEncodingMap() and read by encoding_map_lookup(). */
struct encoding_map {
    PyObject_HEAD
    /* Level 1: indexed by (code point >> 11).  Holds the number of the
       level-2 block to use, or 0xFF if no character in that range is
       mapped. */
    unsigned char level1[32];
    /* Number of 16-byte level-2 blocks and of 128-byte level-3 blocks
       stored in level23. */
    int count2, count3;
    /* Start of the variable-length tail: 16*count2 bytes of level-2 blocks
       followed by 128*count3 bytes of level-3 blocks.  It is declared with
       one element, which the size computations subtract again ("- 1"). */
    unsigned char level23[1];
};
8153
8154 /*[clinic input]
8155 EncodingMap.size
8156
8157 Return the size (in bytes) of this object.
8158 [clinic start generated code]*/
8159
8160 static PyObject *
EncodingMap_size_impl(struct encoding_map * self)8161 EncodingMap_size_impl(struct encoding_map *self)
8162 /*[clinic end generated code: output=c4c969e4c99342a4 input=004ff13f26bb5366]*/
8163 {
8164 return PyLong_FromLong((sizeof(*self) - 1) + 16*self->count2 +
8165 128*self->count3);
8166 }
8167
/* Method table of the EncodingMap type (referenced from
   EncodingMapType.tp_methods).
   NOTE(review): ENCODINGMAP_SIZE_METHODDEF is defined outside this section,
   presumably generated by Argument Clinic for EncodingMap.size -- confirm. */
static PyMethodDef encoding_map_methods[] = {
    ENCODINGMAP_SIZE_METHODDEF
    {NULL, NULL}   /* sentinel */
};
8172
/* Type object for struct encoding_map instances.  Only the name, the basic
   size, the default flags and the method table are filled in here; every
   other slot is left zero-initialized. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    .tp_name = "EncodingMap",
    .tp_basicsize = sizeof(struct encoding_map),
    /* methods */
    .tp_flags = Py_TPFLAGS_DEFAULT,
    .tp_methods = encoding_map_methods,
};
8181
8182 PyObject*
PyUnicode_BuildEncodingMap(PyObject * string)8183 PyUnicode_BuildEncodingMap(PyObject* string)
8184 {
8185 PyObject *result;
8186 struct encoding_map *mresult;
8187 int i;
8188 int need_dict = 0;
8189 unsigned char level1[32];
8190 unsigned char level2[512];
8191 unsigned char *mlevel1, *mlevel2, *mlevel3;
8192 int count2 = 0, count3 = 0;
8193 int kind;
8194 const void *data;
8195 Py_ssize_t length;
8196 Py_UCS4 ch;
8197
8198 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
8199 PyErr_BadArgument();
8200 return NULL;
8201 }
8202 kind = PyUnicode_KIND(string);
8203 data = PyUnicode_DATA(string);
8204 length = PyUnicode_GET_LENGTH(string);
8205 length = Py_MIN(length, 256);
8206 memset(level1, 0xFF, sizeof level1);
8207 memset(level2, 0xFF, sizeof level2);
8208
8209 /* If there isn't a one-to-one mapping of NULL to \0,
8210 or if there are non-BMP characters, we need to use
8211 a mapping dictionary. */
8212 if (PyUnicode_READ(kind, data, 0) != 0)
8213 need_dict = 1;
8214 for (i = 1; i < length; i++) {
8215 int l1, l2;
8216 ch = PyUnicode_READ(kind, data, i);
8217 if (ch == 0 || ch > 0xFFFF) {
8218 need_dict = 1;
8219 break;
8220 }
8221 if (ch == 0xFFFE)
8222 /* unmapped character */
8223 continue;
8224 l1 = ch >> 11;
8225 l2 = ch >> 7;
8226 if (level1[l1] == 0xFF)
8227 level1[l1] = count2++;
8228 if (level2[l2] == 0xFF)
8229 level2[l2] = count3++;
8230 }
8231
8232 if (count2 >= 0xFF || count3 >= 0xFF)
8233 need_dict = 1;
8234
8235 if (need_dict) {
8236 PyObject *result = PyDict_New();
8237 if (!result)
8238 return NULL;
8239 for (i = 0; i < length; i++) {
8240 Py_UCS4 c = PyUnicode_READ(kind, data, i);
8241 PyObject *key = PyLong_FromLong(c);
8242 if (key == NULL) {
8243 Py_DECREF(result);
8244 return NULL;
8245 }
8246 PyObject *value = PyLong_FromLong(i);
8247 if (value == NULL) {
8248 Py_DECREF(key);
8249 Py_DECREF(result);
8250 return NULL;
8251 }
8252 int rc = PyDict_SetItem(result, key, value);
8253 Py_DECREF(key);
8254 Py_DECREF(value);
8255 if (rc < 0) {
8256 Py_DECREF(result);
8257 return NULL;
8258 }
8259 }
8260 return result;
8261 }
8262
8263 /* Create a three-level trie */
8264 result = PyObject_Malloc(sizeof(struct encoding_map) +
8265 16*count2 + 128*count3 - 1);
8266 if (!result) {
8267 return PyErr_NoMemory();
8268 }
8269
8270 _PyObject_Init(result, &EncodingMapType);
8271 mresult = (struct encoding_map*)result;
8272 mresult->count2 = count2;
8273 mresult->count3 = count3;
8274 mlevel1 = mresult->level1;
8275 mlevel2 = mresult->level23;
8276 mlevel3 = mresult->level23 + 16*count2;
8277 memcpy(mlevel1, level1, 32);
8278 memset(mlevel2, 0xFF, 16*count2);
8279 memset(mlevel3, 0, 128*count3);
8280 count3 = 0;
8281 for (i = 1; i < length; i++) {
8282 int o1, o2, o3, i2, i3;
8283 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8284 if (ch == 0xFFFE)
8285 /* unmapped character */
8286 continue;
8287 o1 = ch>>11;
8288 o2 = (ch>>7) & 0xF;
8289 i2 = 16*mlevel1[o1] + o2;
8290 if (mlevel2[i2] == 0xFF)
8291 mlevel2[i2] = count3++;
8292 o3 = ch & 0x7F;
8293 i3 = 128*mlevel2[i2] + o3;
8294 mlevel3[i3] = i;
8295 }
8296 return result;
8297 }
8298
8299 static int
encoding_map_lookup(Py_UCS4 c,PyObject * mapping)8300 encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8301 {
8302 struct encoding_map *map = (struct encoding_map*)mapping;
8303 int l1 = c>>11;
8304 int l2 = (c>>7) & 0xF;
8305 int l3 = c & 0x7F;
8306 int i;
8307
8308 if (c > 0xFFFF)
8309 return -1;
8310 if (c == 0)
8311 return 0;
8312 /* level 1*/
8313 i = map->level1[l1];
8314 if (i == 0xFF) {
8315 return -1;
8316 }
8317 /* level 2*/
8318 i = map->level23[16*i+l2];
8319 if (i == 0xFF) {
8320 return -1;
8321 }
8322 /* level 3 */
8323 i = map->level23[16*map->count2 + 128*i + l3];
8324 if (i == 0) {
8325 return -1;
8326 }
8327 return i;
8328 }
8329
8330 /* Lookup the character ch in the mapping. If the character
8331 can't be found, Py_None is returned (or NULL, if another
8332 error occurred). */
8333 static PyObject *
charmapencode_lookup(Py_UCS4 c,PyObject * mapping)8334 charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
8335 {
8336 PyObject *w = PyLong_FromLong((long)c);
8337 PyObject *x;
8338
8339 if (w == NULL)
8340 return NULL;
8341 x = PyObject_GetItem(mapping, w);
8342 Py_DECREF(w);
8343 if (x == NULL) {
8344 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8345 /* No mapping found means: mapping is undefined. */
8346 PyErr_Clear();
8347 Py_RETURN_NONE;
8348 } else
8349 return NULL;
8350 }
8351 else if (x == Py_None)
8352 return x;
8353 else if (PyLong_Check(x)) {
8354 long value = PyLong_AS_LONG(x);
8355 if (value < 0 || value > 255) {
8356 PyErr_SetString(PyExc_TypeError,
8357 "character mapping must be in range(256)");
8358 Py_DECREF(x);
8359 return NULL;
8360 }
8361 return x;
8362 }
8363 else if (PyBytes_Check(x))
8364 return x;
8365 else {
8366 /* wrong return value */
8367 PyErr_Format(PyExc_TypeError,
8368 "character mapping must return integer, bytes or None, not %.400s",
8369 Py_TYPE(x)->tp_name);
8370 Py_DECREF(x);
8371 return NULL;
8372 }
8373 }
8374
8375 static int
charmapencode_resize(PyObject ** outobj,Py_ssize_t * outpos,Py_ssize_t requiredsize)8376 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8377 {
8378 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8379 /* exponentially overallocate to minimize reallocations */
8380 if (requiredsize < 2*outsize)
8381 requiredsize = 2*outsize;
8382 if (_PyBytes_Resize(outobj, requiredsize))
8383 return -1;
8384 return 0;
8385 }
8386
/* Outcome of charmapencode_output():
     enc_SUCCESS    the encoded byte(s) were stored in the output;
     enc_FAILED     the character has no mapping, nothing was written;
     enc_EXCEPTION  the lookup or the output resize failed. */
typedef enum charmapencode_result {
    enc_SUCCESS, enc_FAILED, enc_EXCEPTION
} charmapencode_result;
8390 /* lookup the character, put the result in the output string and adjust
8391 various state variables. Resize the output bytes object if not enough
8392 space is available. Return a new reference to the object that
8393 was put in the output buffer, or Py_None, if the mapping was undefined
8394 (in which case no character was written) or NULL, if a
8395 reallocation error occurred. The caller must decref the result */
8396 static charmapencode_result
charmapencode_output(Py_UCS4 c,PyObject * mapping,PyObject ** outobj,Py_ssize_t * outpos)8397 charmapencode_output(Py_UCS4 c, PyObject *mapping,
8398 PyObject **outobj, Py_ssize_t *outpos)
8399 {
8400 PyObject *rep;
8401 char *outstart;
8402 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8403
8404 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8405 int res = encoding_map_lookup(c, mapping);
8406 Py_ssize_t requiredsize = *outpos+1;
8407 if (res == -1)
8408 return enc_FAILED;
8409 if (outsize<requiredsize)
8410 if (charmapencode_resize(outobj, outpos, requiredsize))
8411 return enc_EXCEPTION;
8412 outstart = PyBytes_AS_STRING(*outobj);
8413 outstart[(*outpos)++] = (char)res;
8414 return enc_SUCCESS;
8415 }
8416
8417 rep = charmapencode_lookup(c, mapping);
8418 if (rep==NULL)
8419 return enc_EXCEPTION;
8420 else if (rep==Py_None) {
8421 Py_DECREF(rep);
8422 return enc_FAILED;
8423 } else {
8424 if (PyLong_Check(rep)) {
8425 Py_ssize_t requiredsize = *outpos+1;
8426 if (outsize<requiredsize)
8427 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8428 Py_DECREF(rep);
8429 return enc_EXCEPTION;
8430 }
8431 outstart = PyBytes_AS_STRING(*outobj);
8432 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8433 }
8434 else {
8435 const char *repchars = PyBytes_AS_STRING(rep);
8436 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8437 Py_ssize_t requiredsize = *outpos+repsize;
8438 if (outsize<requiredsize)
8439 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8440 Py_DECREF(rep);
8441 return enc_EXCEPTION;
8442 }
8443 outstart = PyBytes_AS_STRING(*outobj);
8444 memcpy(outstart + *outpos, repchars, repsize);
8445 *outpos += repsize;
8446 }
8447 }
8448 Py_DECREF(rep);
8449 return enc_SUCCESS;
8450 }
8451
/* handle an error in PyUnicode_EncodeCharmap

   Called when the character at *inpos cannot be encoded with `mapping`.
   The run of consecutive unencodable characters starting at *inpos is
   collected and then handled according to the error handler named by
   `errors`.  The handler kind is cached in *error_handler and the callback
   object in *error_handler_obj, so both are resolved only once.
   Replacement output is appended to *res at *respos; on success *inpos is
   moved to where encoding should resume.

   Return 0 on success, -1 on error */
static int
charmap_encoding_error(
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
    PyObject **exceptionObject,
    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
    PyObject **res, Py_ssize_t *respos)
{
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
    Py_ssize_t size, repsize;
    Py_ssize_t newpos;
    int kind;
    const void *data;
    Py_ssize_t index;
    /* startpos for collecting unencodable chars */
    Py_ssize_t collstartpos = *inpos;
    Py_ssize_t collendpos = *inpos+1;
    Py_ssize_t collpos;
    const char *encoding = "charmap";
    const char *reason = "character maps to <undefined>";
    charmapencode_result x;
    Py_UCS4 ch;
    int val;

    size = PyUnicode_GET_LENGTH(unicode);
    /* find all unencodable characters */
    while (collendpos < size) {
        PyObject *rep;
        if (Py_IS_TYPE(mapping, &EncodingMapType)) {
            /* Trie mapping: -1 means the character is unencodable. */
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
            val = encoding_map_lookup(ch, mapping);
            if (val != -1)
                break;
            ++collendpos;
            continue;
        }

        /* Generic mapping: Py_None means the character is unencodable. */
        ch = PyUnicode_READ_CHAR(unicode, collendpos);
        rep = charmapencode_lookup(ch, mapping);
        if (rep==NULL)
            return -1;
        else if (rep!=Py_None) {
            Py_DECREF(rep);
            break;
        }
        Py_DECREF(rep);
        ++collendpos;
    }
    /* cache callback name lookup
     * (if not done yet, i.e. it's the first error) */
    if (*error_handler == _Py_ERROR_UNKNOWN)
        *error_handler = _Py_GetErrorHandler(errors);

    switch (*error_handler) {
    case _Py_ERROR_STRICT:
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
        return -1;

    case _Py_ERROR_REPLACE:
        /* Emit one '?' per unencodable character; '?' itself has to be
           encodable with the mapping. */
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
            x = charmapencode_output('?', mapping, res, respos);
            if (x==enc_EXCEPTION) {
                return -1;
            }
            else if (x==enc_FAILED) {
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        /* fall through */
    case _Py_ERROR_IGNORE:
        *inpos = collendpos;
        break;

    case _Py_ERROR_XMLCHARREFREPLACE:
        /* generate replacement: each character becomes "&#<decimal>;",
           which is then encoded through the mapping one character at a
           time */
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
            char buffer[2+29+1+1];
            char *cp;
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
            for (cp = buffer; *cp; ++cp) {
                x = charmapencode_output(*cp, mapping, res, respos);
                if (x==enc_EXCEPTION)
                    return -1;
                else if (x==enc_FAILED) {
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                    return -1;
                }
            }
        }
        *inpos = collendpos;
        break;

    default:
        /* Any other handler: call the registered callback. */
        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
                                                      encoding, reason, unicode, exceptionObject,
                                                      collstartpos, collendpos, &newpos);
        if (repunicode == NULL)
            return -1;
        if (PyBytes_Check(repunicode)) {
            /* Directly copy bytes result to output. */
            Py_ssize_t outsize = PyBytes_Size(*res);
            Py_ssize_t requiredsize;
            repsize = PyBytes_Size(repunicode);
            requiredsize = *respos + repsize;
            if (requiredsize > outsize)
                /* Make room for all additional bytes. */
                if (charmapencode_resize(res, respos, requiredsize)) {
                    Py_DECREF(repunicode);
                    return -1;
                }
            memcpy(PyBytes_AsString(*res) + *respos,
                   PyBytes_AsString(repunicode), repsize);
            *respos += repsize;
            *inpos = newpos;
            Py_DECREF(repunicode);
            break;
        }
        /* generate replacement: a str result is itself encoded through
           the mapping, character by character */
        repsize = PyUnicode_GET_LENGTH(repunicode);
        data = PyUnicode_DATA(repunicode);
        kind = PyUnicode_KIND(repunicode);
        for (index = 0; index < repsize; index++) {
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
            x = charmapencode_output(repch, mapping, res, respos);
            if (x==enc_EXCEPTION) {
                Py_DECREF(repunicode);
                return -1;
            }
            else if (x==enc_FAILED) {
                Py_DECREF(repunicode);
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        /* Resume at the position requested by the callback. */
        *inpos = newpos;
        Py_DECREF(repunicode);
    }
    return 0;
}
8593
8594 PyObject *
_PyUnicode_EncodeCharmap(PyObject * unicode,PyObject * mapping,const char * errors)8595 _PyUnicode_EncodeCharmap(PyObject *unicode,
8596 PyObject *mapping,
8597 const char *errors)
8598 {
8599 /* output object */
8600 PyObject *res = NULL;
8601 /* current input position */
8602 Py_ssize_t inpos = 0;
8603 Py_ssize_t size;
8604 /* current output position */
8605 Py_ssize_t respos = 0;
8606 PyObject *error_handler_obj = NULL;
8607 PyObject *exc = NULL;
8608 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
8609 const void *data;
8610 int kind;
8611
8612 size = PyUnicode_GET_LENGTH(unicode);
8613 data = PyUnicode_DATA(unicode);
8614 kind = PyUnicode_KIND(unicode);
8615
8616 /* Default to Latin-1 */
8617 if (mapping == NULL)
8618 return unicode_encode_ucs1(unicode, errors, 256);
8619
8620 /* allocate enough for a simple encoding without
8621 replacements, if we need more, we'll resize */
8622 res = PyBytes_FromStringAndSize(NULL, size);
8623 if (res == NULL)
8624 goto onError;
8625 if (size == 0)
8626 return res;
8627
8628 while (inpos<size) {
8629 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
8630 /* try to encode it */
8631 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8632 if (x==enc_EXCEPTION) /* error */
8633 goto onError;
8634 if (x==enc_FAILED) { /* unencodable character */
8635 if (charmap_encoding_error(unicode, &inpos, mapping,
8636 &exc,
8637 &error_handler, &error_handler_obj, errors,
8638 &res, &respos)) {
8639 goto onError;
8640 }
8641 }
8642 else
8643 /* done with this character => adjust input position */
8644 ++inpos;
8645 }
8646
8647 /* Resize if we allocated to much */
8648 if (respos<PyBytes_GET_SIZE(res))
8649 if (_PyBytes_Resize(&res, respos) < 0)
8650 goto onError;
8651
8652 Py_XDECREF(exc);
8653 Py_XDECREF(error_handler_obj);
8654 return res;
8655
8656 onError:
8657 Py_XDECREF(res);
8658 Py_XDECREF(exc);
8659 Py_XDECREF(error_handler_obj);
8660 return NULL;
8661 }
8662
8663 PyObject *
PyUnicode_AsCharmapString(PyObject * unicode,PyObject * mapping)8664 PyUnicode_AsCharmapString(PyObject *unicode,
8665 PyObject *mapping)
8666 {
8667 if (!PyUnicode_Check(unicode) || mapping == NULL) {
8668 PyErr_BadArgument();
8669 return NULL;
8670 }
8671 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8672 }
8673
8674 /* create or adjust a UnicodeTranslateError */
8675 static void
make_translate_exception(PyObject ** exceptionObject,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)8676 make_translate_exception(PyObject **exceptionObject,
8677 PyObject *unicode,
8678 Py_ssize_t startpos, Py_ssize_t endpos,
8679 const char *reason)
8680 {
8681 if (*exceptionObject == NULL) {
8682 *exceptionObject = _PyUnicodeTranslateError_Create(
8683 unicode, startpos, endpos, reason);
8684 }
8685 else {
8686 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8687 goto onError;
8688 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8689 goto onError;
8690 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8691 goto onError;
8692 return;
8693 onError:
8694 Py_CLEAR(*exceptionObject);
8695 }
8696 }
8697
8698 /* error handling callback helper:
8699 build arguments, call the callback and check the arguments,
8700 put the result into newpos and return the replacement string, which
8701 has to be freed by the caller */
8702 static PyObject *
unicode_translate_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)8703 unicode_translate_call_errorhandler(const char *errors,
8704 PyObject **errorHandler,
8705 const char *reason,
8706 PyObject *unicode, PyObject **exceptionObject,
8707 Py_ssize_t startpos, Py_ssize_t endpos,
8708 Py_ssize_t *newpos)
8709 {
8710 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
8711
8712 Py_ssize_t i_newpos;
8713 PyObject *restuple;
8714 PyObject *resunicode;
8715
8716 if (*errorHandler == NULL) {
8717 *errorHandler = PyCodec_LookupError(errors);
8718 if (*errorHandler == NULL)
8719 return NULL;
8720 }
8721
8722 make_translate_exception(exceptionObject,
8723 unicode, startpos, endpos, reason);
8724 if (*exceptionObject == NULL)
8725 return NULL;
8726
8727 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
8728 if (restuple == NULL)
8729 return NULL;
8730 if (!PyTuple_Check(restuple)) {
8731 PyErr_SetString(PyExc_TypeError, &argparse[3]);
8732 Py_DECREF(restuple);
8733 return NULL;
8734 }
8735 if (!PyArg_ParseTuple(restuple, argparse,
8736 &resunicode, &i_newpos)) {
8737 Py_DECREF(restuple);
8738 return NULL;
8739 }
8740 if (i_newpos<0)
8741 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8742 else
8743 *newpos = i_newpos;
8744 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8745 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8746 Py_DECREF(restuple);
8747 return NULL;
8748 }
8749 Py_INCREF(resunicode);
8750 Py_DECREF(restuple);
8751 return resunicode;
8752 }
8753
8754 /* Lookup the character ch in the mapping and put the result in result,
8755 which must be decrefed by the caller.
8756 Return 0 on success, -1 on error */
8757 static int
charmaptranslate_lookup(Py_UCS4 c,PyObject * mapping,PyObject ** result)8758 charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8759 {
8760 PyObject *w = PyLong_FromLong((long)c);
8761 PyObject *x;
8762
8763 if (w == NULL)
8764 return -1;
8765 x = PyObject_GetItem(mapping, w);
8766 Py_DECREF(w);
8767 if (x == NULL) {
8768 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8769 /* No mapping found means: use 1:1 mapping. */
8770 PyErr_Clear();
8771 *result = NULL;
8772 return 0;
8773 } else
8774 return -1;
8775 }
8776 else if (x == Py_None) {
8777 *result = x;
8778 return 0;
8779 }
8780 else if (PyLong_Check(x)) {
8781 long value = PyLong_AS_LONG(x);
8782 if (value < 0 || value > MAX_UNICODE) {
8783 PyErr_Format(PyExc_ValueError,
8784 "character mapping must be in range(0x%x)",
8785 MAX_UNICODE+1);
8786 Py_DECREF(x);
8787 return -1;
8788 }
8789 *result = x;
8790 return 0;
8791 }
8792 else if (PyUnicode_Check(x)) {
8793 *result = x;
8794 return 0;
8795 }
8796 else {
8797 /* wrong return value */
8798 PyErr_SetString(PyExc_TypeError,
8799 "character mapping must return integer, None or str");
8800 Py_DECREF(x);
8801 return -1;
8802 }
8803 }
8804
8805 /* lookup the character, write the result into the writer.
8806 Return 1 if the result was written into the writer, return 0 if the mapping
8807 was undefined, raise an exception return -1 on error. */
8808 static int
charmaptranslate_output(Py_UCS4 ch,PyObject * mapping,_PyUnicodeWriter * writer)8809 charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8810 _PyUnicodeWriter *writer)
8811 {
8812 PyObject *item;
8813
8814 if (charmaptranslate_lookup(ch, mapping, &item))
8815 return -1;
8816
8817 if (item == NULL) {
8818 /* not found => default to 1:1 mapping */
8819 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8820 return -1;
8821 }
8822 return 1;
8823 }
8824
8825 if (item == Py_None) {
8826 Py_DECREF(item);
8827 return 0;
8828 }
8829
8830 if (PyLong_Check(item)) {
8831 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8832 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8833 used it */
8834 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8835 Py_DECREF(item);
8836 return -1;
8837 }
8838 Py_DECREF(item);
8839 return 1;
8840 }
8841
8842 if (!PyUnicode_Check(item)) {
8843 Py_DECREF(item);
8844 return -1;
8845 }
8846
8847 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8848 Py_DECREF(item);
8849 return -1;
8850 }
8851
8852 Py_DECREF(item);
8853 return 1;
8854 }
8855
8856 static int
unicode_fast_translate_lookup(PyObject * mapping,Py_UCS1 ch,Py_UCS1 * translate)8857 unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8858 Py_UCS1 *translate)
8859 {
8860 PyObject *item = NULL;
8861 int ret = 0;
8862
8863 if (charmaptranslate_lookup(ch, mapping, &item)) {
8864 return -1;
8865 }
8866
8867 if (item == Py_None) {
8868 /* deletion */
8869 translate[ch] = 0xfe;
8870 }
8871 else if (item == NULL) {
8872 /* not found => default to 1:1 mapping */
8873 translate[ch] = ch;
8874 return 1;
8875 }
8876 else if (PyLong_Check(item)) {
8877 long replace = PyLong_AS_LONG(item);
8878 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8879 used it */
8880 if (127 < replace) {
8881 /* invalid character or character outside ASCII:
8882 skip the fast translate */
8883 goto exit;
8884 }
8885 translate[ch] = (Py_UCS1)replace;
8886 }
8887 else if (PyUnicode_Check(item)) {
8888 Py_UCS4 replace;
8889
8890 if (PyUnicode_GET_LENGTH(item) != 1)
8891 goto exit;
8892
8893 replace = PyUnicode_READ_CHAR(item, 0);
8894 if (replace > 127)
8895 goto exit;
8896 translate[ch] = (Py_UCS1)replace;
8897 }
8898 else {
8899 /* not None, NULL, long or unicode */
8900 goto exit;
8901 }
8902 ret = 1;
8903
8904 exit:
8905 Py_DECREF(item);
8906 return ret;
8907 }
8908
8909 /* Fast path for ascii => ascii translation. Return 1 if the whole string
8910 was translated into writer, return 0 if the input string was partially
8911 translated into writer, raise an exception and return -1 on error. */
8912 static int
unicode_fast_translate(PyObject * input,PyObject * mapping,_PyUnicodeWriter * writer,int ignore,Py_ssize_t * input_pos)8913 unicode_fast_translate(PyObject *input, PyObject *mapping,
8914 _PyUnicodeWriter *writer, int ignore,
8915 Py_ssize_t *input_pos)
8916 {
8917 Py_UCS1 ascii_table[128], ch, ch2;
8918 Py_ssize_t len;
8919 const Py_UCS1 *in, *end;
8920 Py_UCS1 *out;
8921 int res = 0;
8922
8923 len = PyUnicode_GET_LENGTH(input);
8924
8925 memset(ascii_table, 0xff, 128);
8926
8927 in = PyUnicode_1BYTE_DATA(input);
8928 end = in + len;
8929
8930 assert(PyUnicode_IS_ASCII(writer->buffer));
8931 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8932 out = PyUnicode_1BYTE_DATA(writer->buffer);
8933
8934 for (; in < end; in++) {
8935 ch = *in;
8936 ch2 = ascii_table[ch];
8937 if (ch2 == 0xff) {
8938 int translate = unicode_fast_translate_lookup(mapping, ch,
8939 ascii_table);
8940 if (translate < 0)
8941 return -1;
8942 if (translate == 0)
8943 goto exit;
8944 ch2 = ascii_table[ch];
8945 }
8946 if (ch2 == 0xfe) {
8947 if (ignore)
8948 continue;
8949 goto exit;
8950 }
8951 assert(ch2 < 128);
8952 *out = ch2;
8953 out++;
8954 }
8955 res = 1;
8956
8957 exit:
8958 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
8959 *input_pos = in - PyUnicode_1BYTE_DATA(input);
8960 return res;
8961 }
8962
/* Translate every character of input through mapping and return the result
   as a new str, or NULL with an exception set.

   mapping is queried via charmaptranslate_lookup(): a missing entry keeps
   the character, an int or str entry replaces it, and None marks it as
   untranslatable.  Runs of untranslatable characters are skipped when
   errors is "ignore"; otherwise they are passed to the error handler named
   by errors, whose replacement string is written and whose returned
   position is where translation resumes. */
static PyObject *
_PyUnicode_TranslateCharmap(PyObject *input,
                            PyObject *mapping,
                            const char *errors)
{
    /* input object */
    const void *data;
    Py_ssize_t size, i;
    int kind;
    /* output buffer */
    _PyUnicodeWriter writer;
    /* error handler: both objects are created lazily by
       unicode_translate_call_errorhandler() and released on exit */
    const char *reason = "character maps to <undefined>";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    int ignore;
    int res;

    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    data = PyUnicode_DATA(input);
    kind = PyUnicode_KIND(input);
    size = PyUnicode_GET_LENGTH(input);

    /* Nothing to translate: the mapping is never consulted. */
    if (size == 0)
        return PyUnicode_FromObject(input);

    /* allocate enough for a simple 1:1 translation without
       replacements, if we need more, we'll resize */
    _PyUnicodeWriter_Init(&writer);
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
        goto onError;

    ignore = (errors != NULL && strcmp(errors, "ignore") == 0);

    if (PyUnicode_IS_ASCII(input)) {
        /* ASCII input: try the table-driven fast path first.  When it
           stops early (res == 0) it stores in i the input position at which
           the generic loop below has to continue. */
        res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
        if (res < 0) {
            /* exc and errorHandler are still NULL here, so only the writer
               needs to be released. */
            _PyUnicodeWriter_Dealloc(&writer);
            return NULL;
        }
        if (res == 1)
            return _PyUnicodeWriter_Finish(&writer);
    }
    else {
        i = 0;
    }

    while (i<size) {
        /* try to translate it */
        int translate;
        PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
        Py_ssize_t newpos;
        /* startpos for collecting untranslatable chars */
        Py_ssize_t collstart;
        Py_ssize_t collend;
        Py_UCS4 ch;

        ch = PyUnicode_READ(kind, data, i);
        translate = charmaptranslate_output(ch, mapping, &writer);
        if (translate < 0)
            goto onError;

        if (translate != 0) {
            /* it worked => adjust input pointer */
            ++i;
            continue;
        }

        /* untranslatable character */
        collstart = i;
        collend = i+1;

        /* find all untranslatable characters: extend [collstart, collend)
           while the mapping keeps returning None */
        while (collend < size) {
            PyObject *x;
            ch = PyUnicode_READ(kind, data, collend);
            if (charmaptranslate_lookup(ch, mapping, &x))
                goto onError;
            /* x is only compared by identity below, never dereferenced,
               so releasing the reference first is fine. */
            Py_XDECREF(x);
            if (x != Py_None)
                break;
            ++collend;
        }

        if (ignore) {
            /* errors="ignore": drop the whole run */
            i = collend;
        }
        else {
            repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
                                                             reason, input, &exc,
                                                             collstart, collend, &newpos);
            if (repunicode == NULL)
                goto onError;
            if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
                Py_DECREF(repunicode);
                goto onError;
            }
            Py_DECREF(repunicode);
            /* resume wherever the error handler told us to */
            i = newpos;
        }
    }
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
9078
9079 PyObject *
PyUnicode_Translate(PyObject * str,PyObject * mapping,const char * errors)9080 PyUnicode_Translate(PyObject *str,
9081 PyObject *mapping,
9082 const char *errors)
9083 {
9084 if (ensure_unicode(str) < 0)
9085 return NULL;
9086 return _PyUnicode_TranslateCharmap(str, mapping, errors);
9087 }
9088
9089 PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject * unicode)9090 _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9091 {
9092 if (!PyUnicode_Check(unicode)) {
9093 PyErr_BadInternalCall();
9094 return NULL;
9095 }
9096 if (PyUnicode_IS_ASCII(unicode)) {
9097 /* If the string is already ASCII, just return the same string */
9098 return Py_NewRef(unicode);
9099 }
9100
9101 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9102 PyObject *result = PyUnicode_New(len, 127);
9103 if (result == NULL) {
9104 return NULL;
9105 }
9106
9107 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9108 int kind = PyUnicode_KIND(unicode);
9109 const void *data = PyUnicode_DATA(unicode);
9110 Py_ssize_t i;
9111 for (i = 0; i < len; ++i) {
9112 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9113 if (ch < 127) {
9114 out[i] = ch;
9115 }
9116 else if (Py_UNICODE_ISSPACE(ch)) {
9117 out[i] = ' ';
9118 }
9119 else {
9120 int decimal = Py_UNICODE_TODECIMAL(ch);
9121 if (decimal < 0) {
9122 out[i] = '?';
9123 out[i+1] = '\0';
9124 _PyUnicode_LENGTH(result) = i + 1;
9125 break;
9126 }
9127 out[i] = '0' + decimal;
9128 }
9129 }
9130
9131 assert(_PyUnicode_CheckConsistency(result, 1));
9132 return result;
9133 }
9134
9135 /* --- Helpers ------------------------------------------------------------ */
9136
/* Helper macro to fix up start/end slice values against a length.

   `end` is clamped to `len` from above; a negative `end` or `start` is
   interpreted relative to `len` and then clamped to 0.  `start` is not
   clamped from above.

   `start` and `end` must be modifiable lvalues.  Every argument is
   evaluated more than once, so none of them may have side effects.

   The body is wrapped in do { } while (0) so that the macro expands to a
   single statement and can safely be used as the body of an unbraced
   if/else; the invocation must be followed by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9151
9152 static Py_ssize_t
any_find_slice(PyObject * s1,PyObject * s2,Py_ssize_t start,Py_ssize_t end,int direction)9153 any_find_slice(PyObject* s1, PyObject* s2,
9154 Py_ssize_t start,
9155 Py_ssize_t end,
9156 int direction)
9157 {
9158 int kind1, kind2;
9159 const void *buf1, *buf2;
9160 Py_ssize_t len1, len2, result;
9161
9162 kind1 = PyUnicode_KIND(s1);
9163 kind2 = PyUnicode_KIND(s2);
9164 if (kind1 < kind2)
9165 return -1;
9166
9167 len1 = PyUnicode_GET_LENGTH(s1);
9168 len2 = PyUnicode_GET_LENGTH(s2);
9169 ADJUST_INDICES(start, end, len1);
9170 if (end - start < len2)
9171 return -1;
9172
9173 buf1 = PyUnicode_DATA(s1);
9174 buf2 = PyUnicode_DATA(s2);
9175 if (len2 == 1) {
9176 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9177 result = findchar((const char *)buf1 + kind1*start,
9178 kind1, end - start, ch, direction);
9179 if (result == -1)
9180 return -1;
9181 else
9182 return start + result;
9183 }
9184
9185 if (kind2 != kind1) {
9186 buf2 = unicode_askind(kind2, buf2, len2, kind1);
9187 if (!buf2)
9188 return -2;
9189 }
9190
9191 if (direction > 0) {
9192 switch (kind1) {
9193 case PyUnicode_1BYTE_KIND:
9194 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9195 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9196 else
9197 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9198 break;
9199 case PyUnicode_2BYTE_KIND:
9200 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9201 break;
9202 case PyUnicode_4BYTE_KIND:
9203 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9204 break;
9205 default:
9206 Py_UNREACHABLE();
9207 }
9208 }
9209 else {
9210 switch (kind1) {
9211 case PyUnicode_1BYTE_KIND:
9212 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9213 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9214 else
9215 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9216 break;
9217 case PyUnicode_2BYTE_KIND:
9218 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9219 break;
9220 case PyUnicode_4BYTE_KIND:
9221 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9222 break;
9223 default:
9224 Py_UNREACHABLE();
9225 }
9226 }
9227
9228 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
9229 if (kind2 != kind1)
9230 PyMem_Free((void *)buf2);
9231
9232 return result;
9233 }
9234
9235 /* _PyUnicode_InsertThousandsGrouping() helper functions */
9236 #include "stringlib/localeutil.h"
9237
/**
 * InsertThousandsGrouping:
 * @writer: Unicode writer, or NULL to only count (see below).
 * @n_buffer: Offset used as the initial output position: relative to
 *            @writer->pos in filling mode, absolute in counting mode.
 * @digits: Digits we're reading from. Must be NULL in counting mode and
 *          non-NULL in filling mode.
 * @d_pos: Start of digits string.
 * @n_digits: The number of digits in the string, in which we want
 *            to put the grouping chars.
 * @min_width: The minimum width of the digits in the output string.
 *             Output will be zero-padded on the left to fill.
 *             Negative values are treated as 0.
 * @grouping: see definition in localeconv().
 * @thousands_sep: see definition in localeconv().
 * @maxchar: Must be non-NULL in counting mode and NULL in filling mode.
 *           In counting mode it is initialised to 127 and then handed to
 *           InsertThousandsGrouping_fill() (presumably so the helper can
 *           raise it to cover the separator -- see stringlib/localeutil.h).
 *
 * There are 2 modes: counting and filling. If @writer is NULL,
 *  we are in counting mode, else filling mode.
 * If counting, the required buffer size is returned.
 * If filling, we know the buffer will be large enough, so we don't
 *  need to pass in the buffer size.
 * Inserts thousand grouping characters (as defined by grouping and
 *  thousands_sep) into @writer.
 *
 * Return value: the total number of characters (digits, zero padding and
 *  separators) in the grouped output.
 **/
Py_ssize_t
_PyUnicode_InsertThousandsGrouping(
    _PyUnicodeWriter *writer,
    Py_ssize_t n_buffer,
    PyObject *digits,
    Py_ssize_t d_pos,
    Py_ssize_t n_digits,
    Py_ssize_t min_width,
    const char *grouping,
    PyObject *thousands_sep,
    Py_UCS4 *maxchar)
{
    min_width = Py_MAX(0, min_width);
    /* The two modes are mutually exclusive: filling needs digits and no
       maxchar, counting needs maxchar and no digits. */
    if (writer) {
        assert(digits != NULL);
        assert(maxchar == NULL);
    }
    else {
        assert(digits == NULL);
        assert(maxchar != NULL);
    }
    assert(0 <= d_pos);
    assert(0 <= n_digits);
    assert(grouping != NULL);

    Py_ssize_t count = 0;
    Py_ssize_t n_zeros;
    int loop_broken = 0;
    int use_separator = 0; /* First time through, don't append the
                              separator. They only go between
                              groups. */
    Py_ssize_t buffer_pos;
    Py_ssize_t digits_pos;
    Py_ssize_t len;
    Py_ssize_t n_chars;
    Py_ssize_t remaining = n_digits; /* Number of chars remaining to
                                        be looked at */
    /* A generator that returns all of the grouping widths, until it
       returns 0. */
    GroupGenerator groupgen;
    GroupGenerator_init(&groupgen, grouping);
    const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);

    /* if digits are not grouped, thousands separator
       should be an empty string */
    assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));

    /* Both positions start one past the end of their range; the fill
       helper receives them by address (NOTE(review): this suggests it
       works from right to left -- confirm in stringlib/localeutil.h). */
    digits_pos = d_pos + n_digits;
    if (writer) {
        buffer_pos = writer->pos + n_buffer;
        assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
        assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
    }
    else {
        buffer_pos = n_buffer;
    }

    if (!writer) {
        *maxchar = 127;
    }

    while ((len = GroupGenerator_next(&groupgen)) > 0) {
        /* Never make the group wider than what is still needed, but always
           at least one character wide. */
        len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
        n_zeros = Py_MAX(0, len - remaining);
        n_chars = Py_MAX(0, Py_MIN(remaining, len));

        /* Use n_zero zero's and n_chars chars */

        /* Account for this group (and the separator preceding it). */
        count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;

        /* Copy into the writer. */
        InsertThousandsGrouping_fill(writer, &buffer_pos,
                                     digits, &digits_pos,
                                     n_chars, n_zeros,
                                     use_separator ? thousands_sep : NULL,
                                     thousands_sep_len, maxchar);

        /* Use a separator next time. */
        use_separator = 1;

        remaining -= n_chars;
        min_width -= len;

        if (remaining <= 0 && min_width <= 0) {
            loop_broken = 1;
            break;
        }
        /* The separator of the next group also counts towards min_width. */
        min_width -= thousands_sep_len;
    }
    if (!loop_broken) {
        /* We left the loop without using a break statement: the generator
           ran out of group widths, so everything that is left goes into
           one final group. */

        len = Py_MAX(Py_MAX(remaining, min_width), 1);
        n_zeros = Py_MAX(0, len - remaining);
        n_chars = Py_MAX(0, Py_MIN(remaining, len));

        /* Use n_zero zero's and n_chars chars */
        count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;

        /* Copy into the writer. */
        InsertThousandsGrouping_fill(writer, &buffer_pos,
                                     digits, &digits_pos,
                                     n_chars, n_zeros,
                                     use_separator ? thousands_sep : NULL,
                                     thousands_sep_len, maxchar);
    }
    return count;
}
9370
9371 Py_ssize_t
PyUnicode_Count(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end)9372 PyUnicode_Count(PyObject *str,
9373 PyObject *substr,
9374 Py_ssize_t start,
9375 Py_ssize_t end)
9376 {
9377 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9378 return -1;
9379
9380 return unicode_count_impl(str, substr, start, end);
9381 }
9382
9383 Py_ssize_t
PyUnicode_Find(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9384 PyUnicode_Find(PyObject *str,
9385 PyObject *substr,
9386 Py_ssize_t start,
9387 Py_ssize_t end,
9388 int direction)
9389 {
9390 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9391 return -2;
9392
9393 return any_find_slice(str, substr, start, end, direction);
9394 }
9395
9396 Py_ssize_t
PyUnicode_FindChar(PyObject * str,Py_UCS4 ch,Py_ssize_t start,Py_ssize_t end,int direction)9397 PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9398 Py_ssize_t start, Py_ssize_t end,
9399 int direction)
9400 {
9401 int kind;
9402 Py_ssize_t len, result;
9403 len = PyUnicode_GET_LENGTH(str);
9404 ADJUST_INDICES(start, end, len);
9405 if (end - start < 1)
9406 return -1;
9407 kind = PyUnicode_KIND(str);
9408 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9409 kind, end-start, ch, direction);
9410 if (result == -1)
9411 return -1;
9412 else
9413 return start + result;
9414 }
9415
9416 static int
tailmatch(PyObject * self,PyObject * substring,Py_ssize_t start,Py_ssize_t end,int direction)9417 tailmatch(PyObject *self,
9418 PyObject *substring,
9419 Py_ssize_t start,
9420 Py_ssize_t end,
9421 int direction)
9422 {
9423 int kind_self;
9424 int kind_sub;
9425 const void *data_self;
9426 const void *data_sub;
9427 Py_ssize_t offset;
9428 Py_ssize_t i;
9429 Py_ssize_t end_sub;
9430
9431 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9432 end -= PyUnicode_GET_LENGTH(substring);
9433 if (end < start)
9434 return 0;
9435
9436 if (PyUnicode_GET_LENGTH(substring) == 0)
9437 return 1;
9438
9439 kind_self = PyUnicode_KIND(self);
9440 data_self = PyUnicode_DATA(self);
9441 kind_sub = PyUnicode_KIND(substring);
9442 data_sub = PyUnicode_DATA(substring);
9443 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9444
9445 if (direction > 0)
9446 offset = end;
9447 else
9448 offset = start;
9449
9450 if (PyUnicode_READ(kind_self, data_self, offset) ==
9451 PyUnicode_READ(kind_sub, data_sub, 0) &&
9452 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9453 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9454 /* If both are of the same kind, memcmp is sufficient */
9455 if (kind_self == kind_sub) {
9456 return ! memcmp((char *)data_self +
9457 (offset * PyUnicode_KIND(substring)),
9458 data_sub,
9459 PyUnicode_GET_LENGTH(substring) *
9460 PyUnicode_KIND(substring));
9461 }
9462 /* otherwise we have to compare each character by first accessing it */
9463 else {
9464 /* We do not need to compare 0 and len(substring)-1 because
9465 the if statement above ensured already that they are equal
9466 when we end up here. */
9467 for (i = 1; i < end_sub; ++i) {
9468 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9469 PyUnicode_READ(kind_sub, data_sub, i))
9470 return 0;
9471 }
9472 return 1;
9473 }
9474 }
9475
9476 return 0;
9477 }
9478
9479 Py_ssize_t
PyUnicode_Tailmatch(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9480 PyUnicode_Tailmatch(PyObject *str,
9481 PyObject *substr,
9482 Py_ssize_t start,
9483 Py_ssize_t end,
9484 int direction)
9485 {
9486 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9487 return -1;
9488
9489 return tailmatch(str, substr, start, end, direction);
9490 }
9491
9492 static PyObject *
ascii_upper_or_lower(PyObject * self,int lower)9493 ascii_upper_or_lower(PyObject *self, int lower)
9494 {
9495 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9496 const char *data = PyUnicode_DATA(self);
9497 char *resdata;
9498 PyObject *res;
9499
9500 res = PyUnicode_New(len, 127);
9501 if (res == NULL)
9502 return NULL;
9503 resdata = PyUnicode_DATA(res);
9504 if (lower)
9505 _Py_bytes_lower(resdata, data, len);
9506 else
9507 _Py_bytes_upper(resdata, data, len);
9508 return res;
9509 }
9510
9511 static Py_UCS4
handle_capital_sigma(int kind,const void * data,Py_ssize_t length,Py_ssize_t i)9512 handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
9513 {
9514 Py_ssize_t j;
9515 int final_sigma;
9516 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
9517 /* U+03A3 is in the Final_Sigma context when, it is found like this:
9518
9519 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9520
9521 where ! is a negation and \p{xxx} is a character with property xxx.
9522 */
9523 for (j = i - 1; j >= 0; j--) {
9524 c = PyUnicode_READ(kind, data, j);
9525 if (!_PyUnicode_IsCaseIgnorable(c))
9526 break;
9527 }
9528 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9529 if (final_sigma) {
9530 for (j = i + 1; j < length; j++) {
9531 c = PyUnicode_READ(kind, data, j);
9532 if (!_PyUnicode_IsCaseIgnorable(c))
9533 break;
9534 }
9535 final_sigma = j == length || !_PyUnicode_IsCased(c);
9536 }
9537 return (final_sigma) ? 0x3C2 : 0x3C3;
9538 }
9539
9540 static int
lower_ucs4(int kind,const void * data,Py_ssize_t length,Py_ssize_t i,Py_UCS4 c,Py_UCS4 * mapped)9541 lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
9542 Py_UCS4 c, Py_UCS4 *mapped)
9543 {
9544 /* Obscure special case. */
9545 if (c == 0x3A3) {
9546 mapped[0] = handle_capital_sigma(kind, data, length, i);
9547 return 1;
9548 }
9549 return _PyUnicode_ToLowerFull(c, mapped);
9550 }
9551
9552 static Py_ssize_t
do_capitalize(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9553 do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9554 {
9555 Py_ssize_t i, k = 0;
9556 int n_res, j;
9557 Py_UCS4 c, mapped[3];
9558
9559 c = PyUnicode_READ(kind, data, 0);
9560 n_res = _PyUnicode_ToTitleFull(c, mapped);
9561 for (j = 0; j < n_res; j++) {
9562 *maxchar = Py_MAX(*maxchar, mapped[j]);
9563 res[k++] = mapped[j];
9564 }
9565 for (i = 1; i < length; i++) {
9566 c = PyUnicode_READ(kind, data, i);
9567 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9568 for (j = 0; j < n_res; j++) {
9569 *maxchar = Py_MAX(*maxchar, mapped[j]);
9570 res[k++] = mapped[j];
9571 }
9572 }
9573 return k;
9574 }
9575
9576 static Py_ssize_t
do_swapcase(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9577 do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9578 Py_ssize_t i, k = 0;
9579
9580 for (i = 0; i < length; i++) {
9581 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9582 int n_res, j;
9583 if (Py_UNICODE_ISUPPER(c)) {
9584 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9585 }
9586 else if (Py_UNICODE_ISLOWER(c)) {
9587 n_res = _PyUnicode_ToUpperFull(c, mapped);
9588 }
9589 else {
9590 n_res = 1;
9591 mapped[0] = c;
9592 }
9593 for (j = 0; j < n_res; j++) {
9594 *maxchar = Py_MAX(*maxchar, mapped[j]);
9595 res[k++] = mapped[j];
9596 }
9597 }
9598 return k;
9599 }
9600
9601 static Py_ssize_t
do_upper_or_lower(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar,int lower)9602 do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
9603 Py_UCS4 *maxchar, int lower)
9604 {
9605 Py_ssize_t i, k = 0;
9606
9607 for (i = 0; i < length; i++) {
9608 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9609 int n_res, j;
9610 if (lower)
9611 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9612 else
9613 n_res = _PyUnicode_ToUpperFull(c, mapped);
9614 for (j = 0; j < n_res; j++) {
9615 *maxchar = Py_MAX(*maxchar, mapped[j]);
9616 res[k++] = mapped[j];
9617 }
9618 }
9619 return k;
9620 }
9621
9622 static Py_ssize_t
do_upper(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9623 do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9624 {
9625 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9626 }
9627
9628 static Py_ssize_t
do_lower(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9629 do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9630 {
9631 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9632 }
9633
9634 static Py_ssize_t
do_casefold(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9635 do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9636 {
9637 Py_ssize_t i, k = 0;
9638
9639 for (i = 0; i < length; i++) {
9640 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9641 Py_UCS4 mapped[3];
9642 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9643 for (j = 0; j < n_res; j++) {
9644 *maxchar = Py_MAX(*maxchar, mapped[j]);
9645 res[k++] = mapped[j];
9646 }
9647 }
9648 return k;
9649 }
9650
9651 static Py_ssize_t
do_title(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9652 do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9653 {
9654 Py_ssize_t i, k = 0;
9655 int previous_is_cased;
9656
9657 previous_is_cased = 0;
9658 for (i = 0; i < length; i++) {
9659 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9660 Py_UCS4 mapped[3];
9661 int n_res, j;
9662
9663 if (previous_is_cased)
9664 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9665 else
9666 n_res = _PyUnicode_ToTitleFull(c, mapped);
9667
9668 for (j = 0; j < n_res; j++) {
9669 *maxchar = Py_MAX(*maxchar, mapped[j]);
9670 res[k++] = mapped[j];
9671 }
9672
9673 previous_is_cased = _PyUnicode_IsCased(c);
9674 }
9675 return k;
9676 }
9677
9678 static PyObject *
case_operation(PyObject * self,Py_ssize_t (* perform)(int,const void *,Py_ssize_t,Py_UCS4 *,Py_UCS4 *))9679 case_operation(PyObject *self,
9680 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9681 {
9682 PyObject *res = NULL;
9683 Py_ssize_t length, newlength = 0;
9684 int kind, outkind;
9685 const void *data;
9686 void *outdata;
9687 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9688
9689 kind = PyUnicode_KIND(self);
9690 data = PyUnicode_DATA(self);
9691 length = PyUnicode_GET_LENGTH(self);
9692 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
9693 PyErr_SetString(PyExc_OverflowError, "string is too long");
9694 return NULL;
9695 }
9696 tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
9697 if (tmp == NULL)
9698 return PyErr_NoMemory();
9699 newlength = perform(kind, data, length, tmp, &maxchar);
9700 res = PyUnicode_New(newlength, maxchar);
9701 if (res == NULL)
9702 goto leave;
9703 tmpend = tmp + newlength;
9704 outdata = PyUnicode_DATA(res);
9705 outkind = PyUnicode_KIND(res);
9706 switch (outkind) {
9707 case PyUnicode_1BYTE_KIND:
9708 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9709 break;
9710 case PyUnicode_2BYTE_KIND:
9711 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9712 break;
9713 case PyUnicode_4BYTE_KIND:
9714 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9715 break;
9716 default:
9717 Py_UNREACHABLE();
9718 }
9719 leave:
9720 PyMem_Free(tmp);
9721 return res;
9722 }
9723
9724 PyObject *
PyUnicode_Join(PyObject * separator,PyObject * seq)9725 PyUnicode_Join(PyObject *separator, PyObject *seq)
9726 {
9727 PyObject *res;
9728 PyObject *fseq;
9729 Py_ssize_t seqlen;
9730 PyObject **items;
9731
9732 fseq = PySequence_Fast(seq, "can only join an iterable");
9733 if (fseq == NULL) {
9734 return NULL;
9735 }
9736
9737 Py_BEGIN_CRITICAL_SECTION_SEQUENCE_FAST(seq);
9738
9739 items = PySequence_Fast_ITEMS(fseq);
9740 seqlen = PySequence_Fast_GET_SIZE(fseq);
9741 res = _PyUnicode_JoinArray(separator, items, seqlen);
9742
9743 Py_END_CRITICAL_SECTION_SEQUENCE_FAST();
9744
9745 Py_DECREF(fseq);
9746 return res;
9747 }
9748
/* Join the seqlen objects in items with separator and return the result
   as a new reference, or NULL with an exception set.

   - An empty array yields the empty string.
   - A single item that is exactly a str is returned as-is; separator is
     not looked at in that case.
   - A NULL separator stands for a single space.
   - TypeError is raised if separator or any item is not a str, and
     OverflowError if the total length does not fit in Py_ssize_t.

   The work is done in two passes: the first validates the items and
   computes the total size and maximum character, the second copies the
   pieces into the freshly allocated result. */
PyObject *
_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
{
    PyObject *res = NULL; /* the result */
    PyObject *sep = NULL;
    Py_ssize_t seplen;
    PyObject *item;
    Py_ssize_t sz, i, res_offset;
    Py_UCS4 maxchar;
    Py_UCS4 item_maxchar;
    int use_memcpy;
    unsigned char *res_data = NULL, *sep_data = NULL;
    PyObject *last_obj;
    int kind = 0;

    /* If empty sequence, return u"". */
    if (seqlen == 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* If singleton sequence with an exact Unicode, return that. */
    last_obj = NULL;
    if (seqlen == 1) {
        if (PyUnicode_CheckExact(items[0])) {
            res = items[0];
            return Py_NewRef(res);
        }
        /* A single str-subclass item: no separator is needed, so sep stays
           NULL and the item is copied into a new exact str below. */
        seplen = 0;
        maxchar = 0;
    }
    else {
        /* Set up sep and seplen */
        if (separator == NULL) {
            /* fall back to a blank space separator */
            sep = PyUnicode_FromOrdinal(' ');
            if (!sep)
                goto onError;
            seplen = 1;
            maxchar = 32;
        }
        else {
            if (!PyUnicode_Check(separator)) {
                PyErr_Format(PyExc_TypeError,
                             "separator: expected str instance,"
                             " %.80s found",
                             Py_TYPE(separator)->tp_name);
                goto onError;
            }
            sep = separator;
            seplen = PyUnicode_GET_LENGTH(separator);
            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
            /* inc refcount to keep this code path symmetric with the
               above case of a blank separator */
            Py_INCREF(sep);
        }
        /* The separator takes part in the "same kind" check below. */
        last_obj = sep;
    }

    /* There are at least two things to join, or else we have a subclass
     * of str in the sequence.
     * Do a pre-pass to figure out the total amount of space we'll
     * need (sz), and see whether all argument are strings.
     */
    sz = 0;
    /* Debug builds never take the memcpy path, so they always copy with
       _PyUnicode_FastCopyCharacters(). */
#ifdef Py_DEBUG
    use_memcpy = 0;
#else
    use_memcpy = 1;
#endif
    for (i = 0; i < seqlen; i++) {
        size_t add_sz;
        item = items[i];
        if (!PyUnicode_Check(item)) {
            PyErr_Format(PyExc_TypeError,
                         "sequence item %zd: expected str instance,"
                         " %.80s found",
                         i, Py_TYPE(item)->tp_name);
            goto onError;
        }
        add_sz = PyUnicode_GET_LENGTH(item);
        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
        maxchar = Py_MAX(maxchar, item_maxchar);
        /* every item but the first is preceded by the separator */
        if (i != 0) {
            add_sz += seplen;
        }
        /* overflow check is done before the addition */
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
            PyErr_SetString(PyExc_OverflowError,
                            "join() result is too long for a Python string");
            goto onError;
        }
        sz += add_sz;
        /* memcpy stays enabled only while every piece has the same kind as
           the piece before it (starting with the separator, if any). */
        if (use_memcpy && last_obj != NULL) {
            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
                use_memcpy = 0;
        }
        last_obj = item;
    }

    res = PyUnicode_New(sz, maxchar);
    if (res == NULL)
        goto onError;

    /* Catenate everything. */
#ifdef Py_DEBUG
    use_memcpy = 0;
#else
    if (use_memcpy) {
        res_data = PyUnicode_1BYTE_DATA(res);
        kind = PyUnicode_KIND(res);
        if (seplen != 0)
            sep_data = PyUnicode_1BYTE_DATA(sep);
    }
#endif
    if (use_memcpy) {
        /* Raw copy: every piece is kind * length bytes and res_data
           advances through the result buffer. */
        for (i = 0; i < seqlen; ++i) {
            Py_ssize_t itemlen;
            item = items[i];

            /* Copy item, and maybe the separator. */
            if (i && seplen != 0) {
                memcpy(res_data,
                       sep_data,
                       kind * seplen);
                res_data += kind * seplen;
            }

            itemlen = PyUnicode_GET_LENGTH(item);
            if (itemlen != 0) {
                memcpy(res_data,
                       PyUnicode_DATA(item),
                       kind * itemlen);
                res_data += kind * itemlen;
            }
        }
        assert(res_data == PyUnicode_1BYTE_DATA(res)
                           + kind * PyUnicode_GET_LENGTH(res));
    }
    else {
        /* Character-wise copy; res_offset counts characters, not bytes. */
        for (i = 0, res_offset = 0; i < seqlen; ++i) {
            Py_ssize_t itemlen;
            item = items[i];

            /* Copy item, and maybe the separator. */
            if (i && seplen != 0) {
                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
                res_offset += seplen;
            }

            itemlen = PyUnicode_GET_LENGTH(item);
            if (itemlen != 0) {
                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
                res_offset += itemlen;
            }
        }
        assert(res_offset == PyUnicode_GET_LENGTH(res));
    }

    Py_XDECREF(sep);
    assert(_PyUnicode_CheckConsistency(res, 1));
    return res;

  onError:
    Py_XDECREF(sep);
    Py_XDECREF(res);
    return NULL;
}
9915
9916 void
_PyUnicode_FastFill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)9917 _PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9918 Py_UCS4 fill_char)
9919 {
9920 const int kind = PyUnicode_KIND(unicode);
9921 void *data = PyUnicode_DATA(unicode);
9922 assert(unicode_modifiable(unicode));
9923 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9924 assert(start >= 0);
9925 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9926 unicode_fill(kind, data, fill_char, start, length);
9927 }
9928
9929 Py_ssize_t
PyUnicode_Fill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)9930 PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9931 Py_UCS4 fill_char)
9932 {
9933 Py_ssize_t maxlen;
9934
9935 if (!PyUnicode_Check(unicode)) {
9936 PyErr_BadInternalCall();
9937 return -1;
9938 }
9939 if (unicode_check_modifiable(unicode))
9940 return -1;
9941
9942 if (start < 0) {
9943 PyErr_SetString(PyExc_IndexError, "string index out of range");
9944 return -1;
9945 }
9946 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9947 PyErr_SetString(PyExc_ValueError,
9948 "fill character is bigger than "
9949 "the string maximum character");
9950 return -1;
9951 }
9952
9953 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9954 length = Py_MIN(maxlen, length);
9955 if (length <= 0)
9956 return 0;
9957
9958 _PyUnicode_FastFill(unicode, start, length, fill_char);
9959 return length;
9960 }
9961
9962 static PyObject *
pad(PyObject * self,Py_ssize_t left,Py_ssize_t right,Py_UCS4 fill)9963 pad(PyObject *self,
9964 Py_ssize_t left,
9965 Py_ssize_t right,
9966 Py_UCS4 fill)
9967 {
9968 PyObject *u;
9969 Py_UCS4 maxchar;
9970 int kind;
9971 void *data;
9972
9973 if (left < 0)
9974 left = 0;
9975 if (right < 0)
9976 right = 0;
9977
9978 if (left == 0 && right == 0)
9979 return unicode_result_unchanged(self);
9980
9981 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9982 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
9983 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9984 return NULL;
9985 }
9986 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9987 maxchar = Py_MAX(maxchar, fill);
9988 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
9989 if (!u)
9990 return NULL;
9991
9992 kind = PyUnicode_KIND(u);
9993 data = PyUnicode_DATA(u);
9994 if (left)
9995 unicode_fill(kind, data, fill, 0, left);
9996 if (right)
9997 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
9998 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
9999 assert(_PyUnicode_CheckConsistency(u, 1));
10000 return u;
10001 }
10002
10003 PyObject *
PyUnicode_Splitlines(PyObject * string,int keepends)10004 PyUnicode_Splitlines(PyObject *string, int keepends)
10005 {
10006 PyObject *list;
10007
10008 if (ensure_unicode(string) < 0)
10009 return NULL;
10010
10011 switch (PyUnicode_KIND(string)) {
10012 case PyUnicode_1BYTE_KIND:
10013 if (PyUnicode_IS_ASCII(string))
10014 list = asciilib_splitlines(
10015 string, PyUnicode_1BYTE_DATA(string),
10016 PyUnicode_GET_LENGTH(string), keepends);
10017 else
10018 list = ucs1lib_splitlines(
10019 string, PyUnicode_1BYTE_DATA(string),
10020 PyUnicode_GET_LENGTH(string), keepends);
10021 break;
10022 case PyUnicode_2BYTE_KIND:
10023 list = ucs2lib_splitlines(
10024 string, PyUnicode_2BYTE_DATA(string),
10025 PyUnicode_GET_LENGTH(string), keepends);
10026 break;
10027 case PyUnicode_4BYTE_KIND:
10028 list = ucs4lib_splitlines(
10029 string, PyUnicode_4BYTE_DATA(string),
10030 PyUnicode_GET_LENGTH(string), keepends);
10031 break;
10032 default:
10033 Py_UNREACHABLE();
10034 }
10035 return list;
10036 }
10037
10038 static PyObject *
split(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10039 split(PyObject *self,
10040 PyObject *substring,
10041 Py_ssize_t maxcount)
10042 {
10043 int kind1, kind2;
10044 const void *buf1, *buf2;
10045 Py_ssize_t len1, len2;
10046 PyObject* out;
10047 len1 = PyUnicode_GET_LENGTH(self);
10048 kind1 = PyUnicode_KIND(self);
10049
10050 if (substring == NULL) {
10051 if (maxcount < 0) {
10052 maxcount = (len1 - 1) / 2 + 1;
10053 }
10054 switch (kind1) {
10055 case PyUnicode_1BYTE_KIND:
10056 if (PyUnicode_IS_ASCII(self))
10057 return asciilib_split_whitespace(
10058 self, PyUnicode_1BYTE_DATA(self),
10059 len1, maxcount
10060 );
10061 else
10062 return ucs1lib_split_whitespace(
10063 self, PyUnicode_1BYTE_DATA(self),
10064 len1, maxcount
10065 );
10066 case PyUnicode_2BYTE_KIND:
10067 return ucs2lib_split_whitespace(
10068 self, PyUnicode_2BYTE_DATA(self),
10069 len1, maxcount
10070 );
10071 case PyUnicode_4BYTE_KIND:
10072 return ucs4lib_split_whitespace(
10073 self, PyUnicode_4BYTE_DATA(self),
10074 len1, maxcount
10075 );
10076 default:
10077 Py_UNREACHABLE();
10078 }
10079 }
10080
10081 kind2 = PyUnicode_KIND(substring);
10082 len2 = PyUnicode_GET_LENGTH(substring);
10083 if (maxcount < 0) {
10084 // if len2 == 0, it will raise ValueError.
10085 maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10086 // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10087 maxcount = maxcount < 0 ? len1 : maxcount;
10088 }
10089 if (kind1 < kind2 || len1 < len2) {
10090 out = PyList_New(1);
10091 if (out == NULL)
10092 return NULL;
10093 PyList_SET_ITEM(out, 0, Py_NewRef(self));
10094 return out;
10095 }
10096 buf1 = PyUnicode_DATA(self);
10097 buf2 = PyUnicode_DATA(substring);
10098 if (kind2 != kind1) {
10099 buf2 = unicode_askind(kind2, buf2, len2, kind1);
10100 if (!buf2)
10101 return NULL;
10102 }
10103
10104 switch (kind1) {
10105 case PyUnicode_1BYTE_KIND:
10106 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10107 out = asciilib_split(
10108 self, buf1, len1, buf2, len2, maxcount);
10109 else
10110 out = ucs1lib_split(
10111 self, buf1, len1, buf2, len2, maxcount);
10112 break;
10113 case PyUnicode_2BYTE_KIND:
10114 out = ucs2lib_split(
10115 self, buf1, len1, buf2, len2, maxcount);
10116 break;
10117 case PyUnicode_4BYTE_KIND:
10118 out = ucs4lib_split(
10119 self, buf1, len1, buf2, len2, maxcount);
10120 break;
10121 default:
10122 out = NULL;
10123 }
10124 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10125 if (kind2 != kind1)
10126 PyMem_Free((void *)buf2);
10127 return out;
10128 }
10129
10130 static PyObject *
rsplit(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10131 rsplit(PyObject *self,
10132 PyObject *substring,
10133 Py_ssize_t maxcount)
10134 {
10135 int kind1, kind2;
10136 const void *buf1, *buf2;
10137 Py_ssize_t len1, len2;
10138 PyObject* out;
10139
10140 len1 = PyUnicode_GET_LENGTH(self);
10141 kind1 = PyUnicode_KIND(self);
10142
10143 if (substring == NULL) {
10144 if (maxcount < 0) {
10145 maxcount = (len1 - 1) / 2 + 1;
10146 }
10147 switch (kind1) {
10148 case PyUnicode_1BYTE_KIND:
10149 if (PyUnicode_IS_ASCII(self))
10150 return asciilib_rsplit_whitespace(
10151 self, PyUnicode_1BYTE_DATA(self),
10152 len1, maxcount
10153 );
10154 else
10155 return ucs1lib_rsplit_whitespace(
10156 self, PyUnicode_1BYTE_DATA(self),
10157 len1, maxcount
10158 );
10159 case PyUnicode_2BYTE_KIND:
10160 return ucs2lib_rsplit_whitespace(
10161 self, PyUnicode_2BYTE_DATA(self),
10162 len1, maxcount
10163 );
10164 case PyUnicode_4BYTE_KIND:
10165 return ucs4lib_rsplit_whitespace(
10166 self, PyUnicode_4BYTE_DATA(self),
10167 len1, maxcount
10168 );
10169 default:
10170 Py_UNREACHABLE();
10171 }
10172 }
10173 kind2 = PyUnicode_KIND(substring);
10174 len2 = PyUnicode_GET_LENGTH(substring);
10175 if (maxcount < 0) {
10176 // if len2 == 0, it will raise ValueError.
10177 maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10178 // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10179 maxcount = maxcount < 0 ? len1 : maxcount;
10180 }
10181 if (kind1 < kind2 || len1 < len2) {
10182 out = PyList_New(1);
10183 if (out == NULL)
10184 return NULL;
10185 PyList_SET_ITEM(out, 0, Py_NewRef(self));
10186 return out;
10187 }
10188 buf1 = PyUnicode_DATA(self);
10189 buf2 = PyUnicode_DATA(substring);
10190 if (kind2 != kind1) {
10191 buf2 = unicode_askind(kind2, buf2, len2, kind1);
10192 if (!buf2)
10193 return NULL;
10194 }
10195
10196 switch (kind1) {
10197 case PyUnicode_1BYTE_KIND:
10198 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10199 out = asciilib_rsplit(
10200 self, buf1, len1, buf2, len2, maxcount);
10201 else
10202 out = ucs1lib_rsplit(
10203 self, buf1, len1, buf2, len2, maxcount);
10204 break;
10205 case PyUnicode_2BYTE_KIND:
10206 out = ucs2lib_rsplit(
10207 self, buf1, len1, buf2, len2, maxcount);
10208 break;
10209 case PyUnicode_4BYTE_KIND:
10210 out = ucs4lib_rsplit(
10211 self, buf1, len1, buf2, len2, maxcount);
10212 break;
10213 default:
10214 out = NULL;
10215 }
10216 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10217 if (kind2 != kind1)
10218 PyMem_Free((void *)buf2);
10219 return out;
10220 }
10221
10222 static Py_ssize_t
anylib_find(int kind,PyObject * str1,const void * buf1,Py_ssize_t len1,PyObject * str2,const void * buf2,Py_ssize_t len2,Py_ssize_t offset)10223 anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10224 PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10225 {
10226 switch (kind) {
10227 case PyUnicode_1BYTE_KIND:
10228 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10229 return asciilib_find(buf1, len1, buf2, len2, offset);
10230 else
10231 return ucs1lib_find(buf1, len1, buf2, len2, offset);
10232 case PyUnicode_2BYTE_KIND:
10233 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10234 case PyUnicode_4BYTE_KIND:
10235 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10236 }
10237 Py_UNREACHABLE();
10238 }
10239
10240 static Py_ssize_t
anylib_count(int kind,PyObject * sstr,const void * sbuf,Py_ssize_t slen,PyObject * str1,const void * buf1,Py_ssize_t len1,Py_ssize_t maxcount)10241 anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10242 PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10243 {
10244 switch (kind) {
10245 case PyUnicode_1BYTE_KIND:
10246 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10247 case PyUnicode_2BYTE_KIND:
10248 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10249 case PyUnicode_4BYTE_KIND:
10250 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10251 }
10252 Py_UNREACHABLE();
10253 }
10254
10255 static void
replace_1char_inplace(PyObject * u,Py_ssize_t pos,Py_UCS4 u1,Py_UCS4 u2,Py_ssize_t maxcount)10256 replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10257 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10258 {
10259 int kind = PyUnicode_KIND(u);
10260 void *data = PyUnicode_DATA(u);
10261 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10262 if (kind == PyUnicode_1BYTE_KIND) {
10263 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10264 (Py_UCS1 *)data + len,
10265 u1, u2, maxcount);
10266 }
10267 else if (kind == PyUnicode_2BYTE_KIND) {
10268 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10269 (Py_UCS2 *)data + len,
10270 u1, u2, maxcount);
10271 }
10272 else {
10273 assert(kind == PyUnicode_4BYTE_KIND);
10274 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10275 (Py_UCS4 *)data + len,
10276 u1, u2, maxcount);
10277 }
10278 }
10279
10280 static PyObject *
replace(PyObject * self,PyObject * str1,PyObject * str2,Py_ssize_t maxcount)10281 replace(PyObject *self, PyObject *str1,
10282 PyObject *str2, Py_ssize_t maxcount)
10283 {
10284 PyObject *u;
10285 const char *sbuf = PyUnicode_DATA(self);
10286 const void *buf1 = PyUnicode_DATA(str1);
10287 const void *buf2 = PyUnicode_DATA(str2);
10288 int srelease = 0, release1 = 0, release2 = 0;
10289 int skind = PyUnicode_KIND(self);
10290 int kind1 = PyUnicode_KIND(str1);
10291 int kind2 = PyUnicode_KIND(str2);
10292 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10293 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10294 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10295 int mayshrink;
10296 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10297
10298 if (slen < len1)
10299 goto nothing;
10300
10301 if (maxcount < 0)
10302 maxcount = PY_SSIZE_T_MAX;
10303 else if (maxcount == 0)
10304 goto nothing;
10305
10306 if (str1 == str2)
10307 goto nothing;
10308
10309 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10310 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10311 if (maxchar < maxchar_str1)
10312 /* substring too wide to be present */
10313 goto nothing;
10314 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10315 /* Replacing str1 with str2 may cause a maxchar reduction in the
10316 result string. */
10317 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10318 maxchar = Py_MAX(maxchar, maxchar_str2);
10319
10320 if (len1 == len2) {
10321 /* same length */
10322 if (len1 == 0)
10323 goto nothing;
10324 if (len1 == 1) {
10325 /* replace characters */
10326 Py_UCS4 u1, u2;
10327 Py_ssize_t pos;
10328
10329 u1 = PyUnicode_READ(kind1, buf1, 0);
10330 pos = findchar(sbuf, skind, slen, u1, 1);
10331 if (pos < 0)
10332 goto nothing;
10333 u2 = PyUnicode_READ(kind2, buf2, 0);
10334 u = PyUnicode_New(slen, maxchar);
10335 if (!u)
10336 goto error;
10337
10338 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10339 replace_1char_inplace(u, pos, u1, u2, maxcount);
10340 }
10341 else {
10342 int rkind = skind;
10343 char *res;
10344 Py_ssize_t i;
10345
10346 if (kind1 < rkind) {
10347 /* widen substring */
10348 buf1 = unicode_askind(kind1, buf1, len1, rkind);
10349 if (!buf1) goto error;
10350 release1 = 1;
10351 }
10352 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10353 if (i < 0)
10354 goto nothing;
10355 if (rkind > kind2) {
10356 /* widen replacement */
10357 buf2 = unicode_askind(kind2, buf2, len2, rkind);
10358 if (!buf2) goto error;
10359 release2 = 1;
10360 }
10361 else if (rkind < kind2) {
10362 /* widen self and buf1 */
10363 rkind = kind2;
10364 if (release1) {
10365 assert(buf1 != PyUnicode_DATA(str1));
10366 PyMem_Free((void *)buf1);
10367 buf1 = PyUnicode_DATA(str1);
10368 release1 = 0;
10369 }
10370 sbuf = unicode_askind(skind, sbuf, slen, rkind);
10371 if (!sbuf) goto error;
10372 srelease = 1;
10373 buf1 = unicode_askind(kind1, buf1, len1, rkind);
10374 if (!buf1) goto error;
10375 release1 = 1;
10376 }
10377 u = PyUnicode_New(slen, maxchar);
10378 if (!u)
10379 goto error;
10380 assert(PyUnicode_KIND(u) == rkind);
10381 res = PyUnicode_DATA(u);
10382
10383 memcpy(res, sbuf, rkind * slen);
10384 /* change everything in-place, starting with this one */
10385 memcpy(res + rkind * i,
10386 buf2,
10387 rkind * len2);
10388 i += len1;
10389
10390 while ( --maxcount > 0) {
10391 i = anylib_find(rkind, self,
10392 sbuf+rkind*i, slen-i,
10393 str1, buf1, len1, i);
10394 if (i == -1)
10395 break;
10396 memcpy(res + rkind * i,
10397 buf2,
10398 rkind * len2);
10399 i += len1;
10400 }
10401 }
10402 }
10403 else {
10404 Py_ssize_t n, i, j, ires;
10405 Py_ssize_t new_size;
10406 int rkind = skind;
10407 char *res;
10408
10409 if (kind1 < rkind) {
10410 /* widen substring */
10411 buf1 = unicode_askind(kind1, buf1, len1, rkind);
10412 if (!buf1) goto error;
10413 release1 = 1;
10414 }
10415 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10416 if (n == 0)
10417 goto nothing;
10418 if (kind2 < rkind) {
10419 /* widen replacement */
10420 buf2 = unicode_askind(kind2, buf2, len2, rkind);
10421 if (!buf2) goto error;
10422 release2 = 1;
10423 }
10424 else if (kind2 > rkind) {
10425 /* widen self and buf1 */
10426 rkind = kind2;
10427 sbuf = unicode_askind(skind, sbuf, slen, rkind);
10428 if (!sbuf) goto error;
10429 srelease = 1;
10430 if (release1) {
10431 assert(buf1 != PyUnicode_DATA(str1));
10432 PyMem_Free((void *)buf1);
10433 buf1 = PyUnicode_DATA(str1);
10434 release1 = 0;
10435 }
10436 buf1 = unicode_askind(kind1, buf1, len1, rkind);
10437 if (!buf1) goto error;
10438 release1 = 1;
10439 }
10440 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10441 PyUnicode_GET_LENGTH(str1)); */
10442 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10443 PyErr_SetString(PyExc_OverflowError,
10444 "replace string is too long");
10445 goto error;
10446 }
10447 new_size = slen + n * (len2 - len1);
10448 if (new_size == 0) {
10449 u = unicode_get_empty();
10450 goto done;
10451 }
10452 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10453 PyErr_SetString(PyExc_OverflowError,
10454 "replace string is too long");
10455 goto error;
10456 }
10457 u = PyUnicode_New(new_size, maxchar);
10458 if (!u)
10459 goto error;
10460 assert(PyUnicode_KIND(u) == rkind);
10461 res = PyUnicode_DATA(u);
10462 ires = i = 0;
10463 if (len1 > 0) {
10464 while (n-- > 0) {
10465 /* look for next match */
10466 j = anylib_find(rkind, self,
10467 sbuf + rkind * i, slen-i,
10468 str1, buf1, len1, i);
10469 if (j == -1)
10470 break;
10471 else if (j > i) {
10472 /* copy unchanged part [i:j] */
10473 memcpy(res + rkind * ires,
10474 sbuf + rkind * i,
10475 rkind * (j-i));
10476 ires += j - i;
10477 }
10478 /* copy substitution string */
10479 if (len2 > 0) {
10480 memcpy(res + rkind * ires,
10481 buf2,
10482 rkind * len2);
10483 ires += len2;
10484 }
10485 i = j + len1;
10486 }
10487 if (i < slen)
10488 /* copy tail [i:] */
10489 memcpy(res + rkind * ires,
10490 sbuf + rkind * i,
10491 rkind * (slen-i));
10492 }
10493 else {
10494 /* interleave */
10495 while (n > 0) {
10496 memcpy(res + rkind * ires,
10497 buf2,
10498 rkind * len2);
10499 ires += len2;
10500 if (--n <= 0)
10501 break;
10502 memcpy(res + rkind * ires,
10503 sbuf + rkind * i,
10504 rkind);
10505 ires++;
10506 i++;
10507 }
10508 memcpy(res + rkind * ires,
10509 sbuf + rkind * i,
10510 rkind * (slen-i));
10511 }
10512 }
10513
10514 if (mayshrink) {
10515 unicode_adjust_maxchar(&u);
10516 if (u == NULL)
10517 goto error;
10518 }
10519
10520 done:
10521 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10522 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10523 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10524 if (srelease)
10525 PyMem_Free((void *)sbuf);
10526 if (release1)
10527 PyMem_Free((void *)buf1);
10528 if (release2)
10529 PyMem_Free((void *)buf2);
10530 assert(_PyUnicode_CheckConsistency(u, 1));
10531 return u;
10532
10533 nothing:
10534 /* nothing to replace; return original string (when possible) */
10535 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10536 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10537 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10538 if (srelease)
10539 PyMem_Free((void *)sbuf);
10540 if (release1)
10541 PyMem_Free((void *)buf1);
10542 if (release2)
10543 PyMem_Free((void *)buf2);
10544 return unicode_result_unchanged(self);
10545
10546 error:
10547 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10548 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10549 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10550 if (srelease)
10551 PyMem_Free((void *)sbuf);
10552 if (release1)
10553 PyMem_Free((void *)buf1);
10554 if (release2)
10555 PyMem_Free((void *)buf2);
10556 return NULL;
10557 }
10558
10559 /* --- Unicode Object Methods --------------------------------------------- */
10560
10561 /*[clinic input]
10562 str.title as unicode_title
10563
10564 Return a version of the string where each word is titlecased.
10565
10566 More specifically, words start with uppercased characters and all remaining
10567 cased characters have lower case.
10568 [clinic start generated code]*/
10569
10570 static PyObject *
unicode_title_impl(PyObject * self)10571 unicode_title_impl(PyObject *self)
10572 /*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
10573 {
10574 return case_operation(self, do_title);
10575 }
10576
10577 /*[clinic input]
10578 str.capitalize as unicode_capitalize
10579
10580 Return a capitalized version of the string.
10581
10582 More specifically, make the first character have upper case and the rest lower
10583 case.
10584 [clinic start generated code]*/
10585
10586 static PyObject *
unicode_capitalize_impl(PyObject * self)10587 unicode_capitalize_impl(PyObject *self)
10588 /*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
10589 {
10590 if (PyUnicode_GET_LENGTH(self) == 0)
10591 return unicode_result_unchanged(self);
10592 return case_operation(self, do_capitalize);
10593 }
10594
10595 /*[clinic input]
10596 str.casefold as unicode_casefold
10597
10598 Return a version of the string suitable for caseless comparisons.
10599 [clinic start generated code]*/
10600
10601 static PyObject *
unicode_casefold_impl(PyObject * self)10602 unicode_casefold_impl(PyObject *self)
10603 /*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
10604 {
10605 if (PyUnicode_IS_ASCII(self))
10606 return ascii_upper_or_lower(self, 1);
10607 return case_operation(self, do_casefold);
10608 }
10609
10610
10611 /* Argument converter. Accepts a single Unicode character. */
10612
10613 static int
convert_uc(PyObject * obj,void * addr)10614 convert_uc(PyObject *obj, void *addr)
10615 {
10616 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10617
10618 if (!PyUnicode_Check(obj)) {
10619 PyErr_Format(PyExc_TypeError,
10620 "The fill character must be a unicode character, "
10621 "not %.100s", Py_TYPE(obj)->tp_name);
10622 return 0;
10623 }
10624 if (PyUnicode_GET_LENGTH(obj) != 1) {
10625 PyErr_SetString(PyExc_TypeError,
10626 "The fill character must be exactly one character long");
10627 return 0;
10628 }
10629 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10630 return 1;
10631 }
10632
10633 /*[clinic input]
10634 str.center as unicode_center
10635
10636 width: Py_ssize_t
10637 fillchar: Py_UCS4 = ' '
10638 /
10639
10640 Return a centered string of length width.
10641
10642 Padding is done using the specified fill character (default is a space).
10643 [clinic start generated code]*/
10644
10645 static PyObject *
unicode_center_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)10646 unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10647 /*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
10648 {
10649 Py_ssize_t marg, left;
10650
10651 if (PyUnicode_GET_LENGTH(self) >= width)
10652 return unicode_result_unchanged(self);
10653
10654 marg = width - PyUnicode_GET_LENGTH(self);
10655 left = marg / 2 + (marg & width & 1);
10656
10657 return pad(self, left, marg - left, fillchar);
10658 }
10659
10660 /* This function assumes that str1 and str2 are readied by the caller. */
10661
10662 static int
unicode_compare(PyObject * str1,PyObject * str2)10663 unicode_compare(PyObject *str1, PyObject *str2)
10664 {
10665 #define COMPARE(TYPE1, TYPE2) \
10666 do { \
10667 TYPE1* p1 = (TYPE1 *)data1; \
10668 TYPE2* p2 = (TYPE2 *)data2; \
10669 TYPE1* end = p1 + len; \
10670 Py_UCS4 c1, c2; \
10671 for (; p1 != end; p1++, p2++) { \
10672 c1 = *p1; \
10673 c2 = *p2; \
10674 if (c1 != c2) \
10675 return (c1 < c2) ? -1 : 1; \
10676 } \
10677 } \
10678 while (0)
10679
10680 int kind1, kind2;
10681 const void *data1, *data2;
10682 Py_ssize_t len1, len2, len;
10683
10684 kind1 = PyUnicode_KIND(str1);
10685 kind2 = PyUnicode_KIND(str2);
10686 data1 = PyUnicode_DATA(str1);
10687 data2 = PyUnicode_DATA(str2);
10688 len1 = PyUnicode_GET_LENGTH(str1);
10689 len2 = PyUnicode_GET_LENGTH(str2);
10690 len = Py_MIN(len1, len2);
10691
10692 switch(kind1) {
10693 case PyUnicode_1BYTE_KIND:
10694 {
10695 switch(kind2) {
10696 case PyUnicode_1BYTE_KIND:
10697 {
10698 int cmp = memcmp(data1, data2, len);
10699 /* normalize result of memcmp() into the range [-1; 1] */
10700 if (cmp < 0)
10701 return -1;
10702 if (cmp > 0)
10703 return 1;
10704 break;
10705 }
10706 case PyUnicode_2BYTE_KIND:
10707 COMPARE(Py_UCS1, Py_UCS2);
10708 break;
10709 case PyUnicode_4BYTE_KIND:
10710 COMPARE(Py_UCS1, Py_UCS4);
10711 break;
10712 default:
10713 Py_UNREACHABLE();
10714 }
10715 break;
10716 }
10717 case PyUnicode_2BYTE_KIND:
10718 {
10719 switch(kind2) {
10720 case PyUnicode_1BYTE_KIND:
10721 COMPARE(Py_UCS2, Py_UCS1);
10722 break;
10723 case PyUnicode_2BYTE_KIND:
10724 {
10725 COMPARE(Py_UCS2, Py_UCS2);
10726 break;
10727 }
10728 case PyUnicode_4BYTE_KIND:
10729 COMPARE(Py_UCS2, Py_UCS4);
10730 break;
10731 default:
10732 Py_UNREACHABLE();
10733 }
10734 break;
10735 }
10736 case PyUnicode_4BYTE_KIND:
10737 {
10738 switch(kind2) {
10739 case PyUnicode_1BYTE_KIND:
10740 COMPARE(Py_UCS4, Py_UCS1);
10741 break;
10742 case PyUnicode_2BYTE_KIND:
10743 COMPARE(Py_UCS4, Py_UCS2);
10744 break;
10745 case PyUnicode_4BYTE_KIND:
10746 {
10747 #if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10748 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10749 /* normalize result of wmemcmp() into the range [-1; 1] */
10750 if (cmp < 0)
10751 return -1;
10752 if (cmp > 0)
10753 return 1;
10754 #else
10755 COMPARE(Py_UCS4, Py_UCS4);
10756 #endif
10757 break;
10758 }
10759 default:
10760 Py_UNREACHABLE();
10761 }
10762 break;
10763 }
10764 default:
10765 Py_UNREACHABLE();
10766 }
10767
10768 if (len1 == len2)
10769 return 0;
10770 if (len1 < len2)
10771 return -1;
10772 else
10773 return 1;
10774
10775 #undef COMPARE
10776 }
10777
10778 static int
unicode_compare_eq(PyObject * str1,PyObject * str2)10779 unicode_compare_eq(PyObject *str1, PyObject *str2)
10780 {
10781 int kind;
10782 const void *data1, *data2;
10783 Py_ssize_t len;
10784 int cmp;
10785
10786 len = PyUnicode_GET_LENGTH(str1);
10787 if (PyUnicode_GET_LENGTH(str2) != len)
10788 return 0;
10789 kind = PyUnicode_KIND(str1);
10790 if (PyUnicode_KIND(str2) != kind)
10791 return 0;
10792 data1 = PyUnicode_DATA(str1);
10793 data2 = PyUnicode_DATA(str2);
10794
10795 cmp = memcmp(data1, data2, len * kind);
10796 return (cmp == 0);
10797 }
10798
10799 int
_PyUnicode_Equal(PyObject * str1,PyObject * str2)10800 _PyUnicode_Equal(PyObject *str1, PyObject *str2)
10801 {
10802 assert(PyUnicode_Check(str1));
10803 assert(PyUnicode_Check(str2));
10804 if (str1 == str2) {
10805 return 1;
10806 }
10807 return unicode_compare_eq(str1, str2);
10808 }
10809
10810
10811 int
PyUnicode_Compare(PyObject * left,PyObject * right)10812 PyUnicode_Compare(PyObject *left, PyObject *right)
10813 {
10814 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10815 /* a string is equal to itself */
10816 if (left == right)
10817 return 0;
10818
10819 return unicode_compare(left, right);
10820 }
10821 PyErr_Format(PyExc_TypeError,
10822 "Can't compare %.100s and %.100s",
10823 Py_TYPE(left)->tp_name,
10824 Py_TYPE(right)->tp_name);
10825 return -1;
10826 }
10827
10828 int
PyUnicode_CompareWithASCIIString(PyObject * uni,const char * str)10829 PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10830 {
10831 Py_ssize_t i;
10832 int kind;
10833 Py_UCS4 chr;
10834
10835 assert(_PyUnicode_CHECK(uni));
10836 kind = PyUnicode_KIND(uni);
10837 if (kind == PyUnicode_1BYTE_KIND) {
10838 const void *data = PyUnicode_1BYTE_DATA(uni);
10839 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
10840 size_t len, len2 = strlen(str);
10841 int cmp;
10842
10843 len = Py_MIN(len1, len2);
10844 cmp = memcmp(data, str, len);
10845 if (cmp != 0) {
10846 if (cmp < 0)
10847 return -1;
10848 else
10849 return 1;
10850 }
10851 if (len1 > len2)
10852 return 1; /* uni is longer */
10853 if (len1 < len2)
10854 return -1; /* str is longer */
10855 return 0;
10856 }
10857 else {
10858 const void *data = PyUnicode_DATA(uni);
10859 /* Compare Unicode string and source character set string */
10860 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10861 if (chr != (unsigned char)str[i])
10862 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10863 /* This check keeps Python strings that end in '\0' from comparing equal
10864 to C strings identical up to that point. */
10865 if (PyUnicode_GET_LENGTH(uni) != i || chr)
10866 return 1; /* uni is longer */
10867 if (str[i])
10868 return -1; /* str is longer */
10869 return 0;
10870 }
10871 }
10872
10873 int
PyUnicode_EqualToUTF8(PyObject * unicode,const char * str)10874 PyUnicode_EqualToUTF8(PyObject *unicode, const char *str)
10875 {
10876 return PyUnicode_EqualToUTF8AndSize(unicode, str, strlen(str));
10877 }
10878
10879 int
PyUnicode_EqualToUTF8AndSize(PyObject * unicode,const char * str,Py_ssize_t size)10880 PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size)
10881 {
10882 assert(_PyUnicode_CHECK(unicode));
10883 assert(str);
10884
10885 if (PyUnicode_IS_ASCII(unicode)) {
10886 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
10887 return size == len &&
10888 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
10889 }
10890 if (PyUnicode_UTF8(unicode) != NULL) {
10891 Py_ssize_t len = PyUnicode_UTF8_LENGTH(unicode);
10892 return size == len &&
10893 memcmp(PyUnicode_UTF8(unicode), str, len) == 0;
10894 }
10895
10896 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
10897 if ((size_t)len >= (size_t)size || (size_t)len < (size_t)size / 4) {
10898 return 0;
10899 }
10900 const unsigned char *s = (const unsigned char *)str;
10901 const unsigned char *ends = s + (size_t)size;
10902 int kind = PyUnicode_KIND(unicode);
10903 const void *data = PyUnicode_DATA(unicode);
10904 /* Compare Unicode string and UTF-8 string */
10905 for (Py_ssize_t i = 0; i < len; i++) {
10906 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10907 if (ch < 0x80) {
10908 if (ends == s || s[0] != ch) {
10909 return 0;
10910 }
10911 s += 1;
10912 }
10913 else if (ch < 0x800) {
10914 if ((ends - s) < 2 ||
10915 s[0] != (0xc0 | (ch >> 6)) ||
10916 s[1] != (0x80 | (ch & 0x3f)))
10917 {
10918 return 0;
10919 }
10920 s += 2;
10921 }
10922 else if (ch < 0x10000) {
10923 if (Py_UNICODE_IS_SURROGATE(ch) ||
10924 (ends - s) < 3 ||
10925 s[0] != (0xe0 | (ch >> 12)) ||
10926 s[1] != (0x80 | ((ch >> 6) & 0x3f)) ||
10927 s[2] != (0x80 | (ch & 0x3f)))
10928 {
10929 return 0;
10930 }
10931 s += 3;
10932 }
10933 else {
10934 assert(ch <= MAX_UNICODE);
10935 if ((ends - s) < 4 ||
10936 s[0] != (0xf0 | (ch >> 18)) ||
10937 s[1] != (0x80 | ((ch >> 12) & 0x3f)) ||
10938 s[2] != (0x80 | ((ch >> 6) & 0x3f)) ||
10939 s[3] != (0x80 | (ch & 0x3f)))
10940 {
10941 return 0;
10942 }
10943 s += 4;
10944 }
10945 }
10946 return s == ends;
10947 }
10948
10949 int
_PyUnicode_EqualToASCIIString(PyObject * unicode,const char * str)10950 _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
10951 {
10952 size_t len;
10953 assert(_PyUnicode_CHECK(unicode));
10954 assert(str);
10955 #ifndef NDEBUG
10956 for (const char *p = str; *p; p++) {
10957 assert((unsigned char)*p < 128);
10958 }
10959 #endif
10960 if (!PyUnicode_IS_ASCII(unicode))
10961 return 0;
10962 len = (size_t)PyUnicode_GET_LENGTH(unicode);
10963 return strlen(str) == len &&
10964 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
10965 }
10966
10967 int
_PyUnicode_EqualToASCIIId(PyObject * left,_Py_Identifier * right)10968 _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
10969 {
10970 PyObject *right_uni;
10971
10972 assert(_PyUnicode_CHECK(left));
10973 assert(right->string);
10974 #ifndef NDEBUG
10975 for (const char *p = right->string; *p; p++) {
10976 assert((unsigned char)*p < 128);
10977 }
10978 #endif
10979
10980 if (!PyUnicode_IS_ASCII(left))
10981 return 0;
10982
10983 right_uni = _PyUnicode_FromId(right); /* borrowed */
10984 if (right_uni == NULL) {
10985 /* memory error or bad data */
10986 PyErr_Clear();
10987 return _PyUnicode_EqualToASCIIString(left, right->string);
10988 }
10989
10990 if (left == right_uni)
10991 return 1;
10992
10993 assert(PyUnicode_CHECK_INTERNED(right_uni));
10994 if (PyUnicode_CHECK_INTERNED(left)) {
10995 return 0;
10996 }
10997
10998 Py_hash_t right_hash = FT_ATOMIC_LOAD_SSIZE_RELAXED(_PyUnicode_HASH(right_uni));
10999 assert(right_hash != -1);
11000 Py_hash_t hash = FT_ATOMIC_LOAD_SSIZE_RELAXED(_PyUnicode_HASH(left));
11001 if (hash != -1 && hash != right_hash) {
11002 return 0;
11003 }
11004
11005 return unicode_compare_eq(left, right_uni);
11006 }
11007
11008 PyObject *
PyUnicode_RichCompare(PyObject * left,PyObject * right,int op)11009 PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
11010 {
11011 int result;
11012
11013 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11014 Py_RETURN_NOTIMPLEMENTED;
11015
11016 if (left == right) {
11017 switch (op) {
11018 case Py_EQ:
11019 case Py_LE:
11020 case Py_GE:
11021 /* a string is equal to itself */
11022 Py_RETURN_TRUE;
11023 case Py_NE:
11024 case Py_LT:
11025 case Py_GT:
11026 Py_RETURN_FALSE;
11027 default:
11028 PyErr_BadArgument();
11029 return NULL;
11030 }
11031 }
11032 else if (op == Py_EQ || op == Py_NE) {
11033 result = unicode_compare_eq(left, right);
11034 result ^= (op == Py_NE);
11035 return PyBool_FromLong(result);
11036 }
11037 else {
11038 result = unicode_compare(left, right);
11039 Py_RETURN_RICHCOMPARE(result, 0, op);
11040 }
11041 }
11042
11043 int
_PyUnicode_EQ(PyObject * aa,PyObject * bb)11044 _PyUnicode_EQ(PyObject *aa, PyObject *bb)
11045 {
11046 return unicode_eq(aa, bb);
11047 }
11048
11049 int
PyUnicode_Contains(PyObject * str,PyObject * substr)11050 PyUnicode_Contains(PyObject *str, PyObject *substr)
11051 {
11052 int kind1, kind2;
11053 const void *buf1, *buf2;
11054 Py_ssize_t len1, len2;
11055 int result;
11056
11057 if (!PyUnicode_Check(substr)) {
11058 PyErr_Format(PyExc_TypeError,
11059 "'in <string>' requires string as left operand, not %.100s",
11060 Py_TYPE(substr)->tp_name);
11061 return -1;
11062 }
11063 if (ensure_unicode(str) < 0)
11064 return -1;
11065
11066 kind1 = PyUnicode_KIND(str);
11067 kind2 = PyUnicode_KIND(substr);
11068 if (kind1 < kind2)
11069 return 0;
11070 len1 = PyUnicode_GET_LENGTH(str);
11071 len2 = PyUnicode_GET_LENGTH(substr);
11072 if (len1 < len2)
11073 return 0;
11074 buf1 = PyUnicode_DATA(str);
11075 buf2 = PyUnicode_DATA(substr);
11076 if (len2 == 1) {
11077 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11078 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11079 return result;
11080 }
11081 if (kind2 != kind1) {
11082 buf2 = unicode_askind(kind2, buf2, len2, kind1);
11083 if (!buf2)
11084 return -1;
11085 }
11086
11087 switch (kind1) {
11088 case PyUnicode_1BYTE_KIND:
11089 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11090 break;
11091 case PyUnicode_2BYTE_KIND:
11092 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11093 break;
11094 case PyUnicode_4BYTE_KIND:
11095 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11096 break;
11097 default:
11098 Py_UNREACHABLE();
11099 }
11100
11101 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
11102 if (kind2 != kind1)
11103 PyMem_Free((void *)buf2);
11104
11105 return result;
11106 }
11107
11108 /* Concat to string or Unicode object giving a new Unicode object. */
11109
11110 PyObject *
PyUnicode_Concat(PyObject * left,PyObject * right)11111 PyUnicode_Concat(PyObject *left, PyObject *right)
11112 {
11113 PyObject *result;
11114 Py_UCS4 maxchar, maxchar2;
11115 Py_ssize_t left_len, right_len, new_len;
11116
11117 if (ensure_unicode(left) < 0)
11118 return NULL;
11119
11120 if (!PyUnicode_Check(right)) {
11121 PyErr_Format(PyExc_TypeError,
11122 "can only concatenate str (not \"%.200s\") to str",
11123 Py_TYPE(right)->tp_name);
11124 return NULL;
11125 }
11126
11127 /* Shortcuts */
11128 PyObject *empty = unicode_get_empty(); // Borrowed reference
11129 if (left == empty) {
11130 return PyUnicode_FromObject(right);
11131 }
11132 if (right == empty) {
11133 return PyUnicode_FromObject(left);
11134 }
11135
11136 left_len = PyUnicode_GET_LENGTH(left);
11137 right_len = PyUnicode_GET_LENGTH(right);
11138 if (left_len > PY_SSIZE_T_MAX - right_len) {
11139 PyErr_SetString(PyExc_OverflowError,
11140 "strings are too large to concat");
11141 return NULL;
11142 }
11143 new_len = left_len + right_len;
11144
11145 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11146 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11147 maxchar = Py_MAX(maxchar, maxchar2);
11148
11149 /* Concat the two Unicode strings */
11150 result = PyUnicode_New(new_len, maxchar);
11151 if (result == NULL)
11152 return NULL;
11153 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11154 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11155 assert(_PyUnicode_CheckConsistency(result, 1));
11156 return result;
11157 }
11158
11159 void
PyUnicode_Append(PyObject ** p_left,PyObject * right)11160 PyUnicode_Append(PyObject **p_left, PyObject *right)
11161 {
11162 PyObject *left, *res;
11163 Py_UCS4 maxchar, maxchar2;
11164 Py_ssize_t left_len, right_len, new_len;
11165
11166 if (p_left == NULL) {
11167 if (!PyErr_Occurred())
11168 PyErr_BadInternalCall();
11169 return;
11170 }
11171 left = *p_left;
11172 if (right == NULL || left == NULL
11173 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11174 if (!PyErr_Occurred())
11175 PyErr_BadInternalCall();
11176 goto error;
11177 }
11178
11179 /* Shortcuts */
11180 PyObject *empty = unicode_get_empty(); // Borrowed reference
11181 if (left == empty) {
11182 Py_DECREF(left);
11183 *p_left = Py_NewRef(right);
11184 return;
11185 }
11186 if (right == empty) {
11187 return;
11188 }
11189
11190 left_len = PyUnicode_GET_LENGTH(left);
11191 right_len = PyUnicode_GET_LENGTH(right);
11192 if (left_len > PY_SSIZE_T_MAX - right_len) {
11193 PyErr_SetString(PyExc_OverflowError,
11194 "strings are too large to concat");
11195 goto error;
11196 }
11197 new_len = left_len + right_len;
11198
11199 if (unicode_modifiable(left)
11200 && PyUnicode_CheckExact(right)
11201 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11202 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11203 to change the structure size, but characters are stored just after
11204 the structure, and so it requires to move all characters which is
11205 not so different than duplicating the string. */
11206 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11207 {
11208 /* append inplace */
11209 if (unicode_resize(p_left, new_len) != 0)
11210 goto error;
11211
11212 /* copy 'right' into the newly allocated area of 'left' */
11213 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11214 }
11215 else {
11216 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11217 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11218 maxchar = Py_MAX(maxchar, maxchar2);
11219
11220 /* Concat the two Unicode strings */
11221 res = PyUnicode_New(new_len, maxchar);
11222 if (res == NULL)
11223 goto error;
11224 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11225 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11226 Py_DECREF(left);
11227 *p_left = res;
11228 }
11229 assert(_PyUnicode_CheckConsistency(*p_left, 1));
11230 return;
11231
11232 error:
11233 Py_CLEAR(*p_left);
11234 }
11235
11236 void
PyUnicode_AppendAndDel(PyObject ** pleft,PyObject * right)11237 PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11238 {
11239 PyUnicode_Append(pleft, right);
11240 Py_XDECREF(right);
11241 }
11242
11243 /*[clinic input]
11244 @text_signature "($self, sub[, start[, end]], /)"
11245 str.count as unicode_count -> Py_ssize_t
11246
11247 self as str: self
11248 sub as substr: unicode
11249 start: slice_index(accept={int, NoneType}, c_default='0') = None
11250 end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
11251 /
11252
11253 Return the number of non-overlapping occurrences of substring sub in string S[start:end].
11254
11255 Optional arguments start and end are interpreted as in slice notation.
11256 [clinic start generated code]*/
11257
11258 static Py_ssize_t
unicode_count_impl(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end)11259 unicode_count_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11260 Py_ssize_t end)
11261 /*[clinic end generated code: output=8fcc3aef0b18edbf input=6f168ffd94be8785]*/
11262 {
11263 assert(PyUnicode_Check(str));
11264 assert(PyUnicode_Check(substr));
11265
11266 Py_ssize_t result;
11267 int kind1, kind2;
11268 const void *buf1 = NULL, *buf2 = NULL;
11269 Py_ssize_t len1, len2;
11270
11271 kind1 = PyUnicode_KIND(str);
11272 kind2 = PyUnicode_KIND(substr);
11273 if (kind1 < kind2)
11274 return 0;
11275
11276 len1 = PyUnicode_GET_LENGTH(str);
11277 len2 = PyUnicode_GET_LENGTH(substr);
11278 ADJUST_INDICES(start, end, len1);
11279 if (end - start < len2)
11280 return 0;
11281
11282 buf1 = PyUnicode_DATA(str);
11283 buf2 = PyUnicode_DATA(substr);
11284 if (kind2 != kind1) {
11285 buf2 = unicode_askind(kind2, buf2, len2, kind1);
11286 if (!buf2)
11287 goto onError;
11288 }
11289
11290 // We don't reuse `anylib_count` here because of the explicit casts.
11291 switch (kind1) {
11292 case PyUnicode_1BYTE_KIND:
11293 result = ucs1lib_count(
11294 ((const Py_UCS1*)buf1) + start, end - start,
11295 buf2, len2, PY_SSIZE_T_MAX
11296 );
11297 break;
11298 case PyUnicode_2BYTE_KIND:
11299 result = ucs2lib_count(
11300 ((const Py_UCS2*)buf1) + start, end - start,
11301 buf2, len2, PY_SSIZE_T_MAX
11302 );
11303 break;
11304 case PyUnicode_4BYTE_KIND:
11305 result = ucs4lib_count(
11306 ((const Py_UCS4*)buf1) + start, end - start,
11307 buf2, len2, PY_SSIZE_T_MAX
11308 );
11309 break;
11310 default:
11311 Py_UNREACHABLE();
11312 }
11313
11314 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11315 if (kind2 != kind1)
11316 PyMem_Free((void *)buf2);
11317
11318 return result;
11319 onError:
11320 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11321 if (kind2 != kind1)
11322 PyMem_Free((void *)buf2);
11323 return -1;
11324 }
11325
11326 /*[clinic input]
11327 str.encode as unicode_encode
11328
11329 encoding: str(c_default="NULL") = 'utf-8'
11330 The encoding in which to encode the string.
11331 errors: str(c_default="NULL") = 'strict'
11332 The error handling scheme to use for encoding errors.
11333 The default is 'strict' meaning that encoding errors raise a
11334 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11335 'xmlcharrefreplace' as well as any other name registered with
11336 codecs.register_error that can handle UnicodeEncodeErrors.
11337
11338 Encode the string using the codec registered for encoding.
11339 [clinic start generated code]*/
11340
11341 static PyObject *
unicode_encode_impl(PyObject * self,const char * encoding,const char * errors)11342 unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
11343 /*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
11344 {
11345 return PyUnicode_AsEncodedString(self, encoding, errors);
11346 }
11347
11348 /*[clinic input]
11349 str.expandtabs as unicode_expandtabs
11350
11351 tabsize: int = 8
11352
11353 Return a copy where all tab characters are expanded using spaces.
11354
11355 If tabsize is not given, a tab size of 8 characters is assumed.
11356 [clinic start generated code]*/
11357
11358 static PyObject *
unicode_expandtabs_impl(PyObject * self,int tabsize)11359 unicode_expandtabs_impl(PyObject *self, int tabsize)
11360 /*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
11361 {
11362 Py_ssize_t i, j, line_pos, src_len, incr;
11363 Py_UCS4 ch;
11364 PyObject *u;
11365 const void *src_data;
11366 void *dest_data;
11367 int kind;
11368 int found;
11369
11370 /* First pass: determine size of output string */
11371 src_len = PyUnicode_GET_LENGTH(self);
11372 i = j = line_pos = 0;
11373 kind = PyUnicode_KIND(self);
11374 src_data = PyUnicode_DATA(self);
11375 found = 0;
11376 for (; i < src_len; i++) {
11377 ch = PyUnicode_READ(kind, src_data, i);
11378 if (ch == '\t') {
11379 found = 1;
11380 if (tabsize > 0) {
11381 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
11382 if (j > PY_SSIZE_T_MAX - incr)
11383 goto overflow;
11384 line_pos += incr;
11385 j += incr;
11386 }
11387 }
11388 else {
11389 if (j > PY_SSIZE_T_MAX - 1)
11390 goto overflow;
11391 line_pos++;
11392 j++;
11393 if (ch == '\n' || ch == '\r')
11394 line_pos = 0;
11395 }
11396 }
11397 if (!found)
11398 return unicode_result_unchanged(self);
11399
11400 /* Second pass: create output string and fill it */
11401 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
11402 if (!u)
11403 return NULL;
11404 dest_data = PyUnicode_DATA(u);
11405
11406 i = j = line_pos = 0;
11407
11408 for (; i < src_len; i++) {
11409 ch = PyUnicode_READ(kind, src_data, i);
11410 if (ch == '\t') {
11411 if (tabsize > 0) {
11412 incr = tabsize - (line_pos % tabsize);
11413 line_pos += incr;
11414 unicode_fill(kind, dest_data, ' ', j, incr);
11415 j += incr;
11416 }
11417 }
11418 else {
11419 line_pos++;
11420 PyUnicode_WRITE(kind, dest_data, j, ch);
11421 j++;
11422 if (ch == '\n' || ch == '\r')
11423 line_pos = 0;
11424 }
11425 }
11426 assert (j == PyUnicode_GET_LENGTH(u));
11427 return unicode_result(u);
11428
11429 overflow:
11430 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11431 return NULL;
11432 }
11433
11434 /*[clinic input]
11435 str.find as unicode_find = str.count
11436
11437 Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11438
11439 Optional arguments start and end are interpreted as in slice notation.
11440 Return -1 on failure.
11441 [clinic start generated code]*/
11442
11443 static Py_ssize_t
unicode_find_impl(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end)11444 unicode_find_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11445 Py_ssize_t end)
11446 /*[clinic end generated code: output=51dbe6255712e278 input=4a89d2d68ef57256]*/
11447 {
11448 Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11449 if (result < 0) {
11450 return -1;
11451 }
11452 return result;
11453 }
11454
11455 static PyObject *
unicode_getitem(PyObject * self,Py_ssize_t index)11456 unicode_getitem(PyObject *self, Py_ssize_t index)
11457 {
11458 const void *data;
11459 int kind;
11460 Py_UCS4 ch;
11461
11462 if (!PyUnicode_Check(self)) {
11463 PyErr_BadArgument();
11464 return NULL;
11465 }
11466 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11467 PyErr_SetString(PyExc_IndexError, "string index out of range");
11468 return NULL;
11469 }
11470 kind = PyUnicode_KIND(self);
11471 data = PyUnicode_DATA(self);
11472 ch = PyUnicode_READ(kind, data, index);
11473 return unicode_char(ch);
11474 }
11475
11476 /* Believe it or not, this produces the same value for ASCII strings
11477 as bytes_hash(). */
11478 static Py_hash_t
unicode_hash(PyObject * self)11479 unicode_hash(PyObject *self)
11480 {
11481 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
11482
11483 #ifdef Py_DEBUG
11484 assert(_Py_HashSecret_Initialized);
11485 #endif
11486 Py_hash_t hash = FT_ATOMIC_LOAD_SSIZE_RELAXED(_PyUnicode_HASH(self));
11487 if (hash != -1) {
11488 return hash;
11489 }
11490 x = _Py_HashBytes(PyUnicode_DATA(self),
11491 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11492
11493 FT_ATOMIC_STORE_SSIZE_RELAXED(_PyUnicode_HASH(self), x);
11494 return x;
11495 }
11496
11497 /*[clinic input]
11498 str.index as unicode_index = str.count
11499
11500 Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11501
11502 Optional arguments start and end are interpreted as in slice notation.
11503 Raises ValueError when the substring is not found.
11504 [clinic start generated code]*/
11505
11506 static Py_ssize_t
unicode_index_impl(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end)11507 unicode_index_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11508 Py_ssize_t end)
11509 /*[clinic end generated code: output=77558288837cdf40 input=d986aeac0be14a1c]*/
11510 {
11511 Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11512 if (result == -1) {
11513 PyErr_SetString(PyExc_ValueError, "substring not found");
11514 }
11515 else if (result < 0) {
11516 return -1;
11517 }
11518 return result;
11519 }
11520
11521 /*[clinic input]
11522 str.isascii as unicode_isascii
11523
11524 Return True if all characters in the string are ASCII, False otherwise.
11525
11526 ASCII characters have code points in the range U+0000-U+007F.
11527 Empty string is ASCII too.
11528 [clinic start generated code]*/
11529
11530 static PyObject *
unicode_isascii_impl(PyObject * self)11531 unicode_isascii_impl(PyObject *self)
11532 /*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11533 {
11534 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11535 }
11536
11537 /*[clinic input]
11538 str.islower as unicode_islower
11539
11540 Return True if the string is a lowercase string, False otherwise.
11541
11542 A string is lowercase if all cased characters in the string are lowercase and
11543 there is at least one cased character in the string.
11544 [clinic start generated code]*/
11545
11546 static PyObject *
unicode_islower_impl(PyObject * self)11547 unicode_islower_impl(PyObject *self)
11548 /*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
11549 {
11550 Py_ssize_t i, length;
11551 int kind;
11552 const void *data;
11553 int cased;
11554
11555 length = PyUnicode_GET_LENGTH(self);
11556 kind = PyUnicode_KIND(self);
11557 data = PyUnicode_DATA(self);
11558
11559 /* Shortcut for single character strings */
11560 if (length == 1)
11561 return PyBool_FromLong(
11562 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11563
11564 /* Special case for empty strings */
11565 if (length == 0)
11566 Py_RETURN_FALSE;
11567
11568 cased = 0;
11569 for (i = 0; i < length; i++) {
11570 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11571
11572 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11573 Py_RETURN_FALSE;
11574 else if (!cased && Py_UNICODE_ISLOWER(ch))
11575 cased = 1;
11576 }
11577 return PyBool_FromLong(cased);
11578 }
11579
11580 /*[clinic input]
11581 str.isupper as unicode_isupper
11582
11583 Return True if the string is an uppercase string, False otherwise.
11584
11585 A string is uppercase if all cased characters in the string are uppercase and
11586 there is at least one cased character in the string.
11587 [clinic start generated code]*/
11588
11589 static PyObject *
unicode_isupper_impl(PyObject * self)11590 unicode_isupper_impl(PyObject *self)
11591 /*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
11592 {
11593 Py_ssize_t i, length;
11594 int kind;
11595 const void *data;
11596 int cased;
11597
11598 length = PyUnicode_GET_LENGTH(self);
11599 kind = PyUnicode_KIND(self);
11600 data = PyUnicode_DATA(self);
11601
11602 /* Shortcut for single character strings */
11603 if (length == 1)
11604 return PyBool_FromLong(
11605 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11606
11607 /* Special case for empty strings */
11608 if (length == 0)
11609 Py_RETURN_FALSE;
11610
11611 cased = 0;
11612 for (i = 0; i < length; i++) {
11613 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11614
11615 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11616 Py_RETURN_FALSE;
11617 else if (!cased && Py_UNICODE_ISUPPER(ch))
11618 cased = 1;
11619 }
11620 return PyBool_FromLong(cased);
11621 }
11622
11623 /*[clinic input]
11624 str.istitle as unicode_istitle
11625
11626 Return True if the string is a title-cased string, False otherwise.
11627
11628 In a title-cased string, upper- and title-case characters may only
11629 follow uncased characters and lowercase characters only cased ones.
11630 [clinic start generated code]*/
11631
11632 static PyObject *
unicode_istitle_impl(PyObject * self)11633 unicode_istitle_impl(PyObject *self)
11634 /*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
11635 {
11636 Py_ssize_t i, length;
11637 int kind;
11638 const void *data;
11639 int cased, previous_is_cased;
11640
11641 length = PyUnicode_GET_LENGTH(self);
11642 kind = PyUnicode_KIND(self);
11643 data = PyUnicode_DATA(self);
11644
11645 /* Shortcut for single character strings */
11646 if (length == 1) {
11647 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11648 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11649 (Py_UNICODE_ISUPPER(ch) != 0));
11650 }
11651
11652 /* Special case for empty strings */
11653 if (length == 0)
11654 Py_RETURN_FALSE;
11655
11656 cased = 0;
11657 previous_is_cased = 0;
11658 for (i = 0; i < length; i++) {
11659 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11660
11661 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11662 if (previous_is_cased)
11663 Py_RETURN_FALSE;
11664 previous_is_cased = 1;
11665 cased = 1;
11666 }
11667 else if (Py_UNICODE_ISLOWER(ch)) {
11668 if (!previous_is_cased)
11669 Py_RETURN_FALSE;
11670 previous_is_cased = 1;
11671 cased = 1;
11672 }
11673 else
11674 previous_is_cased = 0;
11675 }
11676 return PyBool_FromLong(cased);
11677 }
11678
11679 /*[clinic input]
11680 str.isspace as unicode_isspace
11681
11682 Return True if the string is a whitespace string, False otherwise.
11683
11684 A string is whitespace if all characters in the string are whitespace and there
11685 is at least one character in the string.
11686 [clinic start generated code]*/
11687
11688 static PyObject *
unicode_isspace_impl(PyObject * self)11689 unicode_isspace_impl(PyObject *self)
11690 /*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
11691 {
11692 Py_ssize_t i, length;
11693 int kind;
11694 const void *data;
11695
11696 length = PyUnicode_GET_LENGTH(self);
11697 kind = PyUnicode_KIND(self);
11698 data = PyUnicode_DATA(self);
11699
11700 /* Shortcut for single character strings */
11701 if (length == 1)
11702 return PyBool_FromLong(
11703 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
11704
11705 /* Special case for empty strings */
11706 if (length == 0)
11707 Py_RETURN_FALSE;
11708
11709 for (i = 0; i < length; i++) {
11710 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11711 if (!Py_UNICODE_ISSPACE(ch))
11712 Py_RETURN_FALSE;
11713 }
11714 Py_RETURN_TRUE;
11715 }
11716
11717 /*[clinic input]
11718 str.isalpha as unicode_isalpha
11719
11720 Return True if the string is an alphabetic string, False otherwise.
11721
11722 A string is alphabetic if all characters in the string are alphabetic and there
11723 is at least one character in the string.
11724 [clinic start generated code]*/
11725
11726 static PyObject *
unicode_isalpha_impl(PyObject * self)11727 unicode_isalpha_impl(PyObject *self)
11728 /*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
11729 {
11730 Py_ssize_t i, length;
11731 int kind;
11732 const void *data;
11733
11734 length = PyUnicode_GET_LENGTH(self);
11735 kind = PyUnicode_KIND(self);
11736 data = PyUnicode_DATA(self);
11737
11738 /* Shortcut for single character strings */
11739 if (length == 1)
11740 return PyBool_FromLong(
11741 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
11742
11743 /* Special case for empty strings */
11744 if (length == 0)
11745 Py_RETURN_FALSE;
11746
11747 for (i = 0; i < length; i++) {
11748 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
11749 Py_RETURN_FALSE;
11750 }
11751 Py_RETURN_TRUE;
11752 }
11753
11754 /*[clinic input]
11755 str.isalnum as unicode_isalnum
11756
11757 Return True if the string is an alpha-numeric string, False otherwise.
11758
11759 A string is alpha-numeric if all characters in the string are alpha-numeric and
11760 there is at least one character in the string.
11761 [clinic start generated code]*/
11762
11763 static PyObject *
unicode_isalnum_impl(PyObject * self)11764 unicode_isalnum_impl(PyObject *self)
11765 /*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
11766 {
11767 int kind;
11768 const void *data;
11769 Py_ssize_t len, i;
11770
11771 kind = PyUnicode_KIND(self);
11772 data = PyUnicode_DATA(self);
11773 len = PyUnicode_GET_LENGTH(self);
11774
11775 /* Shortcut for single character strings */
11776 if (len == 1) {
11777 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11778 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11779 }
11780
11781 /* Special case for empty strings */
11782 if (len == 0)
11783 Py_RETURN_FALSE;
11784
11785 for (i = 0; i < len; i++) {
11786 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11787 if (!Py_UNICODE_ISALNUM(ch))
11788 Py_RETURN_FALSE;
11789 }
11790 Py_RETURN_TRUE;
11791 }
11792
11793 /*[clinic input]
11794 str.isdecimal as unicode_isdecimal
11795
11796 Return True if the string is a decimal string, False otherwise.
11797
11798 A string is a decimal string if all characters in the string are decimal and
11799 there is at least one character in the string.
11800 [clinic start generated code]*/
11801
11802 static PyObject *
unicode_isdecimal_impl(PyObject * self)11803 unicode_isdecimal_impl(PyObject *self)
11804 /*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
11805 {
11806 Py_ssize_t i, length;
11807 int kind;
11808 const void *data;
11809
11810 length = PyUnicode_GET_LENGTH(self);
11811 kind = PyUnicode_KIND(self);
11812 data = PyUnicode_DATA(self);
11813
11814 /* Shortcut for single character strings */
11815 if (length == 1)
11816 return PyBool_FromLong(
11817 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
11818
11819 /* Special case for empty strings */
11820 if (length == 0)
11821 Py_RETURN_FALSE;
11822
11823 for (i = 0; i < length; i++) {
11824 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
11825 Py_RETURN_FALSE;
11826 }
11827 Py_RETURN_TRUE;
11828 }
11829
11830 /*[clinic input]
11831 str.isdigit as unicode_isdigit
11832
11833 Return True if the string is a digit string, False otherwise.
11834
11835 A string is a digit string if all characters in the string are digits and there
11836 is at least one character in the string.
11837 [clinic start generated code]*/
11838
11839 static PyObject *
unicode_isdigit_impl(PyObject * self)11840 unicode_isdigit_impl(PyObject *self)
11841 /*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
11842 {
11843 Py_ssize_t i, length;
11844 int kind;
11845 const void *data;
11846
11847 length = PyUnicode_GET_LENGTH(self);
11848 kind = PyUnicode_KIND(self);
11849 data = PyUnicode_DATA(self);
11850
11851 /* Shortcut for single character strings */
11852 if (length == 1) {
11853 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11854 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11855 }
11856
11857 /* Special case for empty strings */
11858 if (length == 0)
11859 Py_RETURN_FALSE;
11860
11861 for (i = 0; i < length; i++) {
11862 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
11863 Py_RETURN_FALSE;
11864 }
11865 Py_RETURN_TRUE;
11866 }
11867
11868 /*[clinic input]
11869 str.isnumeric as unicode_isnumeric
11870
11871 Return True if the string is a numeric string, False otherwise.
11872
11873 A string is numeric if all characters in the string are numeric and there is at
11874 least one character in the string.
11875 [clinic start generated code]*/
11876
11877 static PyObject *
unicode_isnumeric_impl(PyObject * self)11878 unicode_isnumeric_impl(PyObject *self)
11879 /*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
11880 {
11881 Py_ssize_t i, length;
11882 int kind;
11883 const void *data;
11884
11885 length = PyUnicode_GET_LENGTH(self);
11886 kind = PyUnicode_KIND(self);
11887 data = PyUnicode_DATA(self);
11888
11889 /* Shortcut for single character strings */
11890 if (length == 1)
11891 return PyBool_FromLong(
11892 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
11893
11894 /* Special case for empty strings */
11895 if (length == 0)
11896 Py_RETURN_FALSE;
11897
11898 for (i = 0; i < length; i++) {
11899 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
11900 Py_RETURN_FALSE;
11901 }
11902 Py_RETURN_TRUE;
11903 }
11904
11905 Py_ssize_t
_PyUnicode_ScanIdentifier(PyObject * self)11906 _PyUnicode_ScanIdentifier(PyObject *self)
11907 {
11908 Py_ssize_t i;
11909 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
11910 if (len == 0) {
11911 /* an empty string is not a valid identifier */
11912 return 0;
11913 }
11914
11915 int kind = PyUnicode_KIND(self);
11916 const void *data = PyUnicode_DATA(self);
11917 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11918 /* PEP 3131 says that the first character must be in
11919 XID_Start and subsequent characters in XID_Continue,
11920 and for the ASCII range, the 2.x rules apply (i.e
11921 start with letters and underscore, continue with
11922 letters, digits, underscore). However, given the current
11923 definition of XID_Start and XID_Continue, it is sufficient
11924 to check just for these, except that _ must be allowed
11925 as starting an identifier. */
11926 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
11927 return 0;
11928 }
11929
11930 for (i = 1; i < len; i++) {
11931 ch = PyUnicode_READ(kind, data, i);
11932 if (!_PyUnicode_IsXidContinue(ch)) {
11933 return i;
11934 }
11935 }
11936 return i;
11937 }
11938
11939 int
PyUnicode_IsIdentifier(PyObject * self)11940 PyUnicode_IsIdentifier(PyObject *self)
11941 {
11942 Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
11943 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
11944 /* an empty string is not a valid identifier */
11945 return len && i == len;
11946 }
11947
11948 /*[clinic input]
11949 str.isidentifier as unicode_isidentifier
11950
11951 Return True if the string is a valid Python identifier, False otherwise.
11952
11953 Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
11954 such as "def" or "class".
11955 [clinic start generated code]*/
11956
11957 static PyObject *
unicode_isidentifier_impl(PyObject * self)11958 unicode_isidentifier_impl(PyObject *self)
11959 /*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
11960 {
11961 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11962 }
11963
11964 /*[clinic input]
11965 str.isprintable as unicode_isprintable
11966
11967 Return True if the string is printable, False otherwise.
11968
11969 A string is printable if all of its characters are considered printable in
11970 repr() or if it is empty.
11971 [clinic start generated code]*/
11972
11973 static PyObject *
unicode_isprintable_impl(PyObject * self)11974 unicode_isprintable_impl(PyObject *self)
11975 /*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
11976 {
11977 Py_ssize_t i, length;
11978 int kind;
11979 const void *data;
11980
11981 length = PyUnicode_GET_LENGTH(self);
11982 kind = PyUnicode_KIND(self);
11983 data = PyUnicode_DATA(self);
11984
11985 /* Shortcut for single character strings */
11986 if (length == 1)
11987 return PyBool_FromLong(
11988 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
11989
11990 for (i = 0; i < length; i++) {
11991 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
11992 Py_RETURN_FALSE;
11993 }
11994 }
11995 Py_RETURN_TRUE;
11996 }
11997
11998 /*[clinic input]
11999 str.join as unicode_join
12000
12001 iterable: object
12002 /
12003
12004 Concatenate any number of strings.
12005
12006 The string whose method is called is inserted in between each given string.
12007 The result is returned as a new string.
12008
12009 Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12010 [clinic start generated code]*/
12011
12012 static PyObject *
unicode_join(PyObject * self,PyObject * iterable)12013 unicode_join(PyObject *self, PyObject *iterable)
12014 /*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
12015 {
12016 return PyUnicode_Join(self, iterable);
12017 }
12018
12019 static Py_ssize_t
unicode_length(PyObject * self)12020 unicode_length(PyObject *self)
12021 {
12022 return PyUnicode_GET_LENGTH(self);
12023 }
12024
12025 /*[clinic input]
12026 str.ljust as unicode_ljust
12027
12028 width: Py_ssize_t
12029 fillchar: Py_UCS4 = ' '
12030 /
12031
12032 Return a left-justified string of length width.
12033
12034 Padding is done using the specified fill character (default is a space).
12035 [clinic start generated code]*/
12036
12037 static PyObject *
unicode_ljust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)12038 unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12039 /*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12040 {
12041 if (PyUnicode_GET_LENGTH(self) >= width)
12042 return unicode_result_unchanged(self);
12043
12044 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12045 }
12046
12047 /*[clinic input]
12048 str.lower as unicode_lower
12049
12050 Return a copy of the string converted to lowercase.
12051 [clinic start generated code]*/
12052
12053 static PyObject *
unicode_lower_impl(PyObject * self)12054 unicode_lower_impl(PyObject *self)
12055 /*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
12056 {
12057 if (PyUnicode_IS_ASCII(self))
12058 return ascii_upper_or_lower(self, 1);
12059 return case_operation(self, do_lower);
12060 }
12061
/* Strip modes; the values double as indexes into stripfuncnames[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above.  Both the strings and the pointer slots are
   const, so the table cannot be modified after initialisation. */
static const char *const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Method name for strip mode `i`; `i` must be one of the three constants
   above (no bounds check is performed). */
#define STRIPNAME(i) (stripfuncnames[i])
12070
12071 /* externally visible for str.strip(unicode) */
12072 PyObject *
_PyUnicode_XStrip(PyObject * self,int striptype,PyObject * sepobj)12073 _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
12074 {
12075 const void *data;
12076 int kind;
12077 Py_ssize_t i, j, len;
12078 BLOOM_MASK sepmask;
12079 Py_ssize_t seplen;
12080
12081 kind = PyUnicode_KIND(self);
12082 data = PyUnicode_DATA(self);
12083 len = PyUnicode_GET_LENGTH(self);
12084 seplen = PyUnicode_GET_LENGTH(sepobj);
12085 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12086 PyUnicode_DATA(sepobj),
12087 seplen);
12088
12089 i = 0;
12090 if (striptype != RIGHTSTRIP) {
12091 while (i < len) {
12092 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12093 if (!BLOOM(sepmask, ch))
12094 break;
12095 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12096 break;
12097 i++;
12098 }
12099 }
12100
12101 j = len;
12102 if (striptype != LEFTSTRIP) {
12103 j--;
12104 while (j >= i) {
12105 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12106 if (!BLOOM(sepmask, ch))
12107 break;
12108 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12109 break;
12110 j--;
12111 }
12112
12113 j++;
12114 }
12115
12116 return PyUnicode_Substring(self, i, j);
12117 }
12118
12119 PyObject*
PyUnicode_Substring(PyObject * self,Py_ssize_t start,Py_ssize_t end)12120 PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12121 {
12122 const unsigned char *data;
12123 int kind;
12124 Py_ssize_t length;
12125
12126 length = PyUnicode_GET_LENGTH(self);
12127 end = Py_MIN(end, length);
12128
12129 if (start == 0 && end == length)
12130 return unicode_result_unchanged(self);
12131
12132 if (start < 0 || end < 0) {
12133 PyErr_SetString(PyExc_IndexError, "string index out of range");
12134 return NULL;
12135 }
12136 if (start >= length || end < start)
12137 _Py_RETURN_UNICODE_EMPTY();
12138
12139 length = end - start;
12140 if (PyUnicode_IS_ASCII(self)) {
12141 data = PyUnicode_1BYTE_DATA(self);
12142 return _PyUnicode_FromASCII((const char*)(data + start), length);
12143 }
12144 else {
12145 kind = PyUnicode_KIND(self);
12146 data = PyUnicode_1BYTE_DATA(self);
12147 return PyUnicode_FromKindAndData(kind,
12148 data + kind * start,
12149 length);
12150 }
12151 }
12152
12153 static PyObject *
do_strip(PyObject * self,int striptype)12154 do_strip(PyObject *self, int striptype)
12155 {
12156 Py_ssize_t len, i, j;
12157
12158 len = PyUnicode_GET_LENGTH(self);
12159
12160 if (PyUnicode_IS_ASCII(self)) {
12161 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12162
12163 i = 0;
12164 if (striptype != RIGHTSTRIP) {
12165 while (i < len) {
12166 Py_UCS1 ch = data[i];
12167 if (!_Py_ascii_whitespace[ch])
12168 break;
12169 i++;
12170 }
12171 }
12172
12173 j = len;
12174 if (striptype != LEFTSTRIP) {
12175 j--;
12176 while (j >= i) {
12177 Py_UCS1 ch = data[j];
12178 if (!_Py_ascii_whitespace[ch])
12179 break;
12180 j--;
12181 }
12182 j++;
12183 }
12184 }
12185 else {
12186 int kind = PyUnicode_KIND(self);
12187 const void *data = PyUnicode_DATA(self);
12188
12189 i = 0;
12190 if (striptype != RIGHTSTRIP) {
12191 while (i < len) {
12192 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12193 if (!Py_UNICODE_ISSPACE(ch))
12194 break;
12195 i++;
12196 }
12197 }
12198
12199 j = len;
12200 if (striptype != LEFTSTRIP) {
12201 j--;
12202 while (j >= i) {
12203 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12204 if (!Py_UNICODE_ISSPACE(ch))
12205 break;
12206 j--;
12207 }
12208 j++;
12209 }
12210 }
12211
12212 return PyUnicode_Substring(self, i, j);
12213 }
12214
12215
12216 static PyObject *
do_argstrip(PyObject * self,int striptype,PyObject * sep)12217 do_argstrip(PyObject *self, int striptype, PyObject *sep)
12218 {
12219 if (sep != Py_None) {
12220 if (PyUnicode_Check(sep))
12221 return _PyUnicode_XStrip(self, striptype, sep);
12222 else {
12223 PyErr_Format(PyExc_TypeError,
12224 "%s arg must be None or str",
12225 STRIPNAME(striptype));
12226 return NULL;
12227 }
12228 }
12229
12230 return do_strip(self, striptype);
12231 }
12232
12233
12234 /*[clinic input]
12235 str.strip as unicode_strip
12236
12237 chars: object = None
12238 /
12239
12240 Return a copy of the string with leading and trailing whitespace removed.
12241
12242 If chars is given and not None, remove characters in chars instead.
12243 [clinic start generated code]*/
12244
12245 static PyObject *
unicode_strip_impl(PyObject * self,PyObject * chars)12246 unicode_strip_impl(PyObject *self, PyObject *chars)
12247 /*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
12248 {
12249 return do_argstrip(self, BOTHSTRIP, chars);
12250 }
12251
12252
12253 /*[clinic input]
12254 str.lstrip as unicode_lstrip
12255
12256 chars: object = None
12257 /
12258
12259 Return a copy of the string with leading whitespace removed.
12260
12261 If chars is given and not None, remove characters in chars instead.
12262 [clinic start generated code]*/
12263
12264 static PyObject *
unicode_lstrip_impl(PyObject * self,PyObject * chars)12265 unicode_lstrip_impl(PyObject *self, PyObject *chars)
12266 /*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
12267 {
12268 return do_argstrip(self, LEFTSTRIP, chars);
12269 }
12270
12271
12272 /*[clinic input]
12273 str.rstrip as unicode_rstrip
12274
12275 chars: object = None
12276 /
12277
12278 Return a copy of the string with trailing whitespace removed.
12279
12280 If chars is given and not None, remove characters in chars instead.
12281 [clinic start generated code]*/
12282
12283 static PyObject *
unicode_rstrip_impl(PyObject * self,PyObject * chars)12284 unicode_rstrip_impl(PyObject *self, PyObject *chars)
12285 /*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
12286 {
12287 return do_argstrip(self, RIGHTSTRIP, chars);
12288 }
12289
12290
12291 static PyObject*
unicode_repeat(PyObject * str,Py_ssize_t len)12292 unicode_repeat(PyObject *str, Py_ssize_t len)
12293 {
12294 PyObject *u;
12295 Py_ssize_t nchars, n;
12296
12297 if (len < 1)
12298 _Py_RETURN_UNICODE_EMPTY();
12299
12300 /* no repeat, return original string */
12301 if (len == 1)
12302 return unicode_result_unchanged(str);
12303
12304 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12305 PyErr_SetString(PyExc_OverflowError,
12306 "repeated string is too long");
12307 return NULL;
12308 }
12309 nchars = len * PyUnicode_GET_LENGTH(str);
12310
12311 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12312 if (!u)
12313 return NULL;
12314 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12315
12316 if (PyUnicode_GET_LENGTH(str) == 1) {
12317 int kind = PyUnicode_KIND(str);
12318 Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12319 if (kind == PyUnicode_1BYTE_KIND) {
12320 void *to = PyUnicode_DATA(u);
12321 memset(to, (unsigned char)fill_char, len);
12322 }
12323 else if (kind == PyUnicode_2BYTE_KIND) {
12324 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12325 for (n = 0; n < len; ++n)
12326 ucs2[n] = fill_char;
12327 } else {
12328 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12329 assert(kind == PyUnicode_4BYTE_KIND);
12330 for (n = 0; n < len; ++n)
12331 ucs4[n] = fill_char;
12332 }
12333 }
12334 else {
12335 Py_ssize_t char_size = PyUnicode_KIND(str);
12336 char *to = (char *) PyUnicode_DATA(u);
12337 _PyBytes_Repeat(to, nchars * char_size, PyUnicode_DATA(str),
12338 PyUnicode_GET_LENGTH(str) * char_size);
12339 }
12340
12341 assert(_PyUnicode_CheckConsistency(u, 1));
12342 return u;
12343 }
12344
12345 PyObject *
PyUnicode_Replace(PyObject * str,PyObject * substr,PyObject * replstr,Py_ssize_t maxcount)12346 PyUnicode_Replace(PyObject *str,
12347 PyObject *substr,
12348 PyObject *replstr,
12349 Py_ssize_t maxcount)
12350 {
12351 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12352 ensure_unicode(replstr) < 0)
12353 return NULL;
12354 return replace(str, substr, replstr, maxcount);
12355 }
12356
12357 /*[clinic input]
12358 str.replace as unicode_replace
12359
12360 old: unicode
12361 new: unicode
12362 /
12363 count: Py_ssize_t = -1
12364 Maximum number of occurrences to replace.
12365 -1 (the default value) means replace all occurrences.
12366
12367 Return a copy with all occurrences of substring old replaced by new.
12368
12369 If the optional argument count is given, only the first count occurrences are
12370 replaced.
12371 [clinic start generated code]*/
12372
12373 static PyObject *
unicode_replace_impl(PyObject * self,PyObject * old,PyObject * new,Py_ssize_t count)12374 unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12375 Py_ssize_t count)
12376 /*[clinic end generated code: output=b63f1a8b5eebf448 input=3345c455d60a5499]*/
12377 {
12378 return replace(self, old, new, count);
12379 }
12380
12381 /*[clinic input]
12382 str.removeprefix as unicode_removeprefix
12383
12384 prefix: unicode
12385 /
12386
12387 Return a str with the given prefix string removed if present.
12388
12389 If the string starts with the prefix string, return string[len(prefix):].
12390 Otherwise, return a copy of the original string.
12391 [clinic start generated code]*/
12392
12393 static PyObject *
unicode_removeprefix_impl(PyObject * self,PyObject * prefix)12394 unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
12395 /*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
12396 {
12397 int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
12398 if (match == -1) {
12399 return NULL;
12400 }
12401 if (match) {
12402 return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
12403 PyUnicode_GET_LENGTH(self));
12404 }
12405 return unicode_result_unchanged(self);
12406 }
12407
12408 /*[clinic input]
12409 str.removesuffix as unicode_removesuffix
12410
12411 suffix: unicode
12412 /
12413
12414 Return a str with the given suffix string removed if present.
12415
12416 If the string ends with the suffix string and that suffix is not empty,
12417 return string[:-len(suffix)]. Otherwise, return a copy of the original
12418 string.
12419 [clinic start generated code]*/
12420
12421 static PyObject *
unicode_removesuffix_impl(PyObject * self,PyObject * suffix)12422 unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
12423 /*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
12424 {
12425 int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
12426 if (match == -1) {
12427 return NULL;
12428 }
12429 if (match) {
12430 return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
12431 - PyUnicode_GET_LENGTH(suffix));
12432 }
12433 return unicode_result_unchanged(self);
12434 }
12435
12436 static PyObject *
unicode_repr(PyObject * unicode)12437 unicode_repr(PyObject *unicode)
12438 {
12439 PyObject *repr;
12440 Py_ssize_t isize;
12441 Py_ssize_t osize, squote, dquote, i, o;
12442 Py_UCS4 max, quote;
12443 int ikind, okind, unchanged;
12444 const void *idata;
12445 void *odata;
12446
12447 isize = PyUnicode_GET_LENGTH(unicode);
12448 idata = PyUnicode_DATA(unicode);
12449
12450 /* Compute length of output, quote characters, and
12451 maximum character */
12452 osize = 0;
12453 max = 127;
12454 squote = dquote = 0;
12455 ikind = PyUnicode_KIND(unicode);
12456 for (i = 0; i < isize; i++) {
12457 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12458 Py_ssize_t incr = 1;
12459 switch (ch) {
12460 case '\'': squote++; break;
12461 case '"': dquote++; break;
12462 case '\\': case '\t': case '\r': case '\n':
12463 incr = 2;
12464 break;
12465 default:
12466 /* Fast-path ASCII */
12467 if (ch < ' ' || ch == 0x7f)
12468 incr = 4; /* \xHH */
12469 else if (ch < 0x7f)
12470 ;
12471 else if (Py_UNICODE_ISPRINTABLE(ch))
12472 max = ch > max ? ch : max;
12473 else if (ch < 0x100)
12474 incr = 4; /* \xHH */
12475 else if (ch < 0x10000)
12476 incr = 6; /* \uHHHH */
12477 else
12478 incr = 10; /* \uHHHHHHHH */
12479 }
12480 if (osize > PY_SSIZE_T_MAX - incr) {
12481 PyErr_SetString(PyExc_OverflowError,
12482 "string is too long to generate repr");
12483 return NULL;
12484 }
12485 osize += incr;
12486 }
12487
12488 quote = '\'';
12489 unchanged = (osize == isize);
12490 if (squote) {
12491 unchanged = 0;
12492 if (dquote)
12493 /* Both squote and dquote present. Use squote,
12494 and escape them */
12495 osize += squote;
12496 else
12497 quote = '"';
12498 }
12499 osize += 2; /* quotes */
12500
12501 repr = PyUnicode_New(osize, max);
12502 if (repr == NULL)
12503 return NULL;
12504 okind = PyUnicode_KIND(repr);
12505 odata = PyUnicode_DATA(repr);
12506
12507 PyUnicode_WRITE(okind, odata, 0, quote);
12508 PyUnicode_WRITE(okind, odata, osize-1, quote);
12509 if (unchanged) {
12510 _PyUnicode_FastCopyCharacters(repr, 1,
12511 unicode, 0,
12512 isize);
12513 }
12514 else {
12515 for (i = 0, o = 1; i < isize; i++) {
12516 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12517
12518 /* Escape quotes and backslashes */
12519 if ((ch == quote) || (ch == '\\')) {
12520 PyUnicode_WRITE(okind, odata, o++, '\\');
12521 PyUnicode_WRITE(okind, odata, o++, ch);
12522 continue;
12523 }
12524
12525 /* Map special whitespace to '\t', \n', '\r' */
12526 if (ch == '\t') {
12527 PyUnicode_WRITE(okind, odata, o++, '\\');
12528 PyUnicode_WRITE(okind, odata, o++, 't');
12529 }
12530 else if (ch == '\n') {
12531 PyUnicode_WRITE(okind, odata, o++, '\\');
12532 PyUnicode_WRITE(okind, odata, o++, 'n');
12533 }
12534 else if (ch == '\r') {
12535 PyUnicode_WRITE(okind, odata, o++, '\\');
12536 PyUnicode_WRITE(okind, odata, o++, 'r');
12537 }
12538
12539 /* Map non-printable US ASCII to '\xhh' */
12540 else if (ch < ' ' || ch == 0x7F) {
12541 PyUnicode_WRITE(okind, odata, o++, '\\');
12542 PyUnicode_WRITE(okind, odata, o++, 'x');
12543 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12544 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12545 }
12546
12547 /* Copy ASCII characters as-is */
12548 else if (ch < 0x7F) {
12549 PyUnicode_WRITE(okind, odata, o++, ch);
12550 }
12551
12552 /* Non-ASCII characters */
12553 else {
12554 /* Map Unicode whitespace and control characters
12555 (categories Z* and C* except ASCII space)
12556 */
12557 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12558 PyUnicode_WRITE(okind, odata, o++, '\\');
12559 /* Map 8-bit characters to '\xhh' */
12560 if (ch <= 0xff) {
12561 PyUnicode_WRITE(okind, odata, o++, 'x');
12562 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12563 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12564 }
12565 /* Map 16-bit characters to '\uxxxx' */
12566 else if (ch <= 0xffff) {
12567 PyUnicode_WRITE(okind, odata, o++, 'u');
12568 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12569 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12570 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12571 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12572 }
12573 /* Map 21-bit characters to '\U00xxxxxx' */
12574 else {
12575 PyUnicode_WRITE(okind, odata, o++, 'U');
12576 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12577 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12578 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12579 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12580 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12581 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12582 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12583 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12584 }
12585 }
12586 /* Copy characters as-is */
12587 else {
12588 PyUnicode_WRITE(okind, odata, o++, ch);
12589 }
12590 }
12591 }
12592 }
12593 /* Closing quote already added at the beginning */
12594 assert(_PyUnicode_CheckConsistency(repr, 1));
12595 return repr;
12596 }
12597
12598 /*[clinic input]
12599 str.rfind as unicode_rfind = str.count
12600
12601 Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
12602
12603 Optional arguments start and end are interpreted as in slice notation.
12604 Return -1 on failure.
12605 [clinic start generated code]*/
12606
12607 static Py_ssize_t
unicode_rfind_impl(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end)12608 unicode_rfind_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12609 Py_ssize_t end)
12610 /*[clinic end generated code: output=880b29f01dd014c8 input=898361fb71f59294]*/
12611 {
12612 Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
12613 if (result < 0) {
12614 return -1;
12615 }
12616 return result;
12617 }
12618
12619 /*[clinic input]
12620 str.rindex as unicode_rindex = str.count
12621
12622 Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
12623
12624 Optional arguments start and end are interpreted as in slice notation.
12625 Raises ValueError when the substring is not found.
12626 [clinic start generated code]*/
12627
12628 static Py_ssize_t
unicode_rindex_impl(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end)12629 unicode_rindex_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12630 Py_ssize_t end)
12631 /*[clinic end generated code: output=5f3aef124c867fe1 input=35943dead6c1ea9d]*/
12632 {
12633 Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
12634 if (result == -1) {
12635 PyErr_SetString(PyExc_ValueError, "substring not found");
12636 }
12637 else if (result < 0) {
12638 return -1;
12639 }
12640 return result;
12641 }
12642
12643 /*[clinic input]
12644 str.rjust as unicode_rjust
12645
12646 width: Py_ssize_t
12647 fillchar: Py_UCS4 = ' '
12648 /
12649
12650 Return a right-justified string of length width.
12651
12652 Padding is done using the specified fill character (default is a space).
12653 [clinic start generated code]*/
12654
12655 static PyObject *
unicode_rjust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)12656 unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12657 /*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
12658 {
12659 if (PyUnicode_GET_LENGTH(self) >= width)
12660 return unicode_result_unchanged(self);
12661
12662 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12663 }
12664
12665 PyObject *
PyUnicode_Split(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)12666 PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12667 {
12668 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
12669 return NULL;
12670
12671 return split(s, sep, maxsplit);
12672 }
12673
12674 /*[clinic input]
12675 str.split as unicode_split
12676
12677 sep: object = None
12678 The separator used to split the string.
12679
12680 When set to None (the default value), will split on any whitespace
12681 character (including \n \r \t \f and spaces) and will discard
12682 empty strings from the result.
12683 maxsplit: Py_ssize_t = -1
12684 Maximum number of splits.
12685 -1 (the default value) means no limit.
12686
12687 Return a list of the substrings in the string, using sep as the separator string.
12688
12689 Splitting starts at the front of the string and works to the end.
12690
12691 Note, str.split() is mainly useful for data that has been intentionally
12692 delimited. With natural text that includes punctuation, consider using
12693 the regular expression module.
12694
12695 [clinic start generated code]*/
12696
12697 static PyObject *
unicode_split_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)12698 unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12699 /*[clinic end generated code: output=3a65b1db356948dc input=a29bcc0c7a5af0eb]*/
12700 {
12701 if (sep == Py_None)
12702 return split(self, NULL, maxsplit);
12703 if (PyUnicode_Check(sep))
12704 return split(self, sep, maxsplit);
12705
12706 PyErr_Format(PyExc_TypeError,
12707 "must be str or None, not %.100s",
12708 Py_TYPE(sep)->tp_name);
12709 return NULL;
12710 }
12711
12712 PyObject *
PyUnicode_Partition(PyObject * str_obj,PyObject * sep_obj)12713 PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
12714 {
12715 PyObject* out;
12716 int kind1, kind2;
12717 const void *buf1, *buf2;
12718 Py_ssize_t len1, len2;
12719
12720 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12721 return NULL;
12722
12723 kind1 = PyUnicode_KIND(str_obj);
12724 kind2 = PyUnicode_KIND(sep_obj);
12725 len1 = PyUnicode_GET_LENGTH(str_obj);
12726 len2 = PyUnicode_GET_LENGTH(sep_obj);
12727 if (kind1 < kind2 || len1 < len2) {
12728 PyObject *empty = unicode_get_empty(); // Borrowed reference
12729 return PyTuple_Pack(3, str_obj, empty, empty);
12730 }
12731 buf1 = PyUnicode_DATA(str_obj);
12732 buf2 = PyUnicode_DATA(sep_obj);
12733 if (kind2 != kind1) {
12734 buf2 = unicode_askind(kind2, buf2, len2, kind1);
12735 if (!buf2)
12736 return NULL;
12737 }
12738
12739 switch (kind1) {
12740 case PyUnicode_1BYTE_KIND:
12741 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12742 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12743 else
12744 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12745 break;
12746 case PyUnicode_2BYTE_KIND:
12747 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12748 break;
12749 case PyUnicode_4BYTE_KIND:
12750 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12751 break;
12752 default:
12753 Py_UNREACHABLE();
12754 }
12755
12756 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
12757 if (kind2 != kind1)
12758 PyMem_Free((void *)buf2);
12759
12760 return out;
12761 }
12762
12763
12764 PyObject *
PyUnicode_RPartition(PyObject * str_obj,PyObject * sep_obj)12765 PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
12766 {
12767 PyObject* out;
12768 int kind1, kind2;
12769 const void *buf1, *buf2;
12770 Py_ssize_t len1, len2;
12771
12772 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12773 return NULL;
12774
12775 kind1 = PyUnicode_KIND(str_obj);
12776 kind2 = PyUnicode_KIND(sep_obj);
12777 len1 = PyUnicode_GET_LENGTH(str_obj);
12778 len2 = PyUnicode_GET_LENGTH(sep_obj);
12779 if (kind1 < kind2 || len1 < len2) {
12780 PyObject *empty = unicode_get_empty(); // Borrowed reference
12781 return PyTuple_Pack(3, empty, empty, str_obj);
12782 }
12783 buf1 = PyUnicode_DATA(str_obj);
12784 buf2 = PyUnicode_DATA(sep_obj);
12785 if (kind2 != kind1) {
12786 buf2 = unicode_askind(kind2, buf2, len2, kind1);
12787 if (!buf2)
12788 return NULL;
12789 }
12790
12791 switch (kind1) {
12792 case PyUnicode_1BYTE_KIND:
12793 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12794 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12795 else
12796 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12797 break;
12798 case PyUnicode_2BYTE_KIND:
12799 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12800 break;
12801 case PyUnicode_4BYTE_KIND:
12802 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12803 break;
12804 default:
12805 Py_UNREACHABLE();
12806 }
12807
12808 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
12809 if (kind2 != kind1)
12810 PyMem_Free((void *)buf2);
12811
12812 return out;
12813 }
12814
12815 /*[clinic input]
12816 str.partition as unicode_partition
12817
12818 sep: object
12819 /
12820
12821 Partition the string into three parts using the given separator.
12822
12823 This will search for the separator in the string. If the separator is found,
12824 returns a 3-tuple containing the part before the separator, the separator
12825 itself, and the part after it.
12826
12827 If the separator is not found, returns a 3-tuple containing the original string
12828 and two empty strings.
12829 [clinic start generated code]*/
12830
12831 static PyObject *
unicode_partition(PyObject * self,PyObject * sep)12832 unicode_partition(PyObject *self, PyObject *sep)
12833 /*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
12834 {
12835 return PyUnicode_Partition(self, sep);
12836 }
12837
12838 /*[clinic input]
12839 str.rpartition as unicode_rpartition = str.partition
12840
12841 Partition the string into three parts using the given separator.
12842
12843 This will search for the separator in the string, starting at the end. If
12844 the separator is found, returns a 3-tuple containing the part before the
12845 separator, the separator itself, and the part after it.
12846
12847 If the separator is not found, returns a 3-tuple containing two empty strings
12848 and the original string.
12849 [clinic start generated code]*/
12850
12851 static PyObject *
unicode_rpartition(PyObject * self,PyObject * sep)12852 unicode_rpartition(PyObject *self, PyObject *sep)
12853 /*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
12854 {
12855 return PyUnicode_RPartition(self, sep);
12856 }
12857
12858 PyObject *
PyUnicode_RSplit(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)12859 PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12860 {
12861 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
12862 return NULL;
12863
12864 return rsplit(s, sep, maxsplit);
12865 }
12866
12867 /*[clinic input]
12868 str.rsplit as unicode_rsplit = str.split
12869
12870 Return a list of the substrings in the string, using sep as the separator string.
12871
12872 Splitting starts at the end of the string and works to the front.
12873 [clinic start generated code]*/
12874
12875 static PyObject *
unicode_rsplit_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)12876 unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12877 /*[clinic end generated code: output=c2b815c63bcabffc input=ea78406060fce33c]*/
12878 {
12879 if (sep == Py_None)
12880 return rsplit(self, NULL, maxsplit);
12881 if (PyUnicode_Check(sep))
12882 return rsplit(self, sep, maxsplit);
12883
12884 PyErr_Format(PyExc_TypeError,
12885 "must be str or None, not %.100s",
12886 Py_TYPE(sep)->tp_name);
12887 return NULL;
12888 }
12889
12890 /*[clinic input]
12891 str.splitlines as unicode_splitlines
12892
12893 keepends: bool = False
12894
12895 Return a list of the lines in the string, breaking at line boundaries.
12896
12897 Line breaks are not included in the resulting list unless keepends is given and
12898 true.
12899 [clinic start generated code]*/
12900
12901 static PyObject *
unicode_splitlines_impl(PyObject * self,int keepends)12902 unicode_splitlines_impl(PyObject *self, int keepends)
12903 /*[clinic end generated code: output=f664dcdad153ec40 input=ba6ad05ee85d2b55]*/
12904 {
12905 return PyUnicode_Splitlines(self, keepends);
12906 }
12907
12908 static
unicode_str(PyObject * self)12909 PyObject *unicode_str(PyObject *self)
12910 {
12911 return unicode_result_unchanged(self);
12912 }
12913
12914 /*[clinic input]
12915 str.swapcase as unicode_swapcase
12916
12917 Convert uppercase characters to lowercase and lowercase characters to uppercase.
12918 [clinic start generated code]*/
12919
12920 static PyObject *
unicode_swapcase_impl(PyObject * self)12921 unicode_swapcase_impl(PyObject *self)
12922 /*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
12923 {
12924 return case_operation(self, do_swapcase);
12925 }
12926
12927 /*[clinic input]
12928
12929 @staticmethod
12930 str.maketrans as unicode_maketrans
12931
12932 x: object
12933
12934 y: unicode=NULL
12935
12936 z: unicode=NULL
12937
12938 /
12939
12940 Return a translation table usable for str.translate().
12941
12942 If there is only one argument, it must be a dictionary mapping Unicode
12943 ordinals (integers) or characters to Unicode ordinals, strings or None.
12944 Character keys will be then converted to ordinals.
12945 If there are two arguments, they must be strings of equal length, and
12946 in the resulting dictionary, each character in x will be mapped to the
12947 character at the same position in y. If there is a third argument, it
12948 must be a string, whose characters will be mapped to None in the result.
12949 [clinic start generated code]*/
12950
12951 static PyObject *
unicode_maketrans_impl(PyObject * x,PyObject * y,PyObject * z)12952 unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
12953 /*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
12954 {
12955 PyObject *new = NULL, *key, *value;
12956 Py_ssize_t i = 0;
12957 int res;
12958
12959 new = PyDict_New();
12960 if (!new)
12961 return NULL;
12962 if (y != NULL) {
12963 int x_kind, y_kind, z_kind;
12964 const void *x_data, *y_data, *z_data;
12965
12966 /* x must be a string too, of equal length */
12967 if (!PyUnicode_Check(x)) {
12968 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12969 "be a string if there is a second argument");
12970 goto err;
12971 }
12972 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
12973 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12974 "arguments must have equal length");
12975 goto err;
12976 }
12977 /* create entries for translating chars in x to those in y */
12978 x_kind = PyUnicode_KIND(x);
12979 y_kind = PyUnicode_KIND(y);
12980 x_data = PyUnicode_DATA(x);
12981 y_data = PyUnicode_DATA(y);
12982 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12983 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12984 if (!key)
12985 goto err;
12986 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
12987 if (!value) {
12988 Py_DECREF(key);
12989 goto err;
12990 }
12991 res = PyDict_SetItem(new, key, value);
12992 Py_DECREF(key);
12993 Py_DECREF(value);
12994 if (res < 0)
12995 goto err;
12996 }
12997 /* create entries for deleting chars in z */
12998 if (z != NULL) {
12999 z_kind = PyUnicode_KIND(z);
13000 z_data = PyUnicode_DATA(z);
13001 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
13002 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
13003 if (!key)
13004 goto err;
13005 res = PyDict_SetItem(new, key, Py_None);
13006 Py_DECREF(key);
13007 if (res < 0)
13008 goto err;
13009 }
13010 }
13011 } else {
13012 int kind;
13013 const void *data;
13014
13015 /* x must be a dict */
13016 if (!PyDict_CheckExact(x)) {
13017 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13018 "to maketrans it must be a dict");
13019 goto err;
13020 }
13021 /* copy entries into the new dict, converting string keys to int keys */
13022 while (PyDict_Next(x, &i, &key, &value)) {
13023 if (PyUnicode_Check(key)) {
13024 /* convert string keys to integer keys */
13025 PyObject *newkey;
13026 if (PyUnicode_GET_LENGTH(key) != 1) {
13027 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13028 "table must be of length 1");
13029 goto err;
13030 }
13031 kind = PyUnicode_KIND(key);
13032 data = PyUnicode_DATA(key);
13033 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13034 if (!newkey)
13035 goto err;
13036 res = PyDict_SetItem(new, newkey, value);
13037 Py_DECREF(newkey);
13038 if (res < 0)
13039 goto err;
13040 } else if (PyLong_Check(key)) {
13041 /* just keep integer keys */
13042 if (PyDict_SetItem(new, key, value) < 0)
13043 goto err;
13044 } else {
13045 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13046 "be strings or integers");
13047 goto err;
13048 }
13049 }
13050 }
13051 return new;
13052 err:
13053 Py_DECREF(new);
13054 return NULL;
13055 }
13056
13057 /*[clinic input]
13058 str.translate as unicode_translate
13059
13060 table: object
13061 Translation table, which must be a mapping of Unicode ordinals to
13062 Unicode ordinals, strings, or None.
13063 /
13064
13065 Replace each character in the string using the given translation table.
13066
13067 The table must implement lookup/indexing via __getitem__, for instance a
13068 dictionary or list. If this operation raises LookupError, the character is
13069 left untouched. Characters mapped to None are deleted.
13070 [clinic start generated code]*/
13071
static PyObject *
unicode_translate(PyObject *self, PyObject *table)
/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
{
    /* str.translate(): the whole job is delegated to
       _PyUnicode_TranslateCharmap(), always passing "ignore" as the
       error handler name. */
    return _PyUnicode_TranslateCharmap(self, table, "ignore");
}
13078
13079 /*[clinic input]
13080 str.upper as unicode_upper
13081
13082 Return a copy of the string converted to uppercase.
13083 [clinic start generated code]*/
13084
13085 static PyObject *
unicode_upper_impl(PyObject * self)13086 unicode_upper_impl(PyObject *self)
13087 /*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
13088 {
13089 if (PyUnicode_IS_ASCII(self))
13090 return ascii_upper_or_lower(self, 0);
13091 return case_operation(self, do_upper);
13092 }
13093
13094 /*[clinic input]
13095 str.zfill as unicode_zfill
13096
13097 width: Py_ssize_t
13098 /
13099
13100 Pad a numeric string with zeros on the left, to fill a field of the given width.
13101
13102 The string is never truncated.
13103 [clinic start generated code]*/
13104
13105 static PyObject *
unicode_zfill_impl(PyObject * self,Py_ssize_t width)13106 unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13107 /*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
13108 {
13109 Py_ssize_t fill;
13110 PyObject *u;
13111 int kind;
13112 const void *data;
13113 Py_UCS4 chr;
13114
13115 if (PyUnicode_GET_LENGTH(self) >= width)
13116 return unicode_result_unchanged(self);
13117
13118 fill = width - PyUnicode_GET_LENGTH(self);
13119
13120 u = pad(self, fill, 0, '0');
13121
13122 if (u == NULL)
13123 return NULL;
13124
13125 kind = PyUnicode_KIND(u);
13126 data = PyUnicode_DATA(u);
13127 chr = PyUnicode_READ(kind, data, fill);
13128
13129 if (chr == '+' || chr == '-') {
13130 /* move sign to beginning of string */
13131 PyUnicode_WRITE(kind, data, 0, chr);
13132 PyUnicode_WRITE(kind, data, fill, '0');
13133 }
13134
13135 assert(_PyUnicode_CheckConsistency(u, 1));
13136 return u;
13137 }
13138
13139 /*[clinic input]
13140 @text_signature "($self, prefix[, start[, end]], /)"
13141 str.startswith as unicode_startswith
13142
13143 prefix as subobj: object
13144 A string or a tuple of strings to try.
13145 start: slice_index(accept={int, NoneType}, c_default='0') = None
13146 Optional start position. Default: start of the string.
13147 end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13148 Optional stop position. Default: end of the string.
13149 /
13150
13151 Return True if the string starts with the specified prefix, False otherwise.
13152 [clinic start generated code]*/
13153
13154 static PyObject *
unicode_startswith_impl(PyObject * self,PyObject * subobj,Py_ssize_t start,Py_ssize_t end)13155 unicode_startswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13156 Py_ssize_t end)
13157 /*[clinic end generated code: output=4bd7cfd0803051d4 input=5f918b5f5f89d856]*/
13158 {
13159 if (PyTuple_Check(subobj)) {
13160 Py_ssize_t i;
13161 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13162 PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13163 if (!PyUnicode_Check(substring)) {
13164 PyErr_Format(PyExc_TypeError,
13165 "tuple for startswith must only contain str, "
13166 "not %.100s",
13167 Py_TYPE(substring)->tp_name);
13168 return NULL;
13169 }
13170 int result = tailmatch(self, substring, start, end, -1);
13171 if (result < 0) {
13172 return NULL;
13173 }
13174 if (result) {
13175 Py_RETURN_TRUE;
13176 }
13177 }
13178 /* nothing matched */
13179 Py_RETURN_FALSE;
13180 }
13181 if (!PyUnicode_Check(subobj)) {
13182 PyErr_Format(PyExc_TypeError,
13183 "startswith first arg must be str or "
13184 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13185 return NULL;
13186 }
13187 int result = tailmatch(self, subobj, start, end, -1);
13188 if (result < 0) {
13189 return NULL;
13190 }
13191 return PyBool_FromLong(result);
13192 }
13193
13194
13195 /*[clinic input]
13196 @text_signature "($self, suffix[, start[, end]], /)"
13197 str.endswith as unicode_endswith
13198
13199 suffix as subobj: object
13200 A string or a tuple of strings to try.
13201 start: slice_index(accept={int, NoneType}, c_default='0') = None
13202 Optional start position. Default: start of the string.
13203 end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13204 Optional stop position. Default: end of the string.
13205 /
13206
13207 Return True if the string ends with the specified suffix, False otherwise.
13208 [clinic start generated code]*/
13209
13210 static PyObject *
unicode_endswith_impl(PyObject * self,PyObject * subobj,Py_ssize_t start,Py_ssize_t end)13211 unicode_endswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13212 Py_ssize_t end)
13213 /*[clinic end generated code: output=cce6f8ceb0102ca9 input=00fbdc774a7d4d71]*/
13214 {
13215 if (PyTuple_Check(subobj)) {
13216 Py_ssize_t i;
13217 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13218 PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13219 if (!PyUnicode_Check(substring)) {
13220 PyErr_Format(PyExc_TypeError,
13221 "tuple for endswith must only contain str, "
13222 "not %.100s",
13223 Py_TYPE(substring)->tp_name);
13224 return NULL;
13225 }
13226 int result = tailmatch(self, substring, start, end, +1);
13227 if (result < 0) {
13228 return NULL;
13229 }
13230 if (result) {
13231 Py_RETURN_TRUE;
13232 }
13233 }
13234 Py_RETURN_FALSE;
13235 }
13236 if (!PyUnicode_Check(subobj)) {
13237 PyErr_Format(PyExc_TypeError,
13238 "endswith first arg must be str or "
13239 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13240 return NULL;
13241 }
13242 int result = tailmatch(self, subobj, start, end, +1);
13243 if (result < 0) {
13244 return NULL;
13245 }
13246 return PyBool_FromLong(result);
13247 }
13248
13249 static inline void
_PyUnicodeWriter_Update(_PyUnicodeWriter * writer)13250 _PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
13251 {
13252 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13253 writer->data = PyUnicode_DATA(writer->buffer);
13254
13255 if (!writer->readonly) {
13256 writer->kind = PyUnicode_KIND(writer->buffer);
13257 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13258 }
13259 else {
13260 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13261 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13262 writer->kind = 0;
13263 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13264
13265 /* Copy-on-write mode: set buffer size to 0 so
13266 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13267 * next write. */
13268 writer->size = 0;
13269 }
13270 }
13271
13272 void
_PyUnicodeWriter_Init(_PyUnicodeWriter * writer)13273 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
13274 {
13275 memset(writer, 0, sizeof(*writer));
13276
13277 /* ASCII is the bare minimum */
13278 writer->min_char = 127;
13279
13280 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13281 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13282 writer->kind = 0;
13283 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13284 }
13285
13286 // Initialize _PyUnicodeWriter with initial buffer
13287 static inline void
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter * writer,PyObject * buffer)13288 _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13289 {
13290 memset(writer, 0, sizeof(*writer));
13291 writer->buffer = buffer;
13292 _PyUnicodeWriter_Update(writer);
13293 writer->min_length = writer->size;
13294 }
13295
/* Make room in the writer for `length` more characters after writer->pos,
   stored wide enough for code points up to `maxchar` (raised to at least
   writer->min_char).

   Three situations are handled:
     - no buffer yet: a new string object is allocated;
     - the buffer is too short: it is grown, either by allocating a wider
       (or writable) replacement and copying the first writer->pos
       characters, or by resize_compact() when the width is sufficient;
     - the buffer is long enough but too narrow: a wider object of the same
       size is allocated and the first writer->pos characters are copied.
   When nothing needs to change, only the cached fields are refreshed.

   Return 0 on success, -1 on failure.  PyErr_NoMemory() is raised here when
   writer->pos + length would overflow; other failures come from the
   allocation helpers. */
int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
                                 Py_ssize_t length, Py_UCS4 maxchar)
{
    Py_ssize_t newlen;
    PyObject *newbuffer;

    assert(maxchar <= MAX_UNICODE);

    /* ensure that the _PyUnicodeWriter_Prepare macro was used */
    assert((maxchar > writer->maxchar && length >= 0)
           || length > 0);

    /* writer->pos + length must fit in Py_ssize_t */
    if (length > PY_SSIZE_T_MAX - writer->pos) {
        PyErr_NoMemory();
        return -1;
    }
    newlen = writer->pos + length;

    /* never create a buffer narrower than the writer's configured minimum */
    maxchar = Py_MAX(maxchar, writer->min_char);

    if (writer->buffer == NULL) {
        /* First allocation. */
        assert(!writer->readonly);
        if (writer->overallocate
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
            /* overallocate to limit the number of realloc() */
            newlen += newlen / OVERALLOCATE_FACTOR;
        }
        if (newlen < writer->min_length)
            newlen = writer->min_length;

        writer->buffer = PyUnicode_New(newlen, maxchar);
        if (writer->buffer == NULL)
            return -1;
    }
    else if (newlen > writer->size) {
        /* Existing buffer is too short. */
        if (writer->overallocate
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
            /* overallocate to limit the number of realloc() */
            newlen += newlen / OVERALLOCATE_FACTOR;
        }
        if (newlen < writer->min_length)
            newlen = writer->min_length;

        if (maxchar > writer->maxchar || writer->readonly) {
            /* resize + widen */
            /* A read-only buffer is also replaced here, so the writer ends
               up with a buffer of its own (readonly is cleared below). */
            maxchar = Py_MAX(maxchar, writer->maxchar);
            newbuffer = PyUnicode_New(newlen, maxchar);
            if (newbuffer == NULL)
                return -1;
            _PyUnicode_FastCopyCharacters(newbuffer, 0,
                                          writer->buffer, 0, writer->pos);
            Py_DECREF(writer->buffer);
            writer->readonly = 0;
        }
        else {
            /* Same width: resize the existing object. */
            newbuffer = resize_compact(writer->buffer, newlen);
            if (newbuffer == NULL)
                return -1;
        }
        writer->buffer = newbuffer;
    }
    else if (maxchar > writer->maxchar) {
        /* Long enough, but too narrow: widen at the same size. */
        assert(!writer->readonly);
        newbuffer = PyUnicode_New(writer->size, maxchar);
        if (newbuffer == NULL)
            return -1;
        _PyUnicode_FastCopyCharacters(newbuffer, 0,
                                      writer->buffer, 0, writer->pos);
        Py_SETREF(writer->buffer, newbuffer);
    }
    /* re-read maxchar/data/kind/size from the (possibly new) buffer */
    _PyUnicodeWriter_Update(writer);
    return 0;

    /* OVERALLOCATE_FACTOR is defined outside this function (not visible
       here); it is no longer available after this point. */
#undef OVERALLOCATE_FACTOR
}
13372
13373 int
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter * writer,int kind)13374 _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13375 int kind)
13376 {
13377 Py_UCS4 maxchar;
13378
13379 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13380 assert(writer->kind < kind);
13381
13382 switch (kind)
13383 {
13384 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13385 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13386 case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
13387 default:
13388 Py_UNREACHABLE();
13389 }
13390
13391 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13392 }
13393
13394 static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter * writer,Py_UCS4 ch)13395 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
13396 {
13397 assert(ch <= MAX_UNICODE);
13398 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13399 return -1;
13400 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13401 writer->pos++;
13402 return 0;
13403 }
13404
/* Non-inline entry point for appending a single code point; simply
   forwards to _PyUnicodeWriter_WriteCharInline() and returns its result
   (0 on success, -1 on failure). */
int
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
{
    return _PyUnicodeWriter_WriteCharInline(writer, ch);
}
13410
13411 int
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter * writer,PyObject * str)13412 _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13413 {
13414 Py_UCS4 maxchar;
13415 Py_ssize_t len;
13416
13417 len = PyUnicode_GET_LENGTH(str);
13418 if (len == 0)
13419 return 0;
13420 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13421 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13422 if (writer->buffer == NULL && !writer->overallocate) {
13423 assert(_PyUnicode_CheckConsistency(str, 1));
13424 writer->readonly = 1;
13425 writer->buffer = Py_NewRef(str);
13426 _PyUnicodeWriter_Update(writer);
13427 writer->pos += len;
13428 return 0;
13429 }
13430 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13431 return -1;
13432 }
13433 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13434 str, 0, len);
13435 writer->pos += len;
13436 return 0;
13437 }
13438
13439 int
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t start,Py_ssize_t end)13440 _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13441 Py_ssize_t start, Py_ssize_t end)
13442 {
13443 Py_UCS4 maxchar;
13444 Py_ssize_t len;
13445
13446 assert(0 <= start);
13447 assert(end <= PyUnicode_GET_LENGTH(str));
13448 assert(start <= end);
13449
13450 if (end == 0)
13451 return 0;
13452
13453 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13454 return _PyUnicodeWriter_WriteStr(writer, str);
13455
13456 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13457 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13458 else
13459 maxchar = writer->maxchar;
13460 len = end - start;
13461
13462 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13463 return -1;
13464
13465 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13466 str, start, len);
13467 writer->pos += len;
13468 return 0;
13469 }
13470
13471 int
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter * writer,const char * ascii,Py_ssize_t len)13472 _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13473 const char *ascii, Py_ssize_t len)
13474 {
13475 if (len == -1)
13476 len = strlen(ascii);
13477
13478 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
13479
13480 if (writer->buffer == NULL && !writer->overallocate) {
13481 PyObject *str;
13482
13483 str = _PyUnicode_FromASCII(ascii, len);
13484 if (str == NULL)
13485 return -1;
13486
13487 writer->readonly = 1;
13488 writer->buffer = str;
13489 _PyUnicodeWriter_Update(writer);
13490 writer->pos += len;
13491 return 0;
13492 }
13493
13494 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13495 return -1;
13496
13497 switch (writer->kind)
13498 {
13499 case PyUnicode_1BYTE_KIND:
13500 {
13501 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13502 Py_UCS1 *data = writer->data;
13503
13504 memcpy(data + writer->pos, str, len);
13505 break;
13506 }
13507 case PyUnicode_2BYTE_KIND:
13508 {
13509 _PyUnicode_CONVERT_BYTES(
13510 Py_UCS1, Py_UCS2,
13511 ascii, ascii + len,
13512 (Py_UCS2 *)writer->data + writer->pos);
13513 break;
13514 }
13515 case PyUnicode_4BYTE_KIND:
13516 {
13517 _PyUnicode_CONVERT_BYTES(
13518 Py_UCS1, Py_UCS4,
13519 ascii, ascii + len,
13520 (Py_UCS4 *)writer->data + writer->pos);
13521 break;
13522 }
13523 default:
13524 Py_UNREACHABLE();
13525 }
13526
13527 writer->pos += len;
13528 return 0;
13529 }
13530
13531 int
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter * writer,const char * str,Py_ssize_t len)13532 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13533 const char *str, Py_ssize_t len)
13534 {
13535 Py_UCS4 maxchar;
13536
13537 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
13538 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13539 return -1;
13540 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13541 writer->pos += len;
13542 return 0;
13543 }
13544
13545 PyObject *
_PyUnicodeWriter_Finish(_PyUnicodeWriter * writer)13546 _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
13547 {
13548 PyObject *str;
13549
13550 if (writer->pos == 0) {
13551 Py_CLEAR(writer->buffer);
13552 _Py_RETURN_UNICODE_EMPTY();
13553 }
13554
13555 str = writer->buffer;
13556 writer->buffer = NULL;
13557
13558 if (writer->readonly) {
13559 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13560 return str;
13561 }
13562
13563 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13564 PyObject *str2;
13565 str2 = resize_compact(str, writer->pos);
13566 if (str2 == NULL) {
13567 Py_DECREF(str);
13568 return NULL;
13569 }
13570 str = str2;
13571 }
13572
13573 assert(_PyUnicode_CheckConsistency(str, 1));
13574 return unicode_result(str);
13575 }
13576
/* Release the writer's reference to its buffer (if any) and reset the
   buffer pointer to NULL.  No other writer field is touched. */
void
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
{
    Py_CLEAR(writer->buffer);
}
13582
13583 #include "stringlib/unicode_format.h"
13584
/* Docstring used by the "format" entry of unicode_methods. */
PyDoc_STRVAR(format__doc__,
"format($self, /, *args, **kwargs)\n\
--\n\
\n\
Return a formatted version of the string, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");

/* Docstring used by the "format_map" entry of unicode_methods. */
PyDoc_STRVAR(format_map__doc__,
"format_map($self, mapping, /)\n\
--\n\
\n\
Return a formatted version of the string, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
13598
13599 /*[clinic input]
13600 str.__format__ as unicode___format__
13601
13602 format_spec: unicode
13603 /
13604
13605 Return a formatted version of the string as described by format_spec.
13606 [clinic start generated code]*/
13607
13608 static PyObject *
unicode___format___impl(PyObject * self,PyObject * format_spec)13609 unicode___format___impl(PyObject *self, PyObject *format_spec)
13610 /*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
13611 {
13612 _PyUnicodeWriter writer;
13613 int ret;
13614
13615 _PyUnicodeWriter_Init(&writer);
13616 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13617 self, format_spec, 0,
13618 PyUnicode_GET_LENGTH(format_spec));
13619 if (ret == -1) {
13620 _PyUnicodeWriter_Dealloc(&writer);
13621 return NULL;
13622 }
13623 return _PyUnicodeWriter_Finish(&writer);
13624 }
13625
13626 /*[clinic input]
13627 str.__sizeof__ as unicode_sizeof
13628
13629 Return the size of the string in memory, in bytes.
13630 [clinic start generated code]*/
13631
13632 static PyObject *
unicode_sizeof_impl(PyObject * self)13633 unicode_sizeof_impl(PyObject *self)
13634 /*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
13635 {
13636 Py_ssize_t size;
13637
13638 /* If it's a compact object, account for base structure +
13639 character data. */
13640 if (PyUnicode_IS_COMPACT_ASCII(self)) {
13641 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13642 }
13643 else if (PyUnicode_IS_COMPACT(self)) {
13644 size = sizeof(PyCompactUnicodeObject) +
13645 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
13646 }
13647 else {
13648 /* If it is a two-block object, account for base object, and
13649 for character block if present. */
13650 size = sizeof(PyUnicodeObject);
13651 if (_PyUnicode_DATA_ANY(self))
13652 size += (PyUnicode_GET_LENGTH(self) + 1) *
13653 PyUnicode_KIND(self);
13654 }
13655 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13656 size += PyUnicode_UTF8_LENGTH(self) + 1;
13657
13658 return PyLong_FromSsize_t(size);
13659 }
13660
13661 static PyObject *
unicode_getnewargs(PyObject * v,PyObject * Py_UNUSED (ignored))13662 unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
13663 {
13664 PyObject *copy = _PyUnicode_Copy(v);
13665 if (!copy)
13666 return NULL;
13667 return Py_BuildValue("(N)", copy);
13668 }
13669
/* Method table for the str type.  The UNICODE_*_METHODDEF entries are
   macros defined elsewhere (presumably generated by Argument Clinic --
   confirm); "format", "format_map" and "__getnewargs__" are written out by
   hand.  The table ends with the {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {
    UNICODE_ENCODE_METHODDEF
    UNICODE_REPLACE_METHODDEF
    UNICODE_SPLIT_METHODDEF
    UNICODE_RSPLIT_METHODDEF
    UNICODE_JOIN_METHODDEF
    UNICODE_CAPITALIZE_METHODDEF
    UNICODE_CASEFOLD_METHODDEF
    UNICODE_TITLE_METHODDEF
    UNICODE_CENTER_METHODDEF
    UNICODE_COUNT_METHODDEF
    UNICODE_EXPANDTABS_METHODDEF
    UNICODE_FIND_METHODDEF
    UNICODE_PARTITION_METHODDEF
    UNICODE_INDEX_METHODDEF
    UNICODE_LJUST_METHODDEF
    UNICODE_LOWER_METHODDEF
    UNICODE_LSTRIP_METHODDEF
    UNICODE_RFIND_METHODDEF
    UNICODE_RINDEX_METHODDEF
    UNICODE_RJUST_METHODDEF
    UNICODE_RSTRIP_METHODDEF
    UNICODE_RPARTITION_METHODDEF
    UNICODE_SPLITLINES_METHODDEF
    UNICODE_STRIP_METHODDEF
    UNICODE_SWAPCASE_METHODDEF
    UNICODE_TRANSLATE_METHODDEF
    UNICODE_UPPER_METHODDEF
    UNICODE_STARTSWITH_METHODDEF
    UNICODE_ENDSWITH_METHODDEF
    UNICODE_REMOVEPREFIX_METHODDEF
    UNICODE_REMOVESUFFIX_METHODDEF
    UNICODE_ISASCII_METHODDEF
    UNICODE_ISLOWER_METHODDEF
    UNICODE_ISUPPER_METHODDEF
    UNICODE_ISTITLE_METHODDEF
    UNICODE_ISSPACE_METHODDEF
    UNICODE_ISDECIMAL_METHODDEF
    UNICODE_ISDIGIT_METHODDEF
    UNICODE_ISNUMERIC_METHODDEF
    UNICODE_ISALPHA_METHODDEF
    UNICODE_ISALNUM_METHODDEF
    UNICODE_ISIDENTIFIER_METHODDEF
    UNICODE_ISPRINTABLE_METHODDEF
    UNICODE_ZFILL_METHODDEF
    /* hand-written entries; their implementations are not defined in this
       part of the file */
    {"format", _PyCFunction_CAST(do_string_format), METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    UNICODE___FORMAT___METHODDEF
    UNICODE_MAKETRANS_METHODDEF
    UNICODE_SIZEOF_METHODDEF
    {"__getnewargs__",  unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}    /* sentinel */
};
13723
13724 static PyObject *
unicode_mod(PyObject * v,PyObject * w)13725 unicode_mod(PyObject *v, PyObject *w)
13726 {
13727 if (!PyUnicode_Check(v))
13728 Py_RETURN_NOTIMPLEMENTED;
13729 return PyUnicode_Format(v, w);
13730 }
13731
/* Number protocol table for str.  Only nb_remainder is filled in (with
   unicode_mod); the slots not listed after it are left zero by the
   initializer. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
13738
/* Sequence protocol table for str: length, concatenation, repetition,
   item access and containment are provided; the slice and item-assignment
   slots are empty. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
13749
13750 static PyObject*
unicode_subscript(PyObject * self,PyObject * item)13751 unicode_subscript(PyObject* self, PyObject* item)
13752 {
13753 if (_PyIndex_Check(item)) {
13754 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
13755 if (i == -1 && PyErr_Occurred())
13756 return NULL;
13757 if (i < 0)
13758 i += PyUnicode_GET_LENGTH(self);
13759 return unicode_getitem(self, i);
13760 } else if (PySlice_Check(item)) {
13761 Py_ssize_t start, stop, step, slicelength, i;
13762 size_t cur;
13763 PyObject *result;
13764 const void *src_data;
13765 void *dest_data;
13766 int src_kind, dest_kind;
13767 Py_UCS4 ch, max_char, kind_limit;
13768
13769 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
13770 return NULL;
13771 }
13772 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
13773 &start, &stop, step);
13774
13775 if (slicelength <= 0) {
13776 _Py_RETURN_UNICODE_EMPTY();
13777 } else if (start == 0 && step == 1 &&
13778 slicelength == PyUnicode_GET_LENGTH(self)) {
13779 return unicode_result_unchanged(self);
13780 } else if (step == 1) {
13781 return PyUnicode_Substring(self,
13782 start, start + slicelength);
13783 }
13784 /* General case */
13785 src_kind = PyUnicode_KIND(self);
13786 src_data = PyUnicode_DATA(self);
13787 if (!PyUnicode_IS_ASCII(self)) {
13788 kind_limit = kind_maxchar_limit(src_kind);
13789 max_char = 0;
13790 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13791 ch = PyUnicode_READ(src_kind, src_data, cur);
13792 if (ch > max_char) {
13793 max_char = ch;
13794 if (max_char >= kind_limit)
13795 break;
13796 }
13797 }
13798 }
13799 else
13800 max_char = 127;
13801 result = PyUnicode_New(slicelength, max_char);
13802 if (result == NULL)
13803 return NULL;
13804 dest_kind = PyUnicode_KIND(result);
13805 dest_data = PyUnicode_DATA(result);
13806
13807 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13808 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13809 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
13810 }
13811 assert(_PyUnicode_CheckConsistency(result, 1));
13812 return result;
13813 } else {
13814 PyErr_Format(PyExc_TypeError, "string indices must be integers, not '%.200s'",
13815 Py_TYPE(item)->tp_name);
13816 return NULL;
13817 }
13818 }
13819
/* Mapping protocol table for str: length and subscripting are provided;
   item assignment is not. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
13825
13826
13827 /* Helpers for PyUnicode_Format() */
13828
/* State carried by the PyUnicode_Format() helpers. */
struct unicode_formatter_t {
    /* Argument source.  unicode_format_getnextarg() returns `args` itself
       when arglen is negative, and PyTuple_GetItem(args, argidx)
       otherwise. */
    PyObject *args;
    int args_owned;             /* presumably: nonzero if `args` must be
                                   released by the formatter -- TODO confirm */
    Py_ssize_t arglen, argidx;  /* argument count / index of the next
                                   argument to hand out */
    PyObject *dict;             /* presumably the mapping used for
                                   %(name)s lookups -- TODO confirm */

    /* Format string being processed.  NOTE(review): field meanings below
       are inferred from their names; confirm against the users. */
    int fmtkind;                /* character kind of fmtdata */
    Py_ssize_t fmtcnt, fmtpos;  /* remaining count / current position */
    const void *fmtdata;        /* raw character data */
    PyObject *fmtstr;           /* the format string object */

    _PyUnicodeWriter writer;    /* output accumulator */
};
13842
/* One parsed conversion of a PyUnicode_Format() format string. */
struct unicode_format_arg_t {
    Py_UCS4 ch;         /* conversion character; formatfloat() passes it to
                           PyOS_double_to_string() as the format code */
    int flags;          /* bitmask of F_* flags (formatfloat() tests F_ALT) */
    Py_ssize_t width;   /* field width -- presumably negative when not
                           given; TODO confirm */
    int prec;           /* precision; formatfloat() treats a negative value
                           as "not given" and uses 6 */
    int sign;           /* NOTE(review): meaning not visible in this part of
                           the file */
};
13850
13851 static PyObject *
unicode_format_getnextarg(struct unicode_formatter_t * ctx)13852 unicode_format_getnextarg(struct unicode_formatter_t *ctx)
13853 {
13854 Py_ssize_t argidx = ctx->argidx;
13855
13856 if (argidx < ctx->arglen) {
13857 ctx->argidx++;
13858 if (ctx->arglen < 0)
13859 return ctx->args;
13860 else
13861 return PyTuple_GetItem(ctx->args, argidx);
13862 }
13863 PyErr_SetString(PyExc_TypeError,
13864 "not enough arguments for format string");
13865 return NULL;
13866 }
13867
13868 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
13869
13870 /* Format a float into the writer if the writer is not NULL, or into *p_output
13871 otherwise.
13872
13873 Return 0 on success, raise an exception and return -1 on error. */
13874 static int
formatfloat(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)13875 formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13876 PyObject **p_output,
13877 _PyUnicodeWriter *writer)
13878 {
13879 char *p;
13880 double x;
13881 Py_ssize_t len;
13882 int prec;
13883 int dtoa_flags = 0;
13884
13885 x = PyFloat_AsDouble(v);
13886 if (x == -1.0 && PyErr_Occurred())
13887 return -1;
13888
13889 prec = arg->prec;
13890 if (prec < 0)
13891 prec = 6;
13892
13893 if (arg->flags & F_ALT)
13894 dtoa_flags |= Py_DTSF_ALT;
13895 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
13896 if (p == NULL)
13897 return -1;
13898 len = strlen(p);
13899 if (writer) {
13900 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
13901 PyMem_Free(p);
13902 return -1;
13903 }
13904 }
13905 else
13906 *p_output = _PyUnicode_FromASCII(p, len);
13907 PyMem_Free(p);
13908 return 0;
13909 }
13910
/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X, and
 * the alternate-form flag, for Python's long (unbounded) ints.  It's not
 * used for Python's regular ints.
 * Return value:  a new PyUnicodeObject*, or NULL if error.
 *     The output string is of the form
 *         "-"? ("0o" | "0x" | "0X")? digit+
 *     The base prefix is present only for o, x and X conversions, and only
 *     when alt is nonzero.  The case of hex digits will be correct,
 *     There will be at least prec digits, zero-filled on the left if
 *         necessary to get that many.
 * val          object to be converted (must satisfy PyLong_Check())
 * alt          nonzero to keep the base prefix (alternate form)
 * prec         minimum number of digits; 0-fill on left if needed
 * type         a character in [diuoxX]; i and u act the same as d
 *
 * CAUTION:  o, x and X conversions on regular ints can never
 * produce a '-' sign, but can for Python's unbounded ints.
 */
PyObject *
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
{
    PyObject *result = NULL;
    char *buf;
    Py_ssize_t i;
    int sign;           /* 1 if '-', else 0 */
    int len;            /* number of characters */
    Py_ssize_t llen;
    int numdigits;      /* len == numnondigits + numdigits */
    int numnondigits = 0;

    /* Avoid exceeding SSIZE_T_MAX */
    /* numnondigits is at most 3 (sign plus a 2-character prefix), and
       numnondigits + prec is computed as an int further down. */
    if (prec > INT_MAX-3) {
        PyErr_SetString(PyExc_OverflowError,
                        "precision too large");
        return NULL;
    }

    assert(PyLong_Check(val));

    /* Produce the textual form in the requested base. */
    switch (type) {
    default:
        Py_UNREACHABLE();
    case 'd':
    case 'i':
    case 'u':
        /* int and int subclasses should print numerically when a numeric */
        /* format code is used (see issue18780) */
        result = PyNumber_ToBase(val, 10);
        break;
    case 'o':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 8);
        break;
    case 'x':
    case 'X':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 16);
        break;
    }
    if (!result)
        return NULL;

    assert(unicode_modifiable(result));
    assert(PyUnicode_IS_ASCII(result));

    /* To modify the string in-place, there can only be one reference. */
    if (Py_REFCNT(result) != 1) {
        Py_DECREF(result);
        PyErr_BadInternalCall();
        return NULL;
    }
    buf = PyUnicode_DATA(result);
    llen = PyUnicode_GET_LENGTH(result);
    if (llen > INT_MAX) {
        Py_DECREF(result);
        PyErr_SetString(PyExc_ValueError,
                        "string too large in _PyUnicode_FormatLong");
        return NULL;
    }
    len = (int)llen;
    sign = buf[0] == '-';
    numnondigits += sign;
    numdigits = len - numnondigits;
    assert(numdigits > 0);

    /* Get rid of base marker unless F_ALT */
    if (((alt) == 0 &&
        (type == 'o' || type == 'x' || type == 'X'))) {
        assert(buf[sign] == '0');
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
               buf[sign+1] == 'o');
        numnondigits -= 2;
        /* Skip the two prefix characters; for a negative number the sign is
           rewritten over the second prefix character so that it directly
           precedes the digits. */
        buf += 2;
        len -= 2;
        if (sign)
            buf[0] = '-';
        assert(len == numnondigits + numdigits);
        assert(numdigits > 0);
    }

    /* Fill with leading zeroes to meet minimum width. */
    if (prec > numdigits) {
        /* Assemble sign/prefix, padding zeros and digits in a temporary
           bytes object, which then replaces `result`. */
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
                                numnondigits + prec);
        char *b1;
        if (!r1) {
            Py_DECREF(result);
            return NULL;
        }
        b1 = PyBytes_AS_STRING(r1);
        for (i = 0; i < numnondigits; ++i)
            *b1++ = *buf++;
        for (i = 0; i < prec - numdigits; i++)
            *b1++ = '0';
        for (i = 0; i < numdigits; i++)
            *b1++ = *buf++;
        *b1 = '\0';
        Py_SETREF(result, r1);
        buf = PyBytes_AS_STRING(result);
        len = numnondigits + prec;
    }

    /* Fix up case for hex conversions. */
    if (type == 'X') {
        /* Need to convert all lower case letters to upper case.
           and need to convert 0x to 0X (and -0x to -0X). */
        for (i = 0; i < len; i++)
            if (buf[i] >= 'a' && buf[i] <= 'x')
                buf[i] -= 'a'-'A';
    }
    /* If `result` is no longer a str (zero-fill path) or buf no longer
       points at its first character (prefix removed), build a new str from
       buf; a failure there leaves result NULL.  Otherwise only shrink the
       str when the length changed. */
    if (!PyUnicode_Check(result)
        || buf != PyUnicode_DATA(result)) {
        PyObject *unicode;
        unicode = _PyUnicode_FromASCII(buf, len);
        Py_SETREF(result, unicode);
    }
    else if (len != PyUnicode_GET_LENGTH(result)) {
        if (PyUnicode_Resize(&result, len) < 0)
            Py_CLEAR(result);
    }
    return result;
}
14053
14054 /* Format an integer or a float as an integer.
14055 * Return 1 if the number has been formatted into the writer,
14056 * 0 if the number has been formatted into *p_output
14057 * -1 and raise an exception on error */
14058 static int
mainformatlong(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14059 mainformatlong(PyObject *v,
14060 struct unicode_format_arg_t *arg,
14061 PyObject **p_output,
14062 _PyUnicodeWriter *writer)
14063 {
14064 PyObject *iobj, *res;
14065 char type = (char)arg->ch;
14066
14067 if (!PyNumber_Check(v))
14068 goto wrongtype;
14069
14070 /* make sure number is a type of integer for o, x, and X */
14071 if (!PyLong_Check(v)) {
14072 if (type == 'o' || type == 'x' || type == 'X') {
14073 iobj = _PyNumber_Index(v);
14074 }
14075 else {
14076 iobj = PyNumber_Long(v);
14077 }
14078 if (iobj == NULL ) {
14079 if (PyErr_ExceptionMatches(PyExc_TypeError))
14080 goto wrongtype;
14081 return -1;
14082 }
14083 assert(PyLong_Check(iobj));
14084 }
14085 else {
14086 iobj = Py_NewRef(v);
14087 }
14088
14089 if (PyLong_CheckExact(v)
14090 && arg->width == -1 && arg->prec == -1
14091 && !(arg->flags & (F_SIGN | F_BLANK))
14092 && type != 'X')
14093 {
14094 /* Fast path */
14095 int alternate = arg->flags & F_ALT;
14096 int base;
14097
14098 switch(type)
14099 {
14100 default:
14101 Py_UNREACHABLE();
14102 case 'd':
14103 case 'i':
14104 case 'u':
14105 base = 10;
14106 break;
14107 case 'o':
14108 base = 8;
14109 break;
14110 case 'x':
14111 case 'X':
14112 base = 16;
14113 break;
14114 }
14115
14116 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14117 Py_DECREF(iobj);
14118 return -1;
14119 }
14120 Py_DECREF(iobj);
14121 return 1;
14122 }
14123
14124 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
14125 Py_DECREF(iobj);
14126 if (res == NULL)
14127 return -1;
14128 *p_output = res;
14129 return 0;
14130
14131 wrongtype:
14132 switch(type)
14133 {
14134 case 'o':
14135 case 'x':
14136 case 'X':
14137 PyErr_Format(PyExc_TypeError,
14138 "%%%c format: an integer is required, "
14139 "not %.200s",
14140 type, Py_TYPE(v)->tp_name);
14141 break;
14142 default:
14143 PyErr_Format(PyExc_TypeError,
14144 "%%%c format: a real number is required, "
14145 "not %.200s",
14146 type, Py_TYPE(v)->tp_name);
14147 break;
14148 }
14149 return -1;
14150 }
14151
14152 static Py_UCS4
formatchar(PyObject * v)14153 formatchar(PyObject *v)
14154 {
14155 /* presume that the buffer is at least 3 characters long */
14156 if (PyUnicode_Check(v)) {
14157 if (PyUnicode_GET_LENGTH(v) == 1) {
14158 return PyUnicode_READ_CHAR(v, 0);
14159 }
14160 goto onError;
14161 }
14162 else {
14163 int overflow;
14164 long x = PyLong_AsLongAndOverflow(v, &overflow);
14165 if (x == -1 && PyErr_Occurred()) {
14166 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
14167 goto onError;
14168 }
14169 return (Py_UCS4) -1;
14170 }
14171
14172 if (x < 0 || x > MAX_UNICODE) {
14173 /* this includes an overflow in converting to C long */
14174 PyErr_SetString(PyExc_OverflowError,
14175 "%c arg not in range(0x110000)");
14176 return (Py_UCS4) -1;
14177 }
14178
14179 return (Py_UCS4) x;
14180 }
14181
14182 onError:
14183 PyErr_SetString(PyExc_TypeError,
14184 "%c requires int or char");
14185 return (Py_UCS4) -1;
14186 }
14187
/* Parse options of an argument: mapping key, flags, width, precision and
   length modifier.  Handle also "%(name)" syntax.

   arg->ch is read and updated as characters are consumed; on success it
   holds the last character read, which the caller then uses as the
   conversion character.  ctx->fmtpos and ctx->fmtcnt are advanced past
   everything consumed here.

   Return 0 on success.
   Raise an exception and return -1 on error. */
static int
unicode_format_arg_parse(struct unicode_formatter_t *ctx,
                         struct unicode_format_arg_t *arg)
{
/* Read the format string character at the current position. */
#define FORMAT_READ(ctx) \
        PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)

    PyObject *v;

    if (arg->ch == '(') {
        /* Get argument value from a dictionary. Example: "%(name)s". */
        Py_ssize_t keystart;
        Py_ssize_t keylen;
        PyObject *key;
        int pcount = 1;  /* number of currently open parentheses */

        if (ctx->dict == NULL) {
            PyErr_SetString(PyExc_TypeError,
                            "format requires a mapping");
            return -1;
        }
        /* Step over the opening '('. */
        ++ctx->fmtpos;
        --ctx->fmtcnt;
        keystart = ctx->fmtpos;
        /* Skip over balanced parentheses */
        while (pcount > 0 && --ctx->fmtcnt >= 0) {
            arg->ch = FORMAT_READ(ctx);
            if (arg->ch == ')')
                --pcount;
            else if (arg->ch == '(')
                ++pcount;
            ctx->fmtpos++;
        }
        /* When the loop stopped on the matching ')', fmtpos is one past
           it, hence the -1 to exclude it from the key. */
        keylen = ctx->fmtpos - keystart - 1;
        if (ctx->fmtcnt < 0 || pcount > 0) {
            PyErr_SetString(PyExc_ValueError,
                            "incomplete format key");
            return -1;
        }
        key = PyUnicode_Substring(ctx->fmtstr,
                                  keystart, keystart + keylen);
        if (key == NULL)
            return -1;
        /* Release the value owned from a previous "%(name)" lookup before
           ctx->args is overwritten. */
        if (ctx->args_owned) {
            ctx->args_owned = 0;
            Py_DECREF(ctx->args);
        }
        ctx->args = PyObject_GetItem(ctx->dict, key);
        Py_DECREF(key);
        if (ctx->args == NULL)
            return -1;
        ctx->args_owned = 1;
        /* Same values PyUnicode_Format() uses for a non-tuple argument. */
        ctx->arglen = -1;
        ctx->argidx = -2;
    }

    /* Parse flags. Example: "%+i" => flags=F_SIGN. */
    while (--ctx->fmtcnt >= 0) {
        arg->ch = FORMAT_READ(ctx);
        ctx->fmtpos++;
        switch (arg->ch) {
        case '-': arg->flags |= F_LJUST; continue;
        case '+': arg->flags |= F_SIGN; continue;
        case ' ': arg->flags |= F_BLANK; continue;
        case '#': arg->flags |= F_ALT; continue;
        case '0': arg->flags |= F_ZERO; continue;
        }
        /* Not a flag character: leave it in arg->ch for the next stage. */
        break;
    }

    /* Parse width. Example: "%10s" => width=10 */
    if (arg->ch == '*') {
        /* Width is taken from the next argument. */
        v = unicode_format_getnextarg(ctx);
        if (v == NULL)
            return -1;
        if (!PyLong_Check(v)) {
            PyErr_SetString(PyExc_TypeError,
                            "* wants int");
            return -1;
        }
        arg->width = PyLong_AsSsize_t(v);
        if (arg->width == -1 && PyErr_Occurred())
            return -1;
        /* A negative width means left-justify within |width|. */
        if (arg->width < 0) {
            arg->flags |= F_LJUST;
            arg->width = -arg->width;
        }
        if (--ctx->fmtcnt >= 0) {
            arg->ch = FORMAT_READ(ctx);
            ctx->fmtpos++;
        }
    }
    else if (arg->ch >= '0' && arg->ch <= '9') {
        arg->width = arg->ch - '0';
        while (--ctx->fmtcnt >= 0) {
            arg->ch = FORMAT_READ(ctx);
            ctx->fmtpos++;
            if (arg->ch < '0' || arg->ch > '9')
                break;
            /* Since arg->ch is unsigned, the RHS would end up as unsigned,
               mixing signed and unsigned comparison. Since arg->ch is between
               '0' and '9', casting to int is safe. */
            if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return -1;
            }
            arg->width = arg->width*10 + (arg->ch - '0');
        }
    }

    /* Parse precision. Example: "%.3f" => prec=3 */
    if (arg->ch == '.') {
        /* A bare "." with no digits gives a precision of 0. */
        arg->prec = 0;
        if (--ctx->fmtcnt >= 0) {
            arg->ch = FORMAT_READ(ctx);
            ctx->fmtpos++;
        }
        if (arg->ch == '*') {
            /* Precision is taken from the next argument. */
            v = unicode_format_getnextarg(ctx);
            if (v == NULL)
                return -1;
            if (!PyLong_Check(v)) {
                PyErr_SetString(PyExc_TypeError,
                                "* wants int");
                return -1;
            }
            arg->prec = PyLong_AsInt(v);
            if (arg->prec == -1 && PyErr_Occurred())
                return -1;
            /* A negative precision is clamped to 0. */
            if (arg->prec < 0)
                arg->prec = 0;
            if (--ctx->fmtcnt >= 0) {
                arg->ch = FORMAT_READ(ctx);
                ctx->fmtpos++;
            }
        }
        else if (arg->ch >= '0' && arg->ch <= '9') {
            arg->prec = arg->ch - '0';
            while (--ctx->fmtcnt >= 0) {
                arg->ch = FORMAT_READ(ctx);
                ctx->fmtpos++;
                if (arg->ch < '0' || arg->ch > '9')
                    break;
                /* Reject the digit if appending it would exceed INT_MAX. */
                if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return -1;
                }
                arg->prec = arg->prec*10 + (arg->ch - '0');
            }
        }
    }

    /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
    if (ctx->fmtcnt >= 0) {
        if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
            if (--ctx->fmtcnt >= 0) {
                arg->ch = FORMAT_READ(ctx);
                ctx->fmtpos++;
            }
        }
    }
    /* fmtcnt went negative: the format string ended in the middle of a
       conversion specification. */
    if (ctx->fmtcnt < 0) {
        PyErr_SetString(PyExc_ValueError,
                        "incomplete format");
        return -1;
    }
    return 0;

#undef FORMAT_READ
}
14366
14367 /* Format one argument. Supported conversion specifiers:
14368
14369 - "s", "r", "a": any type
14370 - "i", "d", "u": int or float
14371 - "o", "x", "X": int
14372 - "e", "E", "f", "F", "g", "G": float
14373 - "c": int or str (1 character)
14374
14375 When possible, the output is written directly into the Unicode writer
14376 (ctx->writer). A string is created when padding is required.
14377
14378 Return 0 if the argument has been formatted into *p_str,
14379 1 if the argument has been written into ctx->writer,
14380 -1 on error. */
14381 static int
unicode_format_arg_format(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg,PyObject ** p_str)14382 unicode_format_arg_format(struct unicode_formatter_t *ctx,
14383 struct unicode_format_arg_t *arg,
14384 PyObject **p_str)
14385 {
14386 PyObject *v;
14387 _PyUnicodeWriter *writer = &ctx->writer;
14388
14389 if (ctx->fmtcnt == 0)
14390 ctx->writer.overallocate = 0;
14391
14392 v = unicode_format_getnextarg(ctx);
14393 if (v == NULL)
14394 return -1;
14395
14396
14397 switch (arg->ch) {
14398 case 's':
14399 case 'r':
14400 case 'a':
14401 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14402 /* Fast path */
14403 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14404 return -1;
14405 return 1;
14406 }
14407
14408 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14409 *p_str = Py_NewRef(v);
14410 }
14411 else {
14412 if (arg->ch == 's')
14413 *p_str = PyObject_Str(v);
14414 else if (arg->ch == 'r')
14415 *p_str = PyObject_Repr(v);
14416 else
14417 *p_str = PyObject_ASCII(v);
14418 }
14419 break;
14420
14421 case 'i':
14422 case 'd':
14423 case 'u':
14424 case 'o':
14425 case 'x':
14426 case 'X':
14427 {
14428 int ret = mainformatlong(v, arg, p_str, writer);
14429 if (ret != 0)
14430 return ret;
14431 arg->sign = 1;
14432 break;
14433 }
14434
14435 case 'e':
14436 case 'E':
14437 case 'f':
14438 case 'F':
14439 case 'g':
14440 case 'G':
14441 if (arg->width == -1 && arg->prec == -1
14442 && !(arg->flags & (F_SIGN | F_BLANK)))
14443 {
14444 /* Fast path */
14445 if (formatfloat(v, arg, NULL, writer) == -1)
14446 return -1;
14447 return 1;
14448 }
14449
14450 arg->sign = 1;
14451 if (formatfloat(v, arg, p_str, NULL) == -1)
14452 return -1;
14453 break;
14454
14455 case 'c':
14456 {
14457 Py_UCS4 ch = formatchar(v);
14458 if (ch == (Py_UCS4) -1)
14459 return -1;
14460 if (arg->width == -1 && arg->prec == -1) {
14461 /* Fast path */
14462 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
14463 return -1;
14464 return 1;
14465 }
14466 *p_str = PyUnicode_FromOrdinal(ch);
14467 break;
14468 }
14469
14470 default:
14471 PyErr_Format(PyExc_ValueError,
14472 "unsupported format character '%c' (0x%x) "
14473 "at index %zd",
14474 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14475 (int)arg->ch,
14476 ctx->fmtpos - 1);
14477 return -1;
14478 }
14479 if (*p_str == NULL)
14480 return -1;
14481 assert (PyUnicode_Check(*p_str));
14482 return 0;
14483 }
14484
/* Write the formatted string `str` into ctx->writer, applying the width,
   precision, sign and padding options stored in `arg`.

   arg->width and arg->sign are modified while the output is laid out.

   Return 0 on success, -1 on error (with an exception set). */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;       /* number of characters of str still to copy */
    int kind;
    const void *pbuf;
    Py_ssize_t pindex;    /* index in str of the next character to copy */
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    /* Zero padding is only used when arg->sign is set and the '0' flag
       was given; otherwise pad with spaces. */
    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    len = PyUnicode_GET_LENGTH(str);
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            /* The string already starts with a sign: take it out of the
               characters to copy, it is written separately below. */
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            /* No sign character has to be written. */
            arg->sign = 0;
    }
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer */
    maxchar = writer->maxchar;
    /* Take the fill character into account only if left padding will
       actually be written. */
    if (!(arg->flags & F_LJUST)) {
        if (arg->sign) {
            if ((arg->width-1) > len)
                maxchar = Py_MAX(maxchar, fill);
        }
        else {
            if (arg->width > len)
                maxchar = Py_MAX(maxchar, fill);
        }
    }
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
        maxchar = Py_MAX(maxchar, strmaxchar);
    }

    buflen = arg->width;
    /* The width leaves no room for the sign character: reserve one more. */
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
        return -1;

    /* Write the sign if needed */
    if (arg->sign) {
        /* With a non-space fill the sign goes before the padding; with
           space padding it is written after it (see below). */
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        /* The two prefix characters are accounted for separately from the
           digits, whether they are written here or after the padding. */
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        /* Left padding done: nothing remains for the right padding. */
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right with the fill character if needed */
    /* (right padding always uses spaces, regardless of `fill`) */
    if (arg->width > len) {
        sublen = arg->width - len;
        unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}
14636
14637 /* Helper of PyUnicode_Format(): format one arg.
14638 Return 0 on success, raise an exception and return -1 on error. */
14639 static int
unicode_format_arg(struct unicode_formatter_t * ctx)14640 unicode_format_arg(struct unicode_formatter_t *ctx)
14641 {
14642 struct unicode_format_arg_t arg;
14643 PyObject *str;
14644 int ret;
14645
14646 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14647 if (arg.ch == '%') {
14648 ctx->fmtpos++;
14649 ctx->fmtcnt--;
14650 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14651 return -1;
14652 return 0;
14653 }
14654 arg.flags = 0;
14655 arg.width = -1;
14656 arg.prec = -1;
14657 arg.sign = 0;
14658 str = NULL;
14659
14660 ret = unicode_format_arg_parse(ctx, &arg);
14661 if (ret == -1)
14662 return -1;
14663
14664 ret = unicode_format_arg_format(ctx, &arg, &str);
14665 if (ret == -1)
14666 return -1;
14667
14668 if (ret != 1) {
14669 ret = unicode_format_arg_output(ctx, &arg, str);
14670 Py_DECREF(str);
14671 if (ret == -1)
14672 return -1;
14673 }
14674
14675 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
14676 PyErr_SetString(PyExc_TypeError,
14677 "not all arguments converted during string formatting");
14678 return -1;
14679 }
14680 return 0;
14681 }
14682
14683 PyObject *
PyUnicode_Format(PyObject * format,PyObject * args)14684 PyUnicode_Format(PyObject *format, PyObject *args)
14685 {
14686 struct unicode_formatter_t ctx;
14687
14688 if (format == NULL || args == NULL) {
14689 PyErr_BadInternalCall();
14690 return NULL;
14691 }
14692
14693 if (ensure_unicode(format) < 0)
14694 return NULL;
14695
14696 ctx.fmtstr = format;
14697 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14698 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14699 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14700 ctx.fmtpos = 0;
14701
14702 _PyUnicodeWriter_Init(&ctx.writer);
14703 ctx.writer.min_length = ctx.fmtcnt + 100;
14704 ctx.writer.overallocate = 1;
14705
14706 if (PyTuple_Check(args)) {
14707 ctx.arglen = PyTuple_Size(args);
14708 ctx.argidx = 0;
14709 }
14710 else {
14711 ctx.arglen = -1;
14712 ctx.argidx = -2;
14713 }
14714 ctx.args_owned = 0;
14715 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
14716 ctx.dict = args;
14717 else
14718 ctx.dict = NULL;
14719 ctx.args = args;
14720
14721 while (--ctx.fmtcnt >= 0) {
14722 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14723 Py_ssize_t nonfmtpos;
14724
14725 nonfmtpos = ctx.fmtpos++;
14726 while (ctx.fmtcnt >= 0 &&
14727 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14728 ctx.fmtpos++;
14729 ctx.fmtcnt--;
14730 }
14731 if (ctx.fmtcnt < 0) {
14732 ctx.fmtpos--;
14733 ctx.writer.overallocate = 0;
14734 }
14735
14736 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14737 nonfmtpos, ctx.fmtpos) < 0)
14738 goto onError;
14739 }
14740 else {
14741 ctx.fmtpos++;
14742 if (unicode_format_arg(&ctx) == -1)
14743 goto onError;
14744 }
14745 }
14746
14747 if (ctx.argidx < ctx.arglen && !ctx.dict) {
14748 PyErr_SetString(PyExc_TypeError,
14749 "not all arguments converted during string formatting");
14750 goto onError;
14751 }
14752
14753 if (ctx.args_owned) {
14754 Py_DECREF(ctx.args);
14755 }
14756 return _PyUnicodeWriter_Finish(&ctx.writer);
14757
14758 onError:
14759 _PyUnicodeWriter_Dealloc(&ctx.writer);
14760 if (ctx.args_owned) {
14761 Py_DECREF(ctx.args);
14762 }
14763 return NULL;
14764 }
14765
14766 static PyObject *
14767 unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
14768
14769 /*[clinic input]
14770 @classmethod
14771 str.__new__ as unicode_new
14772
14773 object as x: object = NULL
14774 encoding: str = NULL
14775 errors: str = NULL
14776
14777 [clinic start generated code]*/
14778
14779 static PyObject *
unicode_new_impl(PyTypeObject * type,PyObject * x,const char * encoding,const char * errors)14780 unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
14781 const char *errors)
14782 /*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
14783 {
14784 PyObject *unicode;
14785 if (x == NULL) {
14786 unicode = unicode_get_empty();
14787 }
14788 else if (encoding == NULL && errors == NULL) {
14789 unicode = PyObject_Str(x);
14790 }
14791 else {
14792 unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
14793 }
14794
14795 if (unicode != NULL && type != &PyUnicode_Type) {
14796 Py_SETREF(unicode, unicode_subtype_new(type, unicode));
14797 }
14798 return unicode;
14799 }
14800
14801 static const char *
arg_as_utf8(PyObject * obj,const char * name)14802 arg_as_utf8(PyObject *obj, const char *name)
14803 {
14804 if (!PyUnicode_Check(obj)) {
14805 PyErr_Format(PyExc_TypeError,
14806 "str() argument '%s' must be str, not %T",
14807 name, obj);
14808 return NULL;
14809 }
14810 return _PyUnicode_AsUTF8NoNUL(obj);
14811 }
14812
14813 static PyObject *
unicode_vectorcall(PyObject * type,PyObject * const * args,size_t nargsf,PyObject * kwnames)14814 unicode_vectorcall(PyObject *type, PyObject *const *args,
14815 size_t nargsf, PyObject *kwnames)
14816 {
14817 assert(Py_Is(_PyType_CAST(type), &PyUnicode_Type));
14818
14819 Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
14820 if (kwnames != NULL && PyTuple_GET_SIZE(kwnames) != 0) {
14821 // Fallback to unicode_new()
14822 PyObject *tuple = _PyTuple_FromArray(args, nargs);
14823 if (tuple == NULL) {
14824 return NULL;
14825 }
14826 PyObject *dict = _PyStack_AsDict(args + nargs, kwnames);
14827 if (dict == NULL) {
14828 Py_DECREF(tuple);
14829 return NULL;
14830 }
14831 PyObject *ret = unicode_new(_PyType_CAST(type), tuple, dict);
14832 Py_DECREF(tuple);
14833 Py_DECREF(dict);
14834 return ret;
14835 }
14836 if (!_PyArg_CheckPositional("str", nargs, 0, 3)) {
14837 return NULL;
14838 }
14839 if (nargs == 0) {
14840 return unicode_get_empty();
14841 }
14842 PyObject *object = args[0];
14843 if (nargs == 1) {
14844 return PyObject_Str(object);
14845 }
14846 const char *encoding = arg_as_utf8(args[1], "encoding");
14847 if (encoding == NULL) {
14848 return NULL;
14849 }
14850 const char *errors = NULL;
14851 if (nargs == 3) {
14852 errors = arg_as_utf8(args[2], "errors");
14853 if (errors == NULL) {
14854 return NULL;
14855 }
14856 }
14857 return PyUnicode_FromEncodedObject(object, encoding, errors);
14858 }
14859
14860 static PyObject *
unicode_subtype_new(PyTypeObject * type,PyObject * unicode)14861 unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
14862 {
14863 PyObject *self;
14864 Py_ssize_t length, char_size;
14865 int share_utf8;
14866 int kind;
14867 void *data;
14868
14869 assert(PyType_IsSubtype(type, &PyUnicode_Type));
14870 assert(_PyUnicode_CHECK(unicode));
14871
14872 self = type->tp_alloc(type, 0);
14873 if (self == NULL) {
14874 return NULL;
14875 }
14876 kind = PyUnicode_KIND(unicode);
14877 length = PyUnicode_GET_LENGTH(unicode);
14878
14879 _PyUnicode_LENGTH(self) = length;
14880 #ifdef Py_DEBUG
14881 _PyUnicode_HASH(self) = -1;
14882 #else
14883 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14884 #endif
14885 _PyUnicode_STATE(self).interned = 0;
14886 _PyUnicode_STATE(self).kind = kind;
14887 _PyUnicode_STATE(self).compact = 0;
14888 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
14889 _PyUnicode_STATE(self).statically_allocated = 0;
14890 _PyUnicode_UTF8_LENGTH(self) = 0;
14891 _PyUnicode_UTF8(self) = NULL;
14892 _PyUnicode_DATA_ANY(self) = NULL;
14893
14894 share_utf8 = 0;
14895 if (kind == PyUnicode_1BYTE_KIND) {
14896 char_size = 1;
14897 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14898 share_utf8 = 1;
14899 }
14900 else if (kind == PyUnicode_2BYTE_KIND) {
14901 char_size = 2;
14902 }
14903 else {
14904 assert(kind == PyUnicode_4BYTE_KIND);
14905 char_size = 4;
14906 }
14907
14908 /* Ensure we won't overflow the length. */
14909 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14910 PyErr_NoMemory();
14911 goto onError;
14912 }
14913 data = PyMem_Malloc((length + 1) * char_size);
14914 if (data == NULL) {
14915 PyErr_NoMemory();
14916 goto onError;
14917 }
14918
14919 _PyUnicode_DATA_ANY(self) = data;
14920 if (share_utf8) {
14921 _PyUnicode_UTF8_LENGTH(self) = length;
14922 _PyUnicode_UTF8(self) = data;
14923 }
14924
14925 memcpy(data, PyUnicode_DATA(unicode), kind * (length + 1));
14926 assert(_PyUnicode_CheckConsistency(self, 1));
14927 #ifdef Py_DEBUG
14928 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14929 #endif
14930 return self;
14931
14932 onError:
14933 Py_DECREF(self);
14934 return NULL;
14935 }
14936
/* Deallocate `op` by calling unicode_dealloc() directly.
   `op` must be an exact str instance (checked with an assertion only). */
void
_PyUnicode_ExactDealloc(PyObject *op)
{
    assert(PyUnicode_CheckExact(op));
    unicode_dealloc(op);
}
14943
/* Docstring of the str type, used as the tp_doc slot of PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to 'utf-8'.\n\
errors defaults to 'strict'.");
14955
14956 static PyObject *unicode_iter(PyObject *seq);
14957
/* Type object of str.

   Slots are given positionally, in PyTypeObject declaration order (each
   one is labelled in the trailing comment); only tp_vectorcall uses a
   designated initializer.  Slots left at 0 are not set here. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                        /* tp_name */
    sizeof(PyUnicodeObject),      /* tp_basicsize */
    0,                            /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,  /* tp_dealloc */
    0,                            /* tp_vectorcall_offset */
    0,                            /* tp_getattr */
    0,                            /* tp_setattr */
    0,                            /* tp_as_async */
    unicode_repr,                 /* tp_repr */
    &unicode_as_number,           /* tp_as_number */
    &unicode_as_sequence,         /* tp_as_sequence */
    &unicode_as_mapping,          /* tp_as_mapping */
    (hashfunc) unicode_hash,      /* tp_hash*/
    0,                            /* tp_call*/
    (reprfunc) unicode_str,       /* tp_str */
    PyObject_GenericGetAttr,      /* tp_getattro */
    0,                            /* tp_setattro */
    0,                            /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
        Py_TPFLAGS_UNICODE_SUBCLASS |
        _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
    unicode_doc,                  /* tp_doc */
    0,                            /* tp_traverse */
    0,                            /* tp_clear */
    PyUnicode_RichCompare,        /* tp_richcompare */
    0,                            /* tp_weaklistoffset */
    unicode_iter,                 /* tp_iter */
    0,                            /* tp_iternext */
    unicode_methods,              /* tp_methods */
    0,                            /* tp_members */
    0,                            /* tp_getset */
    0,                            /* tp_base */
    0,                            /* tp_dict */
    0,                            /* tp_descr_get */
    0,                            /* tp_descr_set */
    0,                            /* tp_dictoffset */
    0,                            /* tp_init */
    0,                            /* tp_alloc */
    unicode_new,                  /* tp_new */
    PyObject_Del,                 /* tp_free */
    /* Positional str(...) calls go through unicode_vectorcall(). */
    .tp_vectorcall = unicode_vectorcall,
};
15003
15004 /* Initialize the Unicode implementation */
15005
15006 static void
_init_global_state(void)15007 _init_global_state(void)
15008 {
15009 static int initialized = 0;
15010 if (initialized) {
15011 return;
15012 }
15013 initialized = 1;
15014
15015 /* initialize the linebreak bloom filter */
15016 const Py_UCS2 linebreak[] = {
15017 0x000A, /* LINE FEED */
15018 0x000D, /* CARRIAGE RETURN */
15019 0x001C, /* FILE SEPARATOR */
15020 0x001D, /* GROUP SEPARATOR */
15021 0x001E, /* RECORD SEPARATOR */
15022 0x0085, /* NEXT LINE */
15023 0x2028, /* LINE SEPARATOR */
15024 0x2029, /* PARAGRAPH SEPARATOR */
15025 };
15026 bloom_linebreak = make_bloom_mask(
15027 PyUnicode_2BYTE_KIND, linebreak,
15028 Py_ARRAY_LENGTH(linebreak));
15029 }
15030
15031 void
_PyUnicode_InitState(PyInterpreterState * interp)15032 _PyUnicode_InitState(PyInterpreterState *interp)
15033 {
15034 if (!_Py_IsMainInterpreter(interp)) {
15035 return;
15036 }
15037 _init_global_state();
15038 }
15039
15040
15041 PyStatus
_PyUnicode_InitGlobalObjects(PyInterpreterState * interp)15042 _PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
15043 {
15044 if (_Py_IsMainInterpreter(interp)) {
15045 PyStatus status = init_global_interned_strings(interp);
15046 if (_PyStatus_EXCEPTION(status)) {
15047 return status;
15048 }
15049 }
15050 assert(INTERNED_STRINGS);
15051
15052 if (init_interned_dict(interp)) {
15053 PyErr_Clear();
15054 return _PyStatus_ERR("failed to create interned dict");
15055 }
15056
15057 return _PyStatus_OK();
15058 }
15059
15060
15061 PyStatus
_PyUnicode_InitTypes(PyInterpreterState * interp)15062 _PyUnicode_InitTypes(PyInterpreterState *interp)
15063 {
15064 if (_PyStaticType_InitBuiltin(interp, &EncodingMapType) < 0) {
15065 goto error;
15066 }
15067 if (_PyStaticType_InitBuiltin(interp, &PyFieldNameIter_Type) < 0) {
15068 goto error;
15069 }
15070 if (_PyStaticType_InitBuiltin(interp, &PyFormatterIter_Type) < 0) {
15071 goto error;
15072 }
15073 return _PyStatus_OK();
15074
15075 error:
15076 return _PyStatus_ERR("Can't initialize unicode types");
15077 }
15078
15079 static /* non-null */ PyObject*
intern_static(PyInterpreterState * interp,PyObject * s)15080 intern_static(PyInterpreterState *interp, PyObject *s /* stolen */)
15081 {
15082 // Note that this steals a reference to `s`, but in many cases that
15083 // stolen ref is returned, requiring no decref/incref.
15084
15085 assert(s != NULL);
15086 assert(_PyUnicode_CHECK(s));
15087 assert(_PyUnicode_STATE(s).statically_allocated);
15088 assert(!PyUnicode_CHECK_INTERNED(s));
15089
15090 #ifdef Py_DEBUG
15091 /* We must not add process-global interned string if there's already a
15092 * per-interpreter interned_dict, which might contain duplicates.
15093 */
15094 PyObject *interned = get_interned_dict(interp);
15095 assert(interned == NULL);
15096 #endif
15097
15098 /* Look in the global cache first. */
15099 PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
15100 /* We should only init each string once */
15101 assert(r == NULL);
15102 /* but just in case (for the non-debug build), handle this */
15103 if (r != NULL && r != s) {
15104 assert(_PyUnicode_STATE(r).interned == SSTATE_INTERNED_IMMORTAL_STATIC);
15105 assert(_PyUnicode_CHECK(r));
15106 Py_DECREF(s);
15107 return Py_NewRef(r);
15108 }
15109
15110 if (_Py_hashtable_set(INTERNED_STRINGS, s, s) < -1) {
15111 Py_FatalError("failed to intern static string");
15112 }
15113
15114 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
15115 return s;
15116 }
15117
/* Intern the statically allocated string *p and store the interned
   string back into *p (see intern_static()). */
void
_PyUnicode_InternStatic(PyInterpreterState *interp, PyObject **p)
{
    // This should only be called as part of runtime initialization
    assert(!Py_IsInitialized());

    /* intern_static() takes over the reference held in *p and returns the
       one to store back. */
    *p = intern_static(interp, *p);
    assert(*p);
}
15127
15128 static void
immortalize_interned(PyObject * s)15129 immortalize_interned(PyObject *s)
15130 {
15131 assert(PyUnicode_CHECK_INTERNED(s) == SSTATE_INTERNED_MORTAL);
15132 assert(!_Py_IsImmortal(s));
15133 #ifdef Py_REF_DEBUG
15134 /* The reference count value should be excluded from the RefTotal.
15135 The decrements to these objects will not be registered so they
15136 need to be accounted for in here. */
15137 for (Py_ssize_t i = 0; i < Py_REFCNT(s); i++) {
15138 _Py_DecRefTotal(_PyThreadState_GET());
15139 }
15140 #endif
15141 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_IMMORTAL;
15142 _Py_SetImmortal(s);
15143 }
15144
15145 static /* non-null */ PyObject*
intern_common(PyInterpreterState * interp,PyObject * s,bool immortalize)15146 intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
15147 bool immortalize)
15148 {
15149 // Note that this steals a reference to `s`, but in many cases that
15150 // stolen ref is returned, requiring no decref/incref.
15151
15152 #ifdef Py_DEBUG
15153 assert(s != NULL);
15154 assert(_PyUnicode_CHECK(s));
15155 #else
15156 if (s == NULL || !PyUnicode_Check(s)) {
15157 return s;
15158 }
15159 #endif
15160
15161 /* If it's a subclass, we don't really know what putting
15162 it in the interned dict might do. */
15163 if (!PyUnicode_CheckExact(s)) {
15164 return s;
15165 }
15166
15167 /* Is it already interned? */
15168 switch (PyUnicode_CHECK_INTERNED(s)) {
15169 case SSTATE_NOT_INTERNED:
15170 // no, go on
15171 break;
15172 case SSTATE_INTERNED_MORTAL:
15173 // yes but we might need to make it immortal
15174 if (immortalize) {
15175 immortalize_interned(s);
15176 }
15177 return s;
15178 default:
15179 // all done
15180 return s;
15181 }
15182
15183 /* Statically allocated strings must be already interned. */
15184 assert(!_PyUnicode_STATE(s).statically_allocated);
15185
15186 #if Py_GIL_DISABLED
15187 /* In the free-threaded build, all interned strings are immortal */
15188 immortalize = 1;
15189 #endif
15190
15191 /* If it's already immortal, intern it as such */
15192 if (_Py_IsImmortal(s)) {
15193 immortalize = 1;
15194 }
15195
15196 /* if it's a short string, get the singleton */
15197 if (PyUnicode_GET_LENGTH(s) == 1 &&
15198 PyUnicode_KIND(s) == PyUnicode_1BYTE_KIND) {
15199 PyObject *r = LATIN1(*(unsigned char*)PyUnicode_DATA(s));
15200 assert(PyUnicode_CHECK_INTERNED(r));
15201 Py_DECREF(s);
15202 return r;
15203 }
15204 #ifdef Py_DEBUG
15205 assert(!unicode_is_singleton(s));
15206 #endif
15207
15208 /* Look in the global cache now. */
15209 {
15210 PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
15211 if (r != NULL) {
15212 assert(_PyUnicode_STATE(r).statically_allocated);
15213 assert(r != s); // r must be statically_allocated; s is not
15214 Py_DECREF(s);
15215 return Py_NewRef(r);
15216 }
15217 }
15218
15219 /* Do a setdefault on the per-interpreter cache. */
15220 PyObject *interned = get_interned_dict(interp);
15221 assert(interned != NULL);
15222
15223 PyObject *t;
15224 {
15225 int res = PyDict_SetDefaultRef(interned, s, s, &t);
15226 if (res < 0) {
15227 PyErr_Clear();
15228 return s;
15229 }
15230 else if (res == 1) {
15231 // value was already present (not inserted)
15232 Py_DECREF(s);
15233 if (immortalize &&
15234 PyUnicode_CHECK_INTERNED(t) == SSTATE_INTERNED_MORTAL) {
15235 immortalize_interned(t);
15236 }
15237 return t;
15238 }
15239 else {
15240 // value was newly inserted
15241 assert (s == t);
15242 Py_DECREF(t);
15243 }
15244 }
15245
15246 /* NOT_INTERNED -> INTERNED_MORTAL */
15247
15248 assert(_PyUnicode_STATE(s).interned == SSTATE_NOT_INTERNED);
15249
15250 if (!_Py_IsImmortal(s)) {
15251 /* The two references in interned dict (key and value) are not counted.
15252 unicode_dealloc() and _PyUnicode_ClearInterned() take care of this. */
15253 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
15254 #ifdef Py_REF_DEBUG
15255 /* let's be pedantic with the ref total */
15256 _Py_DecRefTotal(_PyThreadState_GET());
15257 _Py_DecRefTotal(_PyThreadState_GET());
15258 #endif
15259 }
15260 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
15261
15262 /* INTERNED_MORTAL -> INTERNED_IMMORTAL (if needed) */
15263
15264 #ifdef Py_DEBUG
15265 if (_Py_IsImmortal(s)) {
15266 assert(immortalize);
15267 }
15268 #endif
15269 if (immortalize) {
15270 immortalize_interned(s);
15271 }
15272
15273 return s;
15274 }
15275
15276 void
_PyUnicode_InternImmortal(PyInterpreterState * interp,PyObject ** p)15277 _PyUnicode_InternImmortal(PyInterpreterState *interp, PyObject **p)
15278 {
15279 *p = intern_common(interp, *p, 1);
15280 assert(*p);
15281 }
15282
15283 void
_PyUnicode_InternMortal(PyInterpreterState * interp,PyObject ** p)15284 _PyUnicode_InternMortal(PyInterpreterState *interp, PyObject **p)
15285 {
15286 *p = intern_common(interp, *p, 0);
15287 assert(*p);
15288 }
15289
15290
15291 void
_PyUnicode_InternInPlace(PyInterpreterState * interp,PyObject ** p)15292 _PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
15293 {
15294 _PyUnicode_InternImmortal(interp, p);
15295 return;
15296 }
15297
15298 void
PyUnicode_InternInPlace(PyObject ** p)15299 PyUnicode_InternInPlace(PyObject **p)
15300 {
15301 PyInterpreterState *interp = _PyInterpreterState_GET();
15302 _PyUnicode_InternMortal(interp, p);
15303 }
15304
15305 // Public-looking name kept for the stable ABI; user should not call this:
15306 PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
15307 void
PyUnicode_InternImmortal(PyObject ** p)15308 PyUnicode_InternImmortal(PyObject **p)
15309 {
15310 PyInterpreterState *interp = _PyInterpreterState_GET();
15311 _PyUnicode_InternImmortal(interp, p);
15312 }
15313
15314 PyObject *
PyUnicode_InternFromString(const char * cp)15315 PyUnicode_InternFromString(const char *cp)
15316 {
15317 PyObject *s = PyUnicode_FromString(cp);
15318 if (s == NULL) {
15319 return NULL;
15320 }
15321 PyInterpreterState *interp = _PyInterpreterState_GET();
15322 _PyUnicode_InternMortal(interp, &s);
15323 return s;
15324 }
15325
15326
15327 void
_PyUnicode_ClearInterned(PyInterpreterState * interp)15328 _PyUnicode_ClearInterned(PyInterpreterState *interp)
15329 {
15330 PyObject *interned = get_interned_dict(interp);
15331 if (interned == NULL) {
15332 return;
15333 }
15334 assert(PyDict_CheckExact(interned));
15335
15336 if (has_shared_intern_dict(interp)) {
15337 // the dict doesn't belong to this interpreter, skip the debug
15338 // checks on it and just clear the pointer to it
15339 clear_interned_dict(interp);
15340 return;
15341 }
15342
15343 #ifdef INTERNED_STATS
15344 fprintf(stderr, "releasing %zd interned strings\n",
15345 PyDict_GET_SIZE(interned));
15346
15347 Py_ssize_t total_length = 0;
15348 #endif
15349 Py_ssize_t pos = 0;
15350 PyObject *s, *ignored_value;
15351 while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
15352 assert(PyUnicode_IS_READY(s));
15353 int shared = 0;
15354 switch (PyUnicode_CHECK_INTERNED(s)) {
15355 case SSTATE_INTERNED_IMMORTAL:
15356 /* Make immortal interned strings mortal again.
15357 *
15358 * Currently, the runtime is not able to guarantee that it can exit
15359 * without allocations that carry over to a future initialization
15360 * of Python within the same process. i.e:
15361 * ./python -X showrefcount -c 'import itertools'
15362 * [237 refs, 237 blocks]
15363 *
15364 * This should remain disabled (`Py_DEBUG` only) until there is a
15365 * strict guarantee that no memory will be left after
15366 * `Py_Finalize`.
15367 */
15368 #ifdef Py_DEBUG
15369 // Skip the Immortal Instance check and restore
15370 // the two references (key and value) ignored
15371 // by PyUnicode_InternInPlace().
15372 _Py_SetMortal(s, 2);
15373 #ifdef Py_REF_DEBUG
15374 /* let's be pedantic with the ref total */
15375 _Py_IncRefTotal(_PyThreadState_GET());
15376 _Py_IncRefTotal(_PyThreadState_GET());
15377 #endif
15378 #ifdef INTERNED_STATS
15379 total_length += PyUnicode_GET_LENGTH(s);
15380 #endif
15381 #endif // Py_DEBUG
15382 break;
15383 case SSTATE_INTERNED_IMMORTAL_STATIC:
15384 /* It is shared between interpreters, so we should unmark it
15385 only when this is the last interpreter in which it's
15386 interned. We immortalize all the statically initialized
15387 strings during startup, so we can rely on the
15388 main interpreter to be the last one. */
15389 if (!_Py_IsMainInterpreter(interp)) {
15390 shared = 1;
15391 }
15392 break;
15393 case SSTATE_INTERNED_MORTAL:
15394 // Restore 2 references held by the interned dict; these will
15395 // be decref'd by clear_interned_dict's PyDict_Clear.
15396 Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
15397 #ifdef Py_REF_DEBUG
15398 /* let's be pedantic with the ref total */
15399 _Py_IncRefTotal(_PyThreadState_GET());
15400 _Py_IncRefTotal(_PyThreadState_GET());
15401 #endif
15402 break;
15403 case SSTATE_NOT_INTERNED:
15404 /* fall through */
15405 default:
15406 Py_UNREACHABLE();
15407 }
15408 if (!shared) {
15409 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
15410 }
15411 }
15412 #ifdef INTERNED_STATS
15413 fprintf(stderr,
15414 "total length of all interned strings: %zd characters\n",
15415 total_length);
15416 #endif
15417
15418 struct _Py_unicode_state *state = &interp->unicode;
15419 struct _Py_unicode_ids *ids = &state->ids;
15420 for (Py_ssize_t i=0; i < ids->size; i++) {
15421 Py_XINCREF(ids->array[i]);
15422 }
15423 clear_interned_dict(interp);
15424 if (_Py_IsMainInterpreter(interp)) {
15425 clear_global_interned_strings();
15426 }
15427 }
15428
15429
15430 /********************* Unicode Iterator **************************/
15431
15432 typedef struct {
15433 PyObject_HEAD
15434 Py_ssize_t it_index;
15435 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
15436 } unicodeiterobject;
15437
15438 static void
unicodeiter_dealloc(unicodeiterobject * it)15439 unicodeiter_dealloc(unicodeiterobject *it)
15440 {
15441 _PyObject_GC_UNTRACK(it);
15442 Py_XDECREF(it->it_seq);
15443 PyObject_GC_Del(it);
15444 }
15445
15446 static int
unicodeiter_traverse(unicodeiterobject * it,visitproc visit,void * arg)15447 unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15448 {
15449 Py_VISIT(it->it_seq);
15450 return 0;
15451 }
15452
15453 static PyObject *
unicodeiter_next(unicodeiterobject * it)15454 unicodeiter_next(unicodeiterobject *it)
15455 {
15456 PyObject *seq;
15457
15458 assert(it != NULL);
15459 seq = it->it_seq;
15460 if (seq == NULL)
15461 return NULL;
15462 assert(_PyUnicode_CHECK(seq));
15463
15464 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15465 int kind = PyUnicode_KIND(seq);
15466 const void *data = PyUnicode_DATA(seq);
15467 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15468 it->it_index++;
15469 return unicode_char(chr);
15470 }
15471
15472 it->it_seq = NULL;
15473 Py_DECREF(seq);
15474 return NULL;
15475 }
15476
15477 static PyObject *
unicode_ascii_iter_next(unicodeiterobject * it)15478 unicode_ascii_iter_next(unicodeiterobject *it)
15479 {
15480 assert(it != NULL);
15481 PyObject *seq = it->it_seq;
15482 if (seq == NULL) {
15483 return NULL;
15484 }
15485 assert(_PyUnicode_CHECK(seq));
15486 assert(PyUnicode_IS_COMPACT_ASCII(seq));
15487 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15488 const void *data = ((void*)(_PyASCIIObject_CAST(seq) + 1));
15489 Py_UCS1 chr = (Py_UCS1)PyUnicode_READ(PyUnicode_1BYTE_KIND,
15490 data, it->it_index);
15491 it->it_index++;
15492 return (PyObject*)&_Py_SINGLETON(strings).ascii[chr];
15493 }
15494 it->it_seq = NULL;
15495 Py_DECREF(seq);
15496 return NULL;
15497 }
15498
15499 static PyObject *
unicodeiter_len(unicodeiterobject * it,PyObject * Py_UNUSED (ignored))15500 unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
15501 {
15502 Py_ssize_t len = 0;
15503 if (it->it_seq)
15504 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
15505 return PyLong_FromSsize_t(len);
15506 }
15507
15508 PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15509
15510 static PyObject *
unicodeiter_reduce(unicodeiterobject * it,PyObject * Py_UNUSED (ignored))15511 unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
15512 {
15513 PyObject *iter = _PyEval_GetBuiltin(&_Py_ID(iter));
15514
15515 /* _PyEval_GetBuiltin can invoke arbitrary code,
15516 * call must be before access of iterator pointers.
15517 * see issue #101765 */
15518
15519 if (it->it_seq != NULL) {
15520 return Py_BuildValue("N(O)n", iter, it->it_seq, it->it_index);
15521 } else {
15522 PyObject *u = unicode_get_empty();
15523 if (u == NULL) {
15524 Py_XDECREF(iter);
15525 return NULL;
15526 }
15527 return Py_BuildValue("N(N)", iter, u);
15528 }
15529 }
15530
15531 PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15532
15533 static PyObject *
unicodeiter_setstate(unicodeiterobject * it,PyObject * state)15534 unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15535 {
15536 Py_ssize_t index = PyLong_AsSsize_t(state);
15537 if (index == -1 && PyErr_Occurred())
15538 return NULL;
15539 if (it->it_seq != NULL) {
15540 if (index < 0)
15541 index = 0;
15542 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15543 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15544 it->it_index = index;
15545 }
15546 Py_RETURN_NONE;
15547 }
15548
15549 PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15550
15551 static PyMethodDef unicodeiter_methods[] = {
15552 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
15553 length_hint_doc},
15554 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15555 reduce_doc},
15556 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15557 setstate_doc},
15558 {NULL, NULL} /* sentinel */
15559 };
15560
15561 PyTypeObject PyUnicodeIter_Type = {
15562 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15563 "str_iterator", /* tp_name */
15564 sizeof(unicodeiterobject), /* tp_basicsize */
15565 0, /* tp_itemsize */
15566 /* methods */
15567 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15568 0, /* tp_vectorcall_offset */
15569 0, /* tp_getattr */
15570 0, /* tp_setattr */
15571 0, /* tp_as_async */
15572 0, /* tp_repr */
15573 0, /* tp_as_number */
15574 0, /* tp_as_sequence */
15575 0, /* tp_as_mapping */
15576 0, /* tp_hash */
15577 0, /* tp_call */
15578 0, /* tp_str */
15579 PyObject_GenericGetAttr, /* tp_getattro */
15580 0, /* tp_setattro */
15581 0, /* tp_as_buffer */
15582 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15583 0, /* tp_doc */
15584 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15585 0, /* tp_clear */
15586 0, /* tp_richcompare */
15587 0, /* tp_weaklistoffset */
15588 PyObject_SelfIter, /* tp_iter */
15589 (iternextfunc)unicodeiter_next, /* tp_iternext */
15590 unicodeiter_methods, /* tp_methods */
15591 0,
15592 };
15593
15594 PyTypeObject _PyUnicodeASCIIIter_Type = {
15595 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15596 .tp_name = "str_ascii_iterator",
15597 .tp_basicsize = sizeof(unicodeiterobject),
15598 .tp_dealloc = (destructor)unicodeiter_dealloc,
15599 .tp_getattro = PyObject_GenericGetAttr,
15600 .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
15601 .tp_traverse = (traverseproc)unicodeiter_traverse,
15602 .tp_iter = PyObject_SelfIter,
15603 .tp_iternext = (iternextfunc)unicode_ascii_iter_next,
15604 .tp_methods = unicodeiter_methods,
15605 };
15606
15607 static PyObject *
unicode_iter(PyObject * seq)15608 unicode_iter(PyObject *seq)
15609 {
15610 unicodeiterobject *it;
15611
15612 if (!PyUnicode_Check(seq)) {
15613 PyErr_BadInternalCall();
15614 return NULL;
15615 }
15616 if (PyUnicode_IS_COMPACT_ASCII(seq)) {
15617 it = PyObject_GC_New(unicodeiterobject, &_PyUnicodeASCIIIter_Type);
15618 }
15619 else {
15620 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15621 }
15622 if (it == NULL)
15623 return NULL;
15624 it->it_index = 0;
15625 it->it_seq = Py_NewRef(seq);
15626 _PyObject_GC_TRACK(it);
15627 return (PyObject *)it;
15628 }
15629
15630 static int
encode_wstr_utf8(wchar_t * wstr,char ** str,const char * name)15631 encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
15632 {
15633 int res;
15634 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15635 if (res == -2) {
15636 PyErr_Format(PyExc_RuntimeError, "cannot encode %s", name);
15637 return -1;
15638 }
15639 if (res < 0) {
15640 PyErr_NoMemory();
15641 return -1;
15642 }
15643 return 0;
15644 }
15645
15646
15647 static int
config_get_codec_name(wchar_t ** config_encoding)15648 config_get_codec_name(wchar_t **config_encoding)
15649 {
15650 char *encoding;
15651 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15652 return -1;
15653 }
15654
15655 PyObject *name_obj = NULL;
15656 PyObject *codec = _PyCodec_Lookup(encoding);
15657 PyMem_RawFree(encoding);
15658
15659 if (!codec)
15660 goto error;
15661
15662 name_obj = PyObject_GetAttrString(codec, "name");
15663 Py_CLEAR(codec);
15664 if (!name_obj) {
15665 goto error;
15666 }
15667
15668 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15669 Py_DECREF(name_obj);
15670 if (wname == NULL) {
15671 goto error;
15672 }
15673
15674 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15675 if (raw_wname == NULL) {
15676 PyMem_Free(wname);
15677 PyErr_NoMemory();
15678 goto error;
15679 }
15680
15681 PyMem_RawFree(*config_encoding);
15682 *config_encoding = raw_wname;
15683
15684 PyMem_Free(wname);
15685 return 0;
15686
15687 error:
15688 Py_XDECREF(codec);
15689 Py_XDECREF(name_obj);
15690 return -1;
15691 }
15692
15693
15694 static PyStatus
init_stdio_encoding(PyInterpreterState * interp)15695 init_stdio_encoding(PyInterpreterState *interp)
15696 {
15697 /* Update the stdio encoding to the normalized Python codec name. */
15698 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
15699 if (config_get_codec_name(&config->stdio_encoding) < 0) {
15700 return _PyStatus_ERR("failed to get the Python codec name "
15701 "of the stdio encoding");
15702 }
15703 return _PyStatus_OK();
15704 }
15705
15706
15707 static int
init_fs_codec(PyInterpreterState * interp)15708 init_fs_codec(PyInterpreterState *interp)
15709 {
15710 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
15711
15712 _Py_error_handler error_handler;
15713 error_handler = get_error_handler_wide(config->filesystem_errors);
15714 if (error_handler == _Py_ERROR_UNKNOWN) {
15715 PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
15716 return -1;
15717 }
15718
15719 char *encoding, *errors;
15720 if (encode_wstr_utf8(config->filesystem_encoding,
15721 &encoding,
15722 "filesystem_encoding") < 0) {
15723 return -1;
15724 }
15725
15726 if (encode_wstr_utf8(config->filesystem_errors,
15727 &errors,
15728 "filesystem_errors") < 0) {
15729 PyMem_RawFree(encoding);
15730 return -1;
15731 }
15732
15733 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
15734 PyMem_RawFree(fs_codec->encoding);
15735 fs_codec->encoding = encoding;
15736 /* encoding has been normalized by init_fs_encoding() */
15737 fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
15738 PyMem_RawFree(fs_codec->errors);
15739 fs_codec->errors = errors;
15740 fs_codec->error_handler = error_handler;
15741
15742 #ifdef _Py_FORCE_UTF8_FS_ENCODING
15743 assert(fs_codec->utf8 == 1);
15744 #endif
15745
15746 /* At this point, PyUnicode_EncodeFSDefault() and
15747 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
15748 the C implementation of the filesystem encoding. */
15749
15750 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
15751 global configuration variables. */
15752 if (_Py_IsMainInterpreter(interp)) {
15753
15754 if (_Py_SetFileSystemEncoding(fs_codec->encoding,
15755 fs_codec->errors) < 0) {
15756 PyErr_NoMemory();
15757 return -1;
15758 }
15759 }
15760 return 0;
15761 }
15762
15763
15764 static PyStatus
init_fs_encoding(PyThreadState * tstate)15765 init_fs_encoding(PyThreadState *tstate)
15766 {
15767 PyInterpreterState *interp = tstate->interp;
15768
15769 /* Update the filesystem encoding to the normalized Python codec name.
15770 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
15771 (Python codec name). */
15772 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
15773 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
15774 _Py_DumpPathConfig(tstate);
15775 return _PyStatus_ERR("failed to get the Python codec "
15776 "of the filesystem encoding");
15777 }
15778
15779 if (init_fs_codec(interp) < 0) {
15780 return _PyStatus_ERR("cannot initialize filesystem codec");
15781 }
15782 return _PyStatus_OK();
15783 }
15784
15785
15786 PyStatus
_PyUnicode_InitEncodings(PyThreadState * tstate)15787 _PyUnicode_InitEncodings(PyThreadState *tstate)
15788 {
15789 PyStatus status = _PyCodec_InitRegistry(tstate->interp);
15790 if (_PyStatus_EXCEPTION(status)) {
15791 return status;
15792 }
15793 status = init_fs_encoding(tstate);
15794 if (_PyStatus_EXCEPTION(status)) {
15795 return status;
15796 }
15797
15798 return init_stdio_encoding(tstate->interp);
15799 }
15800
15801
15802 static void
_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec * fs_codec)15803 _PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
15804 {
15805 PyMem_RawFree(fs_codec->encoding);
15806 fs_codec->encoding = NULL;
15807 fs_codec->utf8 = 0;
15808 PyMem_RawFree(fs_codec->errors);
15809 fs_codec->errors = NULL;
15810 fs_codec->error_handler = _Py_ERROR_UNKNOWN;
15811 }
15812
15813
#ifdef MS_WINDOWS
/* Switch the current interpreter's filesystem encoding to mbcs/replace
 * (PEP 529) and re-initialize the filesystem codec.
 * Returns 0 on success, -1 with an exception set on failure.
 */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    /* The configuration is modified in place, hence the cast. */
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Allocate both replacement strings before touching the config. */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (new_encoding == NULL || new_errors == NULL) {
        PyMem_RawFree(new_encoding);
        PyMem_RawFree(new_errors);
        PyErr_NoMemory();
        return -1;
    }

    PyMem_RawFree(config->filesystem_encoding);
    config->filesystem_encoding = new_encoding;
    PyMem_RawFree(config->filesystem_errors);
    config->filesystem_errors = new_errors;

    return init_fs_codec(interp);
}
#endif
15839
15840
#ifdef Py_DEBUG
/* Debug helper: true when the main interpreter has no interned dict. */
static inline int
unicode_is_finalizing(void)
{
    PyObject *main_interned = get_interned_dict(_PyInterpreterState_Main());
    return (main_interned == NULL);
}
#endif
15848
15849
15850 void
_PyUnicode_FiniTypes(PyInterpreterState * interp)15851 _PyUnicode_FiniTypes(PyInterpreterState *interp)
15852 {
15853 _PyStaticType_FiniBuiltin(interp, &EncodingMapType);
15854 _PyStaticType_FiniBuiltin(interp, &PyFieldNameIter_Type);
15855 _PyStaticType_FiniBuiltin(interp, &PyFormatterIter_Type);
15856 }
15857
15858
15859 void
_PyUnicode_Fini(PyInterpreterState * interp)15860 _PyUnicode_Fini(PyInterpreterState *interp)
15861 {
15862 struct _Py_unicode_state *state = &interp->unicode;
15863
15864 if (!has_shared_intern_dict(interp)) {
15865 // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
15866 assert(get_interned_dict(interp) == NULL);
15867 }
15868
15869 _PyUnicode_FiniEncodings(&state->fs_codec);
15870
15871 // bpo-47182: force a unicodedata CAPI capsule re-import on
15872 // subsequent initialization of interpreter.
15873 interp->unicode.ucnhash_capi = NULL;
15874
15875 unicode_clear_identifiers(state);
15876 }
15877
15878 /* A _string module, to export formatter_parser and formatter_field_name_split
15879 to the string.Formatter class implemented in Python. */
15880
15881 static PyMethodDef _string_methods[] = {
15882 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15883 METH_O, PyDoc_STR("split the argument as a field name")},
15884 {"formatter_parser", (PyCFunction) formatter_parser,
15885 METH_O, PyDoc_STR("parse the argument as a format string")},
15886 {NULL, NULL}
15887 };
15888
15889 static PyModuleDef_Slot module_slots[] = {
15890 {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
15891 {Py_mod_gil, Py_MOD_GIL_NOT_USED},
15892 {0, NULL}
15893 };
15894
15895 static struct PyModuleDef _string_module = {
15896 PyModuleDef_HEAD_INIT,
15897 .m_name = "_string",
15898 .m_doc = PyDoc_STR("string helper module"),
15899 .m_size = 0,
15900 .m_methods = _string_methods,
15901 .m_slots = module_slots,
15902 };
15903
15904 PyMODINIT_FUNC
PyInit__string(void)15905 PyInit__string(void)
15906 {
15907 return PyModuleDef_Init(&_string_module);
15908 }
15909