1 /*
2
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6 Major speed upgrades to the method implementations at the Reykjavik
7 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9 Copyright (c) Corporation for National Research Initiatives.
10
11 --------------------------------------------------------------------
12 The original string type implementation is:
13
14 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
16
17 By obtaining, using, and/or copying this software and/or its
18 associated documentation, you agree that you have read, understood,
19 and will comply with the following terms and conditions:
20
21 Permission to use, copy, modify, and distribute this software and its
22 associated documentation for any purpose and without fee is hereby
23 granted, provided that the above copyright notice appears in all
24 copies, and that both that copyright notice and this permission notice
25 appear in supporting documentation, and that the name of Secret Labs
26 AB or the author not be used in advertising or publicity pertaining to
27 distribution of the software without specific, written prior
28 permission.
29
30 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37 --------------------------------------------------------------------
38
39 */
40
41 #define PY_SSIZE_T_CLEAN
42 #include "Python.h"
43 #include "pycore_abstract.h" // _PyIndex_Check()
44 #include "pycore_atomic_funcs.h" // _Py_atomic_size_get()
45 #include "pycore_bytesobject.h" // _PyBytes_Repeat()
46 #include "pycore_bytes_methods.h" // _Py_bytes_lower()
47 #include "pycore_format.h" // F_LJUST
48 #include "pycore_initconfig.h" // _PyStatus_OK()
49 #include "pycore_interp.h" // PyInterpreterState.fs_codec
50 #include "pycore_long.h" // _PyLong_FormatWriter()
51 #include "pycore_object.h" // _PyObject_GC_TRACK(), _Py_FatalRefcountError()
52 #include "pycore_pathconfig.h" // _Py_DumpPathConfig()
53 #include "pycore_pylifecycle.h" // _Py_SetFileSystemEncoding()
54 #include "pycore_pystate.h" // _PyInterpreterState_GET()
55 #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
56 #include "pycore_unicodeobject.h" // struct _Py_unicode_state
57 #include "stringlib/eq.h" // unicode_eq()
58
59 #ifdef MS_WINDOWS
60 #include <windows.h>
61 #endif
62
63 #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
64 # include "pycore_fileutils.h" // _Py_LocaleUsesNonUnicodeWchar()
65 #endif
66
67 /* Uncomment to display statistics on interned strings at exit
68 in _PyUnicode_ClearInterned(). */
69 /* #define INTERNED_STATS 1 */
70
71
72 /*[clinic input]
73 class str "PyObject *" "&PyUnicode_Type"
74 [clinic start generated code]*/
75 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
76
77 /*[python input]
78 class Py_UCS4_converter(CConverter):
79 type = 'Py_UCS4'
80 converter = 'convert_uc'
81
82 def converter_init(self):
83 if self.default is not unspecified:
84 self.c_default = ascii(self.default)
85 if len(self.c_default) > 4 or self.c_default[0] != "'":
86 self.c_default = hex(ord(self.default))
87
88 [python start generated code]*/
89 /*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
90
91 /* --- Globals ------------------------------------------------------------
92
93 NOTE: In the interpreter's initialization phase, some globals are currently
94 initialized dynamically as needed. In the process Unicode objects may
95 be created before the Unicode type is ready.
96
97 */
98
99
100 #ifdef __cplusplus
101 extern "C" {
102 #endif
103
104 // Maximum code point of Unicode 6.0: 0x10ffff (1,114,111).
105 // The value must be the same in fileutils.c.
106 #define MAX_UNICODE 0x10ffff
107
108 #ifdef Py_DEBUG
109 # define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
110 #else
111 # define _PyUnicode_CHECK(op) PyUnicode_Check(op)
112 #endif
113
114 #define _PyUnicode_UTF8(op) \
115 (_PyCompactUnicodeObject_CAST(op)->utf8)
116 #define PyUnicode_UTF8(op) \
117 (assert(_PyUnicode_CHECK(op)), \
118 assert(PyUnicode_IS_READY(op)), \
119 PyUnicode_IS_COMPACT_ASCII(op) ? \
120 ((char*)(_PyASCIIObject_CAST(op) + 1)) : \
121 _PyUnicode_UTF8(op))
122 #define _PyUnicode_UTF8_LENGTH(op) \
123 (_PyCompactUnicodeObject_CAST(op)->utf8_length)
124 #define PyUnicode_UTF8_LENGTH(op) \
125 (assert(_PyUnicode_CHECK(op)), \
126 assert(PyUnicode_IS_READY(op)), \
127 PyUnicode_IS_COMPACT_ASCII(op) ? \
128 _PyASCIIObject_CAST(op)->length : \
129 _PyUnicode_UTF8_LENGTH(op))
130 #define _PyUnicode_WSTR(op) \
131 (_PyASCIIObject_CAST(op)->wstr)
132
133 /* Don't use deprecated macro of unicodeobject.h */
134 #undef PyUnicode_WSTR_LENGTH
135 #define PyUnicode_WSTR_LENGTH(op) \
136 (PyUnicode_IS_COMPACT_ASCII(op) ? \
137 _PyASCIIObject_CAST(op)->length : \
138 _PyCompactUnicodeObject_CAST(op)->wstr_length)
139 #define _PyUnicode_WSTR_LENGTH(op) \
140 (_PyCompactUnicodeObject_CAST(op)->wstr_length)
141 #define _PyUnicode_LENGTH(op) \
142 (_PyASCIIObject_CAST(op)->length)
143 #define _PyUnicode_STATE(op) \
144 (_PyASCIIObject_CAST(op)->state)
145 #define _PyUnicode_HASH(op) \
146 (_PyASCIIObject_CAST(op)->hash)
147 #define _PyUnicode_KIND(op) \
148 (assert(_PyUnicode_CHECK(op)), \
149 _PyASCIIObject_CAST(op)->state.kind)
150 #define _PyUnicode_GET_LENGTH(op) \
151 (assert(_PyUnicode_CHECK(op)), \
152 _PyASCIIObject_CAST(op)->length)
153 #define _PyUnicode_DATA_ANY(op) \
154 (_PyUnicodeObject_CAST(op)->data.any)
155
156 #undef PyUnicode_READY
157 #define PyUnicode_READY(op) \
158 (assert(_PyUnicode_CHECK(op)), \
159 (PyUnicode_IS_READY(op) ? \
160 0 : \
161 _PyUnicode_Ready(op)))
162
163 #define _PyUnicode_SHARE_UTF8(op) \
164 (assert(_PyUnicode_CHECK(op)), \
165 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
166 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of `op` is the same address as its character
   data (PyUnicode_DATA(op)).
   The expansion only refers to the macro parameter `op`; it does not depend
   on any variable name in the caller's scope. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
170
171 /* true if the Unicode object has an allocated UTF-8 memory block
172 (not shared with other data) */
173 #define _PyUnicode_HAS_UTF8_MEMORY(op) \
174 ((!PyUnicode_IS_COMPACT_ASCII(op) \
175 && _PyUnicode_UTF8(op) \
176 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
177
178 /* true if the Unicode object has an allocated wstr memory block
179 (not shared with other data) */
180 #define _PyUnicode_HAS_WSTR_MEMORY(op) \
181 ((_PyUnicode_WSTR(op) && \
182 (!PyUnicode_IS_READY(op) || \
183 _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
184
185 /* Generic helper macro to convert characters of different types.
186 from_type and to_type have to be valid type names, begin and end
187 are pointers to the source characters which should be of type
188 "from_type *". to is a pointer of type "to_type *" and points to the
189 buffer where the result characters are written to. */
190 #define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
191 do { \
192 to_type *_to = (to_type *)(to); \
193 const from_type *_iter = (const from_type *)(begin);\
194 const from_type *_end = (const from_type *)(end);\
195 Py_ssize_t n = (_end) - (_iter); \
196 const from_type *_unrolled_end = \
197 _iter + _Py_SIZE_ROUND_DOWN(n, 4); \
198 while (_iter < (_unrolled_end)) { \
199 _to[0] = (to_type) _iter[0]; \
200 _to[1] = (to_type) _iter[1]; \
201 _to[2] = (to_type) _iter[2]; \
202 _to[3] = (to_type) _iter[3]; \
203 _iter += 4; _to += 4; \
204 } \
205 while (_iter < (_end)) \
206 *_to++ = (to_type) *_iter++; \
207 } while (0)
208
/* Pointer (cast to PyObject*) to the entry of the _Py_SINGLETON(strings)
   tables for code point `ch`: ascii[ch] when ch < 128, latin1[ch - 128]
   otherwise.
   The argument is parenthesized so that expressions such as `x & 0xff` are
   handled correctly; note that it is still evaluated more than once.
   NOTE(review): callers are presumably expected to pass 0..255 -- confirm. */
#define LATIN1(ch)  \
    ((ch) < 128 \
     ? (PyObject*)&_Py_SINGLETON(strings).ascii[(ch)] \
     : (PyObject*)&_Py_SINGLETON(strings).latin1[(ch) - 128])
213
214 #ifdef MS_WINDOWS
/* On Windows, overallocating by 50% is the best factor */
216 # define OVERALLOCATE_FACTOR 2
217 #else
/* On Linux, overallocating by 25% is the best factor */
219 # define OVERALLOCATE_FACTOR 4
220 #endif
221
222 /* This dictionary holds all interned unicode strings. Note that references
223 to strings in this dictionary are *not* counted in the string's ob_refcnt.
224 When the interned string reaches a refcnt of 0 the string deallocation
225 function will delete the reference from this dictionary.
226
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
229 */
230 static PyObject *interned = NULL;
231
232 /* Forward declaration */
233 static inline int
234 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
235 static inline void
236 _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
237 static PyObject *
238 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
239 const char *errors);
240 static PyObject *
241 unicode_decode_utf8(const char *s, Py_ssize_t size,
242 _Py_error_handler error_handler, const char *errors,
243 Py_ssize_t *consumed);
244 #ifdef Py_DEBUG
245 static inline int unicode_is_finalizing(void);
246 static int unicode_is_singleton(PyObject *unicode);
247 #endif
248
249
250 // Return a borrowed reference to the empty string singleton.
unicode_get_empty(void)251 static inline PyObject* unicode_get_empty(void)
252 {
253 _Py_DECLARE_STR(empty, "");
254 return &_Py_STR(empty);
255 }
256
257
258 // Return a strong reference to the empty string singleton.
unicode_new_empty(void)259 static inline PyObject* unicode_new_empty(void)
260 {
261 PyObject *empty = unicode_get_empty();
262 Py_INCREF(empty);
263 return empty;
264 }
265
266 #define _Py_RETURN_UNICODE_EMPTY() \
267 do { \
268 return unicode_new_empty(); \
269 } while (0)
270
271 static inline void
unicode_fill(enum PyUnicode_Kind kind,void * data,Py_UCS4 value,Py_ssize_t start,Py_ssize_t length)272 unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
273 Py_ssize_t start, Py_ssize_t length)
274 {
275 assert(0 <= start);
276 assert(kind != PyUnicode_WCHAR_KIND);
277 switch (kind) {
278 case PyUnicode_1BYTE_KIND: {
279 assert(value <= 0xff);
280 Py_UCS1 ch = (unsigned char)value;
281 Py_UCS1 *to = (Py_UCS1 *)data + start;
282 memset(to, ch, length);
283 break;
284 }
285 case PyUnicode_2BYTE_KIND: {
286 assert(value <= 0xffff);
287 Py_UCS2 ch = (Py_UCS2)value;
288 Py_UCS2 *to = (Py_UCS2 *)data + start;
289 const Py_UCS2 *end = to + length;
290 for (; to < end; ++to) *to = ch;
291 break;
292 }
293 case PyUnicode_4BYTE_KIND: {
294 assert(value <= MAX_UNICODE);
295 Py_UCS4 ch = value;
296 Py_UCS4 * to = (Py_UCS4 *)data + start;
297 const Py_UCS4 *end = to + length;
298 for (; to < end; ++to) *to = ch;
299 break;
300 }
301 default: Py_UNREACHABLE();
302 }
303 }
304
305
/* Fast detection of the most frequent whitespace characters.
   Indexed by ASCII code (0x00..0x7F); an entry is 1 for a whitespace
   character and 0 otherwise.  Sixteen entries per row. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
336
337 /* forward */
338 static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
339 static PyObject* get_latin1_char(unsigned char ch);
340 static int unicode_modifiable(PyObject *unicode);
341
342
343 static PyObject *
344 _PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
345 static PyObject *
346 _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
347 static PyObject *
348 _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
349
350 static PyObject *
351 unicode_encode_call_errorhandler(const char *errors,
352 PyObject **errorHandler,const char *encoding, const char *reason,
353 PyObject *unicode, PyObject **exceptionObject,
354 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
355
356 static void
357 raise_encode_exception(PyObject **exceptionObject,
358 const char *encoding,
359 PyObject *unicode,
360 Py_ssize_t startpos, Py_ssize_t endpos,
361 const char *reason);
362
/* Fast detection of ASCII line break characters.
   Indexed by ASCII code (0x00..0x7F); an entry is 1 for a line break
   character and 0 otherwise.  Sixteen entries per row. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
390
391 static int convert_uc(PyObject *obj, void *addr);
392
393 struct encoding_map;
394 #include "clinic/unicodeobject.c.h"
395
396 _Py_error_handler
_Py_GetErrorHandler(const char * errors)397 _Py_GetErrorHandler(const char *errors)
398 {
399 if (errors == NULL || strcmp(errors, "strict") == 0) {
400 return _Py_ERROR_STRICT;
401 }
402 if (strcmp(errors, "surrogateescape") == 0) {
403 return _Py_ERROR_SURROGATEESCAPE;
404 }
405 if (strcmp(errors, "replace") == 0) {
406 return _Py_ERROR_REPLACE;
407 }
408 if (strcmp(errors, "ignore") == 0) {
409 return _Py_ERROR_IGNORE;
410 }
411 if (strcmp(errors, "backslashreplace") == 0) {
412 return _Py_ERROR_BACKSLASHREPLACE;
413 }
414 if (strcmp(errors, "surrogatepass") == 0) {
415 return _Py_ERROR_SURROGATEPASS;
416 }
417 if (strcmp(errors, "xmlcharrefreplace") == 0) {
418 return _Py_ERROR_XMLCHARREFREPLACE;
419 }
420 return _Py_ERROR_OTHER;
421 }
422
423
424 static _Py_error_handler
get_error_handler_wide(const wchar_t * errors)425 get_error_handler_wide(const wchar_t *errors)
426 {
427 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
428 return _Py_ERROR_STRICT;
429 }
430 if (wcscmp(errors, L"surrogateescape") == 0) {
431 return _Py_ERROR_SURROGATEESCAPE;
432 }
433 if (wcscmp(errors, L"replace") == 0) {
434 return _Py_ERROR_REPLACE;
435 }
436 if (wcscmp(errors, L"ignore") == 0) {
437 return _Py_ERROR_IGNORE;
438 }
439 if (wcscmp(errors, L"backslashreplace") == 0) {
440 return _Py_ERROR_BACKSLASHREPLACE;
441 }
442 if (wcscmp(errors, L"surrogatepass") == 0) {
443 return _Py_ERROR_SURROGATEPASS;
444 }
445 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
446 return _Py_ERROR_XMLCHARREFREPLACE;
447 }
448 return _Py_ERROR_OTHER;
449 }
450
451
452 static inline int
unicode_check_encoding_errors(const char * encoding,const char * errors)453 unicode_check_encoding_errors(const char *encoding, const char *errors)
454 {
455 if (encoding == NULL && errors == NULL) {
456 return 0;
457 }
458
459 PyInterpreterState *interp = _PyInterpreterState_GET();
460 #ifndef Py_DEBUG
461 /* In release mode, only check in development mode (-X dev) */
462 if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
463 return 0;
464 }
465 #else
466 /* Always check in debug mode */
467 #endif
468
469 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
470 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
471 if (!interp->unicode.fs_codec.encoding) {
472 return 0;
473 }
474
475 /* Disable checks during Python finalization. For example, it allows to
476 call _PyObject_Dump() during finalization for debugging purpose. */
477 if (interp->finalizing) {
478 return 0;
479 }
480
481 if (encoding != NULL) {
482 PyObject *handler = _PyCodec_Lookup(encoding);
483 if (handler == NULL) {
484 return -1;
485 }
486 Py_DECREF(handler);
487 }
488
489 if (errors != NULL) {
490 PyObject *handler = PyCodec_LookupError(errors);
491 if (handler == NULL) {
492 return -1;
493 }
494 Py_DECREF(handler);
495 }
496 return 0;
497 }
498
499
/* Check the internal invariants of the str object `op`.

   Every violated invariant is reported through
   _PyObject_ASSERT_FAILED_MSG() with the failing expression as message.
   The function returns 1 after all checks have been evaluated, which lets
   callers wrap it in assert().

   If `check_content` is nonzero (and the string is not in the legacy
   wchar_t representation), the characters are also scanned to verify that
   the narrowest possible kind is used and that the data is followed by a
   null character; this part is O(n). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
/* Report `expr` (as text) against `op` when it evaluates to false. */
#define CHECK(expr) \
    do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)

    assert(op != NULL);
    CHECK(PyUnicode_Check(op));

    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
    unsigned int kind = ascii->state.kind;

    /* Case 1: compact ASCII string. */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        CHECK(kind == PyUnicode_1BYTE_KIND);
        CHECK(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
        void *data;

        /* Case 2: compact, non-ASCII string; the character data starts
           right after the PyCompactUnicodeObject header. */
        if (ascii->state.compact == 1) {
            data = compact + 1;
            CHECK(kind == PyUnicode_1BYTE_KIND
                                 || kind == PyUnicode_2BYTE_KIND
                                 || kind == PyUnicode_4BYTE_KIND);
            CHECK(ascii->state.ascii == 0);
            CHECK(ascii->state.ready == 1);
            CHECK(compact->utf8 != data);
        }
        else {
            /* Case 3: non-compact string; the data pointer lives in the
               PyUnicodeObject. */
            PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* Not ready yet: only the wstr buffer is populated. */
                CHECK(ascii->length == 0);
                CHECK(ascii->hash == -1);
                CHECK(ascii->state.compact == 0);
                CHECK(ascii->state.ascii == 0);
                CHECK(ascii->state.ready == 0);
                CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
                CHECK(ascii->wstr != NULL);
                CHECK(data == NULL);
                CHECK(compact->utf8 == NULL);
            }
            else {
                CHECK(kind == PyUnicode_1BYTE_KIND
                                     || kind == PyUnicode_2BYTE_KIND
                                     || kind == PyUnicode_4BYTE_KIND);
                CHECK(ascii->state.compact == 0);
                CHECK(ascii->state.ready == 1);
                CHECK(data != NULL);
                /* For an ASCII string the utf8 pointer must alias the data;
                   otherwise it must not. */
                if (ascii->state.ascii) {
                    CHECK(compact->utf8 == data);
                    CHECK(compact->utf8_length == ascii->length);
                }
                else
                    CHECK(compact->utf8 != data);
            }
        }
        /* wstr must alias the data exactly when the character width of the
           kind equals sizeof(wchar_t). */
        if (kind != PyUnicode_WCHAR_KIND) {
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                CHECK(ascii->wstr == data);
                CHECK(compact->wstr_length == ascii->length);
            } else
                CHECK(ascii->wstr != data);
        }

        /* A missing cache buffer must have a zero cached length. */
        if (compact->utf8 == NULL)
            CHECK(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            CHECK(compact->wstr_length == 0);
    }

    /* check that the best kind is used: O(n) operation */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        const void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        /* Find the largest code point in the string. */
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        /* The largest code point must fall in the range that requires
           exactly this kind (and this ascii flag). */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                CHECK(maxchar >= 128);
                CHECK(maxchar <= 255);
            }
            else
                CHECK(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            CHECK(maxchar >= 0x100);
            CHECK(maxchar <= 0xFFFF);
        }
        else {
            CHECK(maxchar >= 0x10000);
            CHECK(maxchar <= MAX_UNICODE);
        }
        /* The slot just past the last character must hold a null. */
        CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;

#undef CHECK
}
616
617
618 static PyObject*
unicode_result_wchar(PyObject * unicode)619 unicode_result_wchar(PyObject *unicode)
620 {
621 #ifndef Py_DEBUG
622 Py_ssize_t len;
623
624 len = _PyUnicode_WSTR_LENGTH(unicode);
625 if (len == 0) {
626 Py_DECREF(unicode);
627 _Py_RETURN_UNICODE_EMPTY();
628 }
629
630 if (len == 1) {
631 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
632 if ((Py_UCS4)ch < 256) {
633 Py_DECREF(unicode);
634 return get_latin1_char((unsigned char)ch);
635 }
636 }
637
638 if (_PyUnicode_Ready(unicode) < 0) {
639 Py_DECREF(unicode);
640 return NULL;
641 }
642 #else
643 assert(Py_REFCNT(unicode) == 1);
644
645 /* don't make the result ready in debug mode to ensure that the caller
646 makes the string ready before using it */
647 assert(_PyUnicode_CheckConsistency(unicode, 1));
648 #endif
649 return unicode;
650 }
651
652 static PyObject*
unicode_result_ready(PyObject * unicode)653 unicode_result_ready(PyObject *unicode)
654 {
655 Py_ssize_t length;
656
657 length = PyUnicode_GET_LENGTH(unicode);
658 if (length == 0) {
659 PyObject *empty = unicode_get_empty();
660 if (unicode != empty) {
661 Py_DECREF(unicode);
662 Py_INCREF(empty);
663 }
664 return empty;
665 }
666
667 if (length == 1) {
668 int kind = PyUnicode_KIND(unicode);
669 if (kind == PyUnicode_1BYTE_KIND) {
670 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
671 Py_UCS1 ch = data[0];
672 PyObject *latin1_char = LATIN1(ch);
673 if (unicode != latin1_char) {
674 Py_INCREF(latin1_char);
675 Py_DECREF(unicode);
676 }
677 return latin1_char;
678 }
679 }
680
681 assert(_PyUnicode_CheckConsistency(unicode, 1));
682 return unicode;
683 }
684
685 static PyObject*
unicode_result(PyObject * unicode)686 unicode_result(PyObject *unicode)
687 {
688 assert(_PyUnicode_CHECK(unicode));
689 if (PyUnicode_IS_READY(unicode))
690 return unicode_result_ready(unicode);
691 else
692 return unicode_result_wchar(unicode);
693 }
694
695 static PyObject*
unicode_result_unchanged(PyObject * unicode)696 unicode_result_unchanged(PyObject *unicode)
697 {
698 if (PyUnicode_CheckExact(unicode)) {
699 if (PyUnicode_READY(unicode) == -1)
700 return NULL;
701 Py_INCREF(unicode);
702 return unicode;
703 }
704 else
705 /* Subtype -- return genuine unicode string with the same value. */
706 return _PyUnicode_Copy(unicode);
707 }
708
709 /* Implementation of the "backslashreplace" error handler for 8-bit encodings:
710 ASCII, Latin1, UTF-8, etc. */
711 static char*
backslashreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)712 backslashreplace(_PyBytesWriter *writer, char *str,
713 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
714 {
715 Py_ssize_t size, i;
716 Py_UCS4 ch;
717 enum PyUnicode_Kind kind;
718 const void *data;
719
720 assert(PyUnicode_IS_READY(unicode));
721 kind = PyUnicode_KIND(unicode);
722 data = PyUnicode_DATA(unicode);
723
724 size = 0;
725 /* determine replacement size */
726 for (i = collstart; i < collend; ++i) {
727 Py_ssize_t incr;
728
729 ch = PyUnicode_READ(kind, data, i);
730 if (ch < 0x100)
731 incr = 2+2;
732 else if (ch < 0x10000)
733 incr = 2+4;
734 else {
735 assert(ch <= MAX_UNICODE);
736 incr = 2+8;
737 }
738 if (size > PY_SSIZE_T_MAX - incr) {
739 PyErr_SetString(PyExc_OverflowError,
740 "encoded result is too long for a Python string");
741 return NULL;
742 }
743 size += incr;
744 }
745
746 str = _PyBytesWriter_Prepare(writer, str, size);
747 if (str == NULL)
748 return NULL;
749
750 /* generate replacement */
751 for (i = collstart; i < collend; ++i) {
752 ch = PyUnicode_READ(kind, data, i);
753 *str++ = '\\';
754 if (ch >= 0x00010000) {
755 *str++ = 'U';
756 *str++ = Py_hexdigits[(ch>>28)&0xf];
757 *str++ = Py_hexdigits[(ch>>24)&0xf];
758 *str++ = Py_hexdigits[(ch>>20)&0xf];
759 *str++ = Py_hexdigits[(ch>>16)&0xf];
760 *str++ = Py_hexdigits[(ch>>12)&0xf];
761 *str++ = Py_hexdigits[(ch>>8)&0xf];
762 }
763 else if (ch >= 0x100) {
764 *str++ = 'u';
765 *str++ = Py_hexdigits[(ch>>12)&0xf];
766 *str++ = Py_hexdigits[(ch>>8)&0xf];
767 }
768 else
769 *str++ = 'x';
770 *str++ = Py_hexdigits[(ch>>4)&0xf];
771 *str++ = Py_hexdigits[ch&0xf];
772 }
773 return str;
774 }
775
776 /* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
777 ASCII, Latin1, UTF-8, etc. */
778 static char*
xmlcharrefreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)779 xmlcharrefreplace(_PyBytesWriter *writer, char *str,
780 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
781 {
782 Py_ssize_t size, i;
783 Py_UCS4 ch;
784 enum PyUnicode_Kind kind;
785 const void *data;
786
787 assert(PyUnicode_IS_READY(unicode));
788 kind = PyUnicode_KIND(unicode);
789 data = PyUnicode_DATA(unicode);
790
791 size = 0;
792 /* determine replacement size */
793 for (i = collstart; i < collend; ++i) {
794 Py_ssize_t incr;
795
796 ch = PyUnicode_READ(kind, data, i);
797 if (ch < 10)
798 incr = 2+1+1;
799 else if (ch < 100)
800 incr = 2+2+1;
801 else if (ch < 1000)
802 incr = 2+3+1;
803 else if (ch < 10000)
804 incr = 2+4+1;
805 else if (ch < 100000)
806 incr = 2+5+1;
807 else if (ch < 1000000)
808 incr = 2+6+1;
809 else {
810 assert(ch <= MAX_UNICODE);
811 incr = 2+7+1;
812 }
813 if (size > PY_SSIZE_T_MAX - incr) {
814 PyErr_SetString(PyExc_OverflowError,
815 "encoded result is too long for a Python string");
816 return NULL;
817 }
818 size += incr;
819 }
820
821 str = _PyBytesWriter_Prepare(writer, str, size);
822 if (str == NULL)
823 return NULL;
824
825 /* generate replacement */
826 for (i = collstart; i < collend; ++i) {
827 size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
828 if (size < 0) {
829 return NULL;
830 }
831 str += size;
832 }
833 return str;
834 }
835
836 /* --- Bloom Filters ----------------------------------------------------- */
837
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low-order
   bits of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit
   index. */
841
842 /* the linebreak mask is set up by _PyUnicode_Init() below */
843
844 #if LONG_BIT >= 128
845 #define BLOOM_WIDTH 128
846 #elif LONG_BIT >= 64
847 #define BLOOM_WIDTH 64
848 #elif LONG_BIT >= 32
849 #define BLOOM_WIDTH 32
850 #else
851 #error "LONG_BIT is smaller than 32"
852 #endif
853
854 #define BLOOM_MASK unsigned long
855
856 static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
857
/* Nonzero if the bit selected by `ch & (BLOOM_WIDTH - 1)` is set in `mask`.
   Both arguments are parenthesized so that expressions such as `a | b` can
   be passed as the mask. */
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
859
860 #define BLOOM_LINEBREAK(ch) \
861 ((ch) < 128U ? ascii_linebreak[(ch)] : \
862 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
863
864 static inline BLOOM_MASK
make_bloom_mask(int kind,const void * ptr,Py_ssize_t len)865 make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
866 {
867 #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
868 do { \
869 TYPE *data = (TYPE *)PTR; \
870 TYPE *end = data + LEN; \
871 Py_UCS4 ch; \
872 for (; data != end; data++) { \
873 ch = *data; \
874 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
875 } \
876 break; \
877 } while (0)
878
879 /* calculate simple bloom-style bitmask for a given unicode string */
880
881 BLOOM_MASK mask;
882
883 mask = 0;
884 switch (kind) {
885 case PyUnicode_1BYTE_KIND:
886 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
887 break;
888 case PyUnicode_2BYTE_KIND:
889 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
890 break;
891 case PyUnicode_4BYTE_KIND:
892 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
893 break;
894 default:
895 Py_UNREACHABLE();
896 }
897 return mask;
898
899 #undef BLOOM_UPDATE
900 }
901
902 static int
ensure_unicode(PyObject * obj)903 ensure_unicode(PyObject *obj)
904 {
905 if (!PyUnicode_Check(obj)) {
906 PyErr_Format(PyExc_TypeError,
907 "must be str, not %.100s",
908 Py_TYPE(obj)->tp_name);
909 return -1;
910 }
911 return PyUnicode_READY(obj);
912 }
913
914 /* Compilation of templated routines */
915
916 #define STRINGLIB_GET_EMPTY() unicode_get_empty()
917
918 #include "stringlib/asciilib.h"
919 #include "stringlib/fastsearch.h"
920 #include "stringlib/partition.h"
921 #include "stringlib/split.h"
922 #include "stringlib/count.h"
923 #include "stringlib/find.h"
924 #include "stringlib/find_max_char.h"
925 #include "stringlib/undef.h"
926
927 #include "stringlib/ucs1lib.h"
928 #include "stringlib/fastsearch.h"
929 #include "stringlib/partition.h"
930 #include "stringlib/split.h"
931 #include "stringlib/count.h"
932 #include "stringlib/find.h"
933 #include "stringlib/replace.h"
934 #include "stringlib/find_max_char.h"
935 #include "stringlib/undef.h"
936
937 #include "stringlib/ucs2lib.h"
938 #include "stringlib/fastsearch.h"
939 #include "stringlib/partition.h"
940 #include "stringlib/split.h"
941 #include "stringlib/count.h"
942 #include "stringlib/find.h"
943 #include "stringlib/replace.h"
944 #include "stringlib/find_max_char.h"
945 #include "stringlib/undef.h"
946
947 #include "stringlib/ucs4lib.h"
948 #include "stringlib/fastsearch.h"
949 #include "stringlib/partition.h"
950 #include "stringlib/split.h"
951 #include "stringlib/count.h"
952 #include "stringlib/find.h"
953 #include "stringlib/replace.h"
954 #include "stringlib/find_max_char.h"
955 #include "stringlib/undef.h"
956
957 _Py_COMP_DIAG_PUSH
958 _Py_COMP_DIAG_IGNORE_DEPR_DECLS
959 #include "stringlib/unicodedefs.h"
960 #include "stringlib/fastsearch.h"
961 #include "stringlib/count.h"
962 #include "stringlib/find.h"
963 #include "stringlib/undef.h"
964 _Py_COMP_DIAG_POP
965
966 #undef STRINGLIB_GET_EMPTY
967
968 /* --- Unicode Object ----------------------------------------------------- */
969
970 static inline Py_ssize_t
findchar(const void * s,int kind,Py_ssize_t size,Py_UCS4 ch,int direction)971 findchar(const void *s, int kind,
972 Py_ssize_t size, Py_UCS4 ch,
973 int direction)
974 {
975 switch (kind) {
976 case PyUnicode_1BYTE_KIND:
977 if ((Py_UCS1) ch != ch)
978 return -1;
979 if (direction > 0)
980 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
981 else
982 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
983 case PyUnicode_2BYTE_KIND:
984 if ((Py_UCS2) ch != ch)
985 return -1;
986 if (direction > 0)
987 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
988 else
989 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
990 case PyUnicode_4BYTE_KIND:
991 if (direction > 0)
992 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
993 else
994 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
995 default:
996 Py_UNREACHABLE();
997 }
998 }
999
1000 #ifdef Py_DEBUG
1001 /* Fill the data of a Unicode string with invalid characters to detect bugs
1002 earlier.
1003
1004 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
1005 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
1006 invalid character in Unicode 6.0. */
1007 static void
unicode_fill_invalid(PyObject * unicode,Py_ssize_t old_length)1008 unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
1009 {
1010 int kind = PyUnicode_KIND(unicode);
1011 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
1012 Py_ssize_t length = _PyUnicode_LENGTH(unicode);
1013 if (length <= old_length)
1014 return;
1015 memset(data + old_length * kind, 0xff, (length - old_length) * kind);
1016 }
1017 #endif
1018
/* Resize the compact string `unicode` to hold `length` characters by
   reallocating the object itself.

   Returns the (possibly moved) object, or NULL with MemoryError set if the
   requested size would overflow or the reallocation fails.  On success the
   caller must use the returned pointer instead of `unicode`.

   The string must be modifiable, ready and compact (asserted). */
static PyObject*
resize_compact(PyObject *unicode, Py_ssize_t length)
{
    Py_ssize_t char_size;
    Py_ssize_t struct_size;
    Py_ssize_t new_size;
    int share_wstr;
    PyObject *new_unicode;
#ifdef Py_DEBUG
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

    assert(unicode_modifiable(unicode));
    assert(PyUnicode_IS_READY(unicode));
    assert(PyUnicode_IS_COMPACT(unicode));

    /* The kind value is used as the size of one character in bytes. */
    char_size = PyUnicode_KIND(unicode);
    if (PyUnicode_IS_ASCII(unicode))
        struct_size = sizeof(PyASCIIObject);
    else
        struct_size = sizeof(PyCompactUnicodeObject);
    /* Remember whether wstr aliases the data before the block may move. */
    share_wstr = _PyUnicode_SHARE_WSTR(unicode);

    /* Guard against overflow of struct_size + (length + 1) * char_size. */
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    /* header + characters + one extra slot for the trailing null */
    new_size = (struct_size + (length + 1) * char_size);

    /* Drop a separately allocated UTF-8 buffer. */
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
        PyObject_Free(_PyUnicode_UTF8(unicode));
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
    }
    /* NOTE(review): the two debug-only statements below presumably undo the
       reference bookkeeping for the old address so that the
       _Py_NewReference() calls after the realloc balance out -- confirm. */
#ifdef Py_REF_DEBUG
    _Py_RefTotal--;
#endif
#ifdef Py_TRACE_REFS
    _Py_ForgetReference(unicode);
#endif

    new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
    if (new_unicode == NULL) {
        /* Reallocation failed: `unicode` is re-registered as a live
           object before reporting the error. */
        _Py_NewReference(unicode);
        PyErr_NoMemory();
        return NULL;
    }
    unicode = new_unicode;
    _Py_NewReference(unicode);

    _PyUnicode_LENGTH(unicode) = length;
    if (share_wstr) {
        /* wstr aliased the data: point it at the (possibly moved) data. */
        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = length;
    }
    else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
        /* Drop a separately allocated wstr buffer. */
        PyObject_Free(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
#ifdef Py_DEBUG
    unicode_fill_invalid(unicode, old_length);
#endif
    /* Write the trailing null character after the last character. */
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
                    length, 0);
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return unicode;
}
1089
/* Resize a non-compact (legacy) string without reallocating the object.

   `unicode` must be non-compact and have a reference count of 1 (both are
   asserted).  If the string is ready, its data buffer is reallocated to hold
   `length` characters plus a terminator, and the wstr and utf8 pointers are
   updated when they share that buffer.  If the string then still has a
   separate wstr buffer, or is not ready at all, the wstr buffer is
   reallocated as well.

   Return 0 on success.  On overflow or allocation failure, set MemoryError
   and return -1. */
static int
resize_inplace(PyObject *unicode, Py_ssize_t length)
{
    wchar_t *wstr;
    Py_ssize_t new_size;
    assert(!PyUnicode_IS_COMPACT(unicode));
    assert(Py_REFCNT(unicode) == 1);

    if (PyUnicode_IS_READY(unicode)) {
        Py_ssize_t char_size;
        int share_wstr, share_utf8;
        void *data;
#ifdef Py_DEBUG
        Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

        /* Capture the sharing state before the buffer may move. */
        data = _PyUnicode_DATA_ANY(unicode);
        char_size = PyUnicode_KIND(unicode);
        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);

        /* Refuse lengths for which (length + 1) characters would exceed
           PY_SSIZE_T_MAX bytes. */
        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
            PyErr_NoMemory();
            return -1;
        }
        new_size = (length + 1) * char_size;

        /* Drop a separately allocated UTF-8 cache. */
        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
        {
            PyObject_Free(_PyUnicode_UTF8(unicode));
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }

        /* Only the local `data` is overwritten here; on failure the object
           still refers to its old buffer. */
        data = (PyObject *)PyObject_Realloc(data, new_size);
        if (data == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = data;
        /* Re-point the representations that shared the data buffer. */
        if (share_wstr) {
            _PyUnicode_WSTR(unicode) = data;
            _PyUnicode_WSTR_LENGTH(unicode) = length;
        }
        if (share_utf8) {
            _PyUnicode_UTF8(unicode) = data;
            _PyUnicode_UTF8_LENGTH(unicode) = length;
        }
        _PyUnicode_LENGTH(unicode) = length;
        /* Write the terminating null character at index `length`. */
        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
#ifdef Py_DEBUG
        unicode_fill_invalid(unicode, old_length);
#endif
        /* Done unless a separate wstr buffer still has to be resized. */
        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
            assert(_PyUnicode_CheckConsistency(unicode, 0));
            return 0;
        }
    }
    assert(_PyUnicode_WSTR(unicode) != NULL);

    /* check for integer overflow */
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return -1;
    }
    new_size = sizeof(wchar_t) * (length + 1);
    wstr = _PyUnicode_WSTR(unicode);
    wstr = PyObject_Realloc(wstr, new_size);
    if (!wstr) {
        PyErr_NoMemory();
        return -1;
    }
    _PyUnicode_WSTR(unicode) = wstr;
    _PyUnicode_WSTR(unicode)[length] = 0;
    _PyUnicode_WSTR_LENGTH(unicode) = length;
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return 0;
}
1168
1169 static PyObject*
resize_copy(PyObject * unicode,Py_ssize_t length)1170 resize_copy(PyObject *unicode, Py_ssize_t length)
1171 {
1172 Py_ssize_t copy_length;
1173 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
1174 PyObject *copy;
1175
1176 assert(PyUnicode_IS_READY(unicode));
1177
1178 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1179 if (copy == NULL)
1180 return NULL;
1181
1182 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1183 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1184 return copy;
1185 }
1186 else {
1187 PyObject *w;
1188
1189 w = (PyObject*)_PyUnicode_New(length);
1190 if (w == NULL)
1191 return NULL;
1192 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1193 copy_length = Py_MIN(copy_length, length);
1194 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1195 copy_length * sizeof(wchar_t));
1196 return w;
1197 }
1198 }
1199
/* We allocate one more character than requested to make sure the string
   is U+0000 terminated; some code (e.g. new_identifier) relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
1208
/* Create a legacy string: a non-compact, not-ready object whose only
   representation is a separately allocated Py_UNICODE buffer of
   `length` + 1 elements.

   A length of 0 returns the result of unicode_new_empty().

   Return the new object, or NULL with an exception set: MemoryError if the
   size would overflow or an allocation fails, SystemError if `length` is
   negative. */
static PyUnicodeObject *
_PyUnicode_New(Py_ssize_t length)
{
    PyUnicodeObject *unicode;
    size_t new_size;

    /* Optimization for empty strings */
    if (length == 0) {
        return (PyUnicodeObject *)unicode_new_empty();
    }

    /* Ensure we won't overflow the size. */
    if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
        return (PyUnicodeObject *)PyErr_NoMemory();
    }
    if (length < 0) {
        PyErr_SetString(PyExc_SystemError,
                        "Negative size passed to _PyUnicode_New");
        return NULL;
    }

    unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
    if (unicode == NULL)
        return NULL;
    /* Buffer size in bytes, including the terminating element. */
    new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);

    /* Start from a fully defined "not ready" state: no canonical data,
       no UTF-8 cache, kind 0. */
    _PyUnicode_WSTR_LENGTH(unicode) = length;
    _PyUnicode_HASH(unicode) = -1;
    _PyUnicode_STATE(unicode).interned = 0;
    _PyUnicode_STATE(unicode).kind = 0;
    _PyUnicode_STATE(unicode).compact = 0;
    _PyUnicode_STATE(unicode).ready = 0;
    _PyUnicode_STATE(unicode).ascii = 0;
    _PyUnicode_DATA_ANY(unicode) = NULL;
    _PyUnicode_LENGTH(unicode) = 0;
    _PyUnicode_UTF8(unicode) = NULL;
    _PyUnicode_UTF8_LENGTH(unicode) = 0;

    _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_Malloc(new_size);
    if (!_PyUnicode_WSTR(unicode)) {
        /* Releasing the only reference disposes of the half-built object. */
        Py_DECREF(unicode);
        PyErr_NoMemory();
        return NULL;
    }

    /* Initialize the first element to guard against cases where
     * the caller fails before initializing str -- unicode_resize()
     * reads str[0], and the Keep-Alive optimization can keep memory
     * allocated for str alive across a call to unicode_dealloc(unicode).
     * We don't want unicode_resize to read uninitialized memory in
     * that case.
     */
    _PyUnicode_WSTR(unicode)[0] = 0;
    _PyUnicode_WSTR(unicode)[length] = 0;

    assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
    return unicode;
}
1267
1268 static const char*
unicode_kind_name(PyObject * unicode)1269 unicode_kind_name(PyObject *unicode)
1270 {
1271 /* don't check consistency: unicode_kind_name() is called from
1272 _PyUnicode_Dump() */
1273 if (!PyUnicode_IS_COMPACT(unicode))
1274 {
1275 if (!PyUnicode_IS_READY(unicode))
1276 return "wstr";
1277 switch (PyUnicode_KIND(unicode))
1278 {
1279 case PyUnicode_1BYTE_KIND:
1280 if (PyUnicode_IS_ASCII(unicode))
1281 return "legacy ascii";
1282 else
1283 return "legacy latin1";
1284 case PyUnicode_2BYTE_KIND:
1285 return "legacy UCS2";
1286 case PyUnicode_4BYTE_KIND:
1287 return "legacy UCS4";
1288 default:
1289 return "<legacy invalid kind>";
1290 }
1291 }
1292 assert(PyUnicode_IS_READY(unicode));
1293 switch (PyUnicode_KIND(unicode)) {
1294 case PyUnicode_1BYTE_KIND:
1295 if (PyUnicode_IS_ASCII(unicode))
1296 return "ascii";
1297 else
1298 return "latin1";
1299 case PyUnicode_2BYTE_KIND:
1300 return "UCS2";
1301 case PyUnicode_4BYTE_KIND:
1302 return "UCS4";
1303 default:
1304 return "<invalid compact kind>";
1305 }
1306 }
1307
1308 #ifdef Py_DEBUG
1309 /* Functions wrapping macros for use in debugger */
_PyUnicode_utf8(void * unicode_raw)1310 const char *_PyUnicode_utf8(void *unicode_raw){
1311 PyObject *unicode = _PyObject_CAST(unicode_raw);
1312 return PyUnicode_UTF8(unicode);
1313 }
1314
_PyUnicode_compact_data(void * unicode_raw)1315 const void *_PyUnicode_compact_data(void *unicode_raw) {
1316 PyObject *unicode = _PyObject_CAST(unicode_raw);
1317 return _PyUnicode_COMPACT_DATA(unicode);
1318 }
_PyUnicode_data(void * unicode_raw)1319 const void *_PyUnicode_data(void *unicode_raw) {
1320 PyObject *unicode = _PyObject_CAST(unicode_raw);
1321 printf("obj %p\n", (void*)unicode);
1322 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1323 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1324 printf("ascii op %p\n", (void*)(_PyASCIIObject_CAST(unicode) + 1));
1325 printf("compact op %p\n", (void*)(_PyCompactUnicodeObject_CAST(unicode) + 1));
1326 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1327 return PyUnicode_DATA(unicode);
1328 }
1329
1330 void
_PyUnicode_Dump(PyObject * op)1331 _PyUnicode_Dump(PyObject *op)
1332 {
1333 PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
1334 PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
1335 PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
1336 const void *data;
1337
1338 if (ascii->state.compact)
1339 {
1340 if (ascii->state.ascii)
1341 data = (ascii + 1);
1342 else
1343 data = (compact + 1);
1344 }
1345 else
1346 data = unicode->data.any;
1347 printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
1348
1349 if (ascii->wstr == data)
1350 printf("shared ");
1351 printf("wstr=%p", (void *)ascii->wstr);
1352
1353 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
1354 printf(" (%zu), ", compact->wstr_length);
1355 if (!ascii->state.compact && compact->utf8 == unicode->data.any) {
1356 printf("shared ");
1357 }
1358 printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
1359 }
1360 printf(", data=%p\n", data);
1361 }
1362 #endif
1363
1364
/* Create a new compact string with room for `size` characters whose code
   points do not exceed `maxchar`.

   `maxchar` selects the storage: below 128 gives a 1-byte ASCII string,
   below 256 a 1-byte non-ASCII string, below 65536 a 2-byte string, and
   anything else up to MAX_UNICODE a 4-byte string.  Header and characters
   are allocated as a single block.  Only the terminating null character is
   written; the other characters are left as allocated (filled with 0xff
   bytes when Py_DEBUG is defined).

   A size of 0 returns the result of unicode_new_empty() before any other
   argument is examined.

   Return the new object, or NULL with an exception set: SystemError for a
   negative size or a maxchar above MAX_UNICODE, MemoryError if the size
   would overflow or the allocation fails. */
PyObject *
PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
{
    /* Optimization for empty strings */
    if (size == 0) {
        return unicode_new_empty();
    }

    PyObject *obj;
    PyCompactUnicodeObject *unicode;
    void *data;
    enum PyUnicode_Kind kind;
    int is_sharing, is_ascii;
    Py_ssize_t char_size;
    Py_ssize_t struct_size;

    is_ascii = 0;
    is_sharing = 0;
    struct_size = sizeof(PyCompactUnicodeObject);
    /* Pick the narrowest kind that can hold maxchar.  is_sharing is set
       when wchar_t has the same width as the chosen kind, so the character
       data can double as the wstr representation. */
    if (maxchar < 128) {
        kind = PyUnicode_1BYTE_KIND;
        char_size = 1;
        is_ascii = 1;
        struct_size = sizeof(PyASCIIObject);
    }
    else if (maxchar < 256) {
        kind = PyUnicode_1BYTE_KIND;
        char_size = 1;
    }
    else if (maxchar < 65536) {
        kind = PyUnicode_2BYTE_KIND;
        char_size = 2;
        if (sizeof(wchar_t) == 2)
            is_sharing = 1;
    }
    else {
        if (maxchar > MAX_UNICODE) {
            PyErr_SetString(PyExc_SystemError,
                            "invalid maximum character passed to PyUnicode_New");
            return NULL;
        }
        kind = PyUnicode_4BYTE_KIND;
        char_size = 4;
        if (sizeof(wchar_t) == 4)
            is_sharing = 1;
    }

    /* Ensure we won't overflow the size. */
    if (size < 0) {
        PyErr_SetString(PyExc_SystemError,
                        "Negative size passed to PyUnicode_New");
        return NULL;
    }
    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
        return PyErr_NoMemory();

    /* Duplicated allocation code from _PyObject_New() instead of a call to
     * PyObject_New() so we are able to allocate space for the object and
     * its data buffer.
     */
    obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
    if (obj == NULL) {
        return PyErr_NoMemory();
    }
    _PyObject_Init(obj, &PyUnicode_Type);

    /* The characters start right after the header that is actually used. */
    unicode = (PyCompactUnicodeObject *)obj;
    if (is_ascii)
        data = ((PyASCIIObject*)obj) + 1;
    else
        data = unicode + 1;
    _PyUnicode_LENGTH(unicode) = size;
    _PyUnicode_HASH(unicode) = -1;
    _PyUnicode_STATE(unicode).interned = 0;
    _PyUnicode_STATE(unicode).kind = kind;
    _PyUnicode_STATE(unicode).compact = 1;
    _PyUnicode_STATE(unicode).ready = 1;
    _PyUnicode_STATE(unicode).ascii = is_ascii;
    if (is_ascii) {
        /* ASCII objects were allocated with the smaller PyASCIIObject
           header: only the terminator and wstr are set here. */
        ((char*)data)[size] = 0;
        _PyUnicode_WSTR(unicode) = NULL;
    }
    else if (kind == PyUnicode_1BYTE_KIND) {
        ((char*)data)[size] = 0;
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
        unicode->utf8 = NULL;
        unicode->utf8_length = 0;
    }
    else {
        unicode->utf8 = NULL;
        unicode->utf8_length = 0;
        if (kind == PyUnicode_2BYTE_KIND)
            ((Py_UCS2*)data)[size] = 0;
        else /* kind == PyUnicode_4BYTE_KIND */
            ((Py_UCS4*)data)[size] = 0;
        if (is_sharing) {
            /* wstr aliases the character data. */
            _PyUnicode_WSTR_LENGTH(unicode) = size;
            _PyUnicode_WSTR(unicode) = (wchar_t *)data;
        }
        else {
            _PyUnicode_WSTR_LENGTH(unicode) = 0;
            _PyUnicode_WSTR(unicode) = NULL;
        }
    }
#ifdef Py_DEBUG
    unicode_fill_invalid((PyObject*)unicode, 0);
#endif
    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
    return obj;
}
1476
1477 #if SIZEOF_WCHAR_T == 2
1478 /* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1479 will decode surrogate pairs, the other conversions are implemented as macros
1480 for efficiency.
1481
1482 This function assumes that unicode can hold one more code point than wstr
1483 characters for a terminating null character. */
1484 static void
unicode_convert_wchar_to_ucs4(const wchar_t * begin,const wchar_t * end,PyObject * unicode)1485 unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
1486 PyObject *unicode)
1487 {
1488 const wchar_t *iter;
1489 Py_UCS4 *ucs4_out;
1490
1491 assert(unicode != NULL);
1492 assert(_PyUnicode_CHECK(unicode));
1493 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1494 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1495
1496 for (iter = begin; iter < end; ) {
1497 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1498 _PyUnicode_GET_LENGTH(unicode)));
1499 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1500 && (iter+1) < end
1501 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1502 {
1503 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1504 iter += 2;
1505 }
1506 else {
1507 *ucs4_out++ = *iter;
1508 iter++;
1509 }
1510 }
1511 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1512 _PyUnicode_GET_LENGTH(unicode)));
1513
1514 }
1515 #endif
1516
1517 static int
unicode_check_modifiable(PyObject * unicode)1518 unicode_check_modifiable(PyObject *unicode)
1519 {
1520 if (!unicode_modifiable(unicode)) {
1521 PyErr_SetString(PyExc_SystemError,
1522 "Cannot modify a string currently used");
1523 return -1;
1524 }
1525 return 0;
1526 }
1527
1528 static int
_copy_characters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many,int check_maxchar)1529 _copy_characters(PyObject *to, Py_ssize_t to_start,
1530 PyObject *from, Py_ssize_t from_start,
1531 Py_ssize_t how_many, int check_maxchar)
1532 {
1533 unsigned int from_kind, to_kind;
1534 const void *from_data;
1535 void *to_data;
1536
1537 assert(0 <= how_many);
1538 assert(0 <= from_start);
1539 assert(0 <= to_start);
1540 assert(PyUnicode_Check(from));
1541 assert(PyUnicode_IS_READY(from));
1542 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1543
1544 assert(PyUnicode_Check(to));
1545 assert(PyUnicode_IS_READY(to));
1546 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1547
1548 if (how_many == 0)
1549 return 0;
1550
1551 from_kind = PyUnicode_KIND(from);
1552 from_data = PyUnicode_DATA(from);
1553 to_kind = PyUnicode_KIND(to);
1554 to_data = PyUnicode_DATA(to);
1555
1556 #ifdef Py_DEBUG
1557 if (!check_maxchar
1558 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1559 {
1560 Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1561 Py_UCS4 ch;
1562 Py_ssize_t i;
1563 for (i=0; i < how_many; i++) {
1564 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1565 assert(ch <= to_maxchar);
1566 }
1567 }
1568 #endif
1569
1570 if (from_kind == to_kind) {
1571 if (check_maxchar
1572 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1573 {
1574 /* Writing Latin-1 characters into an ASCII string requires to
1575 check that all written characters are pure ASCII */
1576 Py_UCS4 max_char;
1577 max_char = ucs1lib_find_max_char(from_data,
1578 (const Py_UCS1*)from_data + how_many);
1579 if (max_char >= 128)
1580 return -1;
1581 }
1582 memcpy((char*)to_data + to_kind * to_start,
1583 (const char*)from_data + from_kind * from_start,
1584 to_kind * how_many);
1585 }
1586 else if (from_kind == PyUnicode_1BYTE_KIND
1587 && to_kind == PyUnicode_2BYTE_KIND)
1588 {
1589 _PyUnicode_CONVERT_BYTES(
1590 Py_UCS1, Py_UCS2,
1591 PyUnicode_1BYTE_DATA(from) + from_start,
1592 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1593 PyUnicode_2BYTE_DATA(to) + to_start
1594 );
1595 }
1596 else if (from_kind == PyUnicode_1BYTE_KIND
1597 && to_kind == PyUnicode_4BYTE_KIND)
1598 {
1599 _PyUnicode_CONVERT_BYTES(
1600 Py_UCS1, Py_UCS4,
1601 PyUnicode_1BYTE_DATA(from) + from_start,
1602 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1603 PyUnicode_4BYTE_DATA(to) + to_start
1604 );
1605 }
1606 else if (from_kind == PyUnicode_2BYTE_KIND
1607 && to_kind == PyUnicode_4BYTE_KIND)
1608 {
1609 _PyUnicode_CONVERT_BYTES(
1610 Py_UCS2, Py_UCS4,
1611 PyUnicode_2BYTE_DATA(from) + from_start,
1612 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1613 PyUnicode_4BYTE_DATA(to) + to_start
1614 );
1615 }
1616 else {
1617 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1618
1619 if (!check_maxchar) {
1620 if (from_kind == PyUnicode_2BYTE_KIND
1621 && to_kind == PyUnicode_1BYTE_KIND)
1622 {
1623 _PyUnicode_CONVERT_BYTES(
1624 Py_UCS2, Py_UCS1,
1625 PyUnicode_2BYTE_DATA(from) + from_start,
1626 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1627 PyUnicode_1BYTE_DATA(to) + to_start
1628 );
1629 }
1630 else if (from_kind == PyUnicode_4BYTE_KIND
1631 && to_kind == PyUnicode_1BYTE_KIND)
1632 {
1633 _PyUnicode_CONVERT_BYTES(
1634 Py_UCS4, Py_UCS1,
1635 PyUnicode_4BYTE_DATA(from) + from_start,
1636 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1637 PyUnicode_1BYTE_DATA(to) + to_start
1638 );
1639 }
1640 else if (from_kind == PyUnicode_4BYTE_KIND
1641 && to_kind == PyUnicode_2BYTE_KIND)
1642 {
1643 _PyUnicode_CONVERT_BYTES(
1644 Py_UCS4, Py_UCS2,
1645 PyUnicode_4BYTE_DATA(from) + from_start,
1646 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1647 PyUnicode_2BYTE_DATA(to) + to_start
1648 );
1649 }
1650 else {
1651 Py_UNREACHABLE();
1652 }
1653 }
1654 else {
1655 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1656 Py_UCS4 ch;
1657 Py_ssize_t i;
1658
1659 for (i=0; i < how_many; i++) {
1660 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1661 if (ch > to_maxchar)
1662 return -1;
1663 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1664 }
1665 }
1666 }
1667 return 0;
1668 }
1669
1670 void
_PyUnicode_FastCopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1671 _PyUnicode_FastCopyCharacters(
1672 PyObject *to, Py_ssize_t to_start,
1673 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1674 {
1675 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1676 }
1677
1678 Py_ssize_t
PyUnicode_CopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1679 PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1680 PyObject *from, Py_ssize_t from_start,
1681 Py_ssize_t how_many)
1682 {
1683 int err;
1684
1685 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1686 PyErr_BadInternalCall();
1687 return -1;
1688 }
1689
1690 if (PyUnicode_READY(from) == -1)
1691 return -1;
1692 if (PyUnicode_READY(to) == -1)
1693 return -1;
1694
1695 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1696 PyErr_SetString(PyExc_IndexError, "string index out of range");
1697 return -1;
1698 }
1699 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1700 PyErr_SetString(PyExc_IndexError, "string index out of range");
1701 return -1;
1702 }
1703 if (how_many < 0) {
1704 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1705 return -1;
1706 }
1707 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1708 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1709 PyErr_Format(PyExc_SystemError,
1710 "Cannot write %zi characters at %zi "
1711 "in a string of %zi characters",
1712 how_many, to_start, PyUnicode_GET_LENGTH(to));
1713 return -1;
1714 }
1715
1716 if (how_many == 0)
1717 return 0;
1718
1719 if (unicode_check_modifiable(to))
1720 return -1;
1721
1722 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1723 if (err) {
1724 PyErr_Format(PyExc_SystemError,
1725 "Cannot copy %s characters "
1726 "into a string of %s characters",
1727 unicode_kind_name(from),
1728 unicode_kind_name(to));
1729 return -1;
1730 }
1731 return how_many;
1732 }
1733
1734 /* Find the maximum code point and count the number of surrogate pairs so a
1735 correct string length can be computed before converting a string to UCS4.
1736 This function counts single surrogates as a character and not as a pair.
1737
1738 Return 0 on success, or -1 on error. */
1739 static int
find_maxchar_surrogates(const wchar_t * begin,const wchar_t * end,Py_UCS4 * maxchar,Py_ssize_t * num_surrogates)1740 find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1741 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1742 {
1743 const wchar_t *iter;
1744 Py_UCS4 ch;
1745
1746 assert(num_surrogates != NULL && maxchar != NULL);
1747 *num_surrogates = 0;
1748 *maxchar = 0;
1749
1750 for (iter = begin; iter < end; ) {
1751 #if SIZEOF_WCHAR_T == 2
1752 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1753 && (iter+1) < end
1754 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1755 {
1756 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1757 ++(*num_surrogates);
1758 iter += 2;
1759 }
1760 else
1761 #endif
1762 {
1763 ch = *iter;
1764 iter++;
1765 }
1766 if (ch > *maxchar) {
1767 *maxchar = ch;
1768 if (*maxchar > MAX_UNICODE) {
1769 PyErr_Format(PyExc_ValueError,
1770 "character U+%x is not in range [U+0000; U+%x]",
1771 ch, MAX_UNICODE);
1772 return -1;
1773 }
1774 }
1775 }
1776 return 0;
1777 }
1778
/* Convert a legacy string that only has a wchar_t representation into its
   canonical form, in place.

   The wchar_t buffer is scanned for its largest code point, which selects
   a 1-, 2- or 4-byte data buffer.  When wchar_t has the same width as the
   selected kind the existing buffer becomes the data buffer; otherwise a
   new buffer is allocated, filled by conversion, and the wchar_t buffer is
   freed.

   Return 0 on success.  Return -1 with an exception set if the buffer holds
   a code point above MAX_UNICODE (reported by find_maxchar_surrogates()) or
   if an allocation fails (MemoryError). */
int
_PyUnicode_Ready(PyObject *unicode)
{
    wchar_t *end;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
#if SIZEOF_WCHAR_T == 2
    Py_ssize_t length_wo_surrogates;
#endif

    /* _PyUnicode_Ready() is only intended for old-style API usage where
       strings were created using _PyObject_New() and where no canonical
       representation (the str field) has been set yet aka strings
       which are not yet ready. */
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
    assert(_PyUnicode_UTF8(unicode) == NULL);
    /* Actually, it should neither be interned nor be anything else: */
    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);

    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
                                &maxchar, &num_surrogates) == -1)
        return -1;

    /* Everything fits in one byte per character: always needs a new,
       narrower buffer. */
    if (maxchar < 256) {
        _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(_PyUnicode_WSTR_LENGTH(unicode) + 1);
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_1BYTE_DATA(unicode));
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
            /* Pure ASCII: the data buffer also serves as the UTF-8 form. */
            _PyUnicode_STATE(unicode).ascii = 1;
            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
            _PyUnicode_STATE(unicode).ascii = 0;
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }
        PyObject_Free(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* In this case we might have to convert down from 4-byte native
       wchar_t to 2-byte unicode. */
    else if (maxchar < 65536) {
        assert(num_surrogates == 0 &&
               "FindMaxCharAndNumSurrogatePairs() messed up");

#if SIZEOF_WCHAR_T == 2
        /* We can share representations and are done. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
#else
        /* sizeof(wchar_t) == 4 */
        _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(
            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_2BYTE_DATA(unicode));
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        PyObject_Free(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#endif
    }
    /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
    else {
#if SIZEOF_WCHAR_T == 2
        /* in case the native representation is 2-bytes, we need to allocate a
           new normalized 4-byte version. */
        /* Each surrogate pair collapses into a single character. */
        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
        if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(4 * (length_wo_surrogates + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        /* unicode_convert_wchar_to_ucs4() requires a ready string */
        _PyUnicode_STATE(unicode).ready = 1;
        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
        PyObject_Free(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#else
        assert(num_surrogates == 0);

        /* wchar_t is already 4 bytes wide: share the buffer. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
#endif
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    }
    _PyUnicode_STATE(unicode).ready = 1;
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return 0;
}
1907
/* Deallocate a string object.

   A mortal interned string is first removed from the `interned` dictionary.
   Then every buffer the object owns separately (wstr, utf8, and the data
   buffer of a non-compact string) is freed, and finally the object itself
   is released through its type's tp_free.  When Py_DEBUG is defined,
   deallocating a singleton outside of finalization is a fatal error. */
static void
unicode_dealloc(PyObject *unicode)
{
#ifdef Py_DEBUG
    if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) {
        _Py_FatalRefcountError("deallocating an Unicode singleton");
    }
#endif

    switch (PyUnicode_CHECK_INTERNED(unicode)) {
    case SSTATE_NOT_INTERNED:
        break;
    case SSTATE_INTERNED_MORTAL:
    {
        /* Revive the dead object temporarily. PyDict_DelItem() removes two
           references (key and value) which were ignored by
           PyUnicode_InternInPlace(). Use refcnt=3 rather than refcnt=2
           to prevent calling unicode_dealloc() again. Adjust refcnt after
           PyDict_DelItem(). */
        assert(Py_REFCNT(unicode) == 0);
        Py_SET_REFCNT(unicode, 3);
        if (PyDict_DelItem(interned, unicode) != 0) {
            /* The failure is reported as unraisable; deallocation goes on. */
            _PyErr_WriteUnraisableMsg("deletion of interned string failed",
                                      NULL);
        }
        assert(Py_REFCNT(unicode) == 1);
        Py_SET_REFCNT(unicode, 0);
        break;
    }

    case SSTATE_INTERNED_IMMORTAL:
        _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
        break;

    default:
        Py_UNREACHABLE();
    }

    /* Free only the buffers this object owns separately. */
    if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
        PyObject_Free(_PyUnicode_WSTR(unicode));
    }
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
        PyObject_Free(_PyUnicode_UTF8(unicode));
    }
    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
        PyObject_Free(_PyUnicode_DATA_ANY(unicode));
    }

    Py_TYPE(unicode)->tp_free(unicode);
}
1958
1959 #ifdef Py_DEBUG
1960 static int
unicode_is_singleton(PyObject * unicode)1961 unicode_is_singleton(PyObject *unicode)
1962 {
1963 if (unicode == &_Py_STR(empty)) {
1964 return 1;
1965 }
1966
1967 PyASCIIObject *ascii = _PyASCIIObject_CAST(unicode);
1968 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1) {
1969 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1970 if (ch < 256 && LATIN1(ch) == unicode) {
1971 return 1;
1972 }
1973 }
1974 return 0;
1975 }
1976 #endif
1977
1978 static int
unicode_modifiable(PyObject * unicode)1979 unicode_modifiable(PyObject *unicode)
1980 {
1981 assert(_PyUnicode_CHECK(unicode));
1982 if (Py_REFCNT(unicode) != 1)
1983 return 0;
1984 if (_PyUnicode_HASH(unicode) != -1)
1985 return 0;
1986 if (PyUnicode_CHECK_INTERNED(unicode))
1987 return 0;
1988 if (!PyUnicode_CheckExact(unicode))
1989 return 0;
1990 #ifdef Py_DEBUG
1991 /* singleton refcount is greater than 1 */
1992 assert(!unicode_is_singleton(unicode));
1993 #endif
1994 return 1;
1995 }
1996
1997 static int
unicode_resize(PyObject ** p_unicode,Py_ssize_t length)1998 unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1999 {
2000 PyObject *unicode;
2001 Py_ssize_t old_length;
2002
2003 assert(p_unicode != NULL);
2004 unicode = *p_unicode;
2005
2006 assert(unicode != NULL);
2007 assert(PyUnicode_Check(unicode));
2008 assert(0 <= length);
2009
2010 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
2011 old_length = PyUnicode_WSTR_LENGTH(unicode);
2012 else
2013 old_length = PyUnicode_GET_LENGTH(unicode);
2014 if (old_length == length)
2015 return 0;
2016
2017 if (length == 0) {
2018 PyObject *empty = unicode_new_empty();
2019 Py_SETREF(*p_unicode, empty);
2020 return 0;
2021 }
2022
2023 if (!unicode_modifiable(unicode)) {
2024 PyObject *copy = resize_copy(unicode, length);
2025 if (copy == NULL)
2026 return -1;
2027 Py_SETREF(*p_unicode, copy);
2028 return 0;
2029 }
2030
2031 if (PyUnicode_IS_COMPACT(unicode)) {
2032 PyObject *new_unicode = resize_compact(unicode, length);
2033 if (new_unicode == NULL)
2034 return -1;
2035 *p_unicode = new_unicode;
2036 return 0;
2037 }
2038 return resize_inplace(unicode, length);
2039 }
2040
2041 int
PyUnicode_Resize(PyObject ** p_unicode,Py_ssize_t length)2042 PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
2043 {
2044 PyObject *unicode;
2045 if (p_unicode == NULL) {
2046 PyErr_BadInternalCall();
2047 return -1;
2048 }
2049 unicode = *p_unicode;
2050 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
2051 {
2052 PyErr_BadInternalCall();
2053 return -1;
2054 }
2055 return unicode_resize(p_unicode, length);
2056 }
2057
2058 /* Copy an ASCII or latin1 char* string into a Python Unicode string.
2059
2060 WARNING: The function doesn't copy the terminating null character and
2061 doesn't check the maximum character (may write a latin1 character in an
2062 ASCII string). */
2063 static void
unicode_write_cstr(PyObject * unicode,Py_ssize_t index,const char * str,Py_ssize_t len)2064 unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2065 const char *str, Py_ssize_t len)
2066 {
2067 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
2068 const void *data = PyUnicode_DATA(unicode);
2069 const char *end = str + len;
2070
2071 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
2072 switch (kind) {
2073 case PyUnicode_1BYTE_KIND: {
2074 #ifdef Py_DEBUG
2075 if (PyUnicode_IS_ASCII(unicode)) {
2076 Py_UCS4 maxchar = ucs1lib_find_max_char(
2077 (const Py_UCS1*)str,
2078 (const Py_UCS1*)str + len);
2079 assert(maxchar < 128);
2080 }
2081 #endif
2082 memcpy((char *) data + index, str, len);
2083 break;
2084 }
2085 case PyUnicode_2BYTE_KIND: {
2086 Py_UCS2 *start = (Py_UCS2 *)data + index;
2087 Py_UCS2 *ucs2 = start;
2088
2089 for (; str < end; ++ucs2, ++str)
2090 *ucs2 = (Py_UCS2)*str;
2091
2092 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
2093 break;
2094 }
2095 case PyUnicode_4BYTE_KIND: {
2096 Py_UCS4 *start = (Py_UCS4 *)data + index;
2097 Py_UCS4 *ucs4 = start;
2098
2099 for (; str < end; ++ucs4, ++str)
2100 *ucs4 = (Py_UCS4)*str;
2101
2102 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
2103 break;
2104 }
2105 default:
2106 Py_UNREACHABLE();
2107 }
2108 }
2109
2110 static PyObject*
get_latin1_char(Py_UCS1 ch)2111 get_latin1_char(Py_UCS1 ch)
2112 {
2113 return Py_NewRef(LATIN1(ch));
2114 }
2115
2116 static PyObject*
unicode_char(Py_UCS4 ch)2117 unicode_char(Py_UCS4 ch)
2118 {
2119 PyObject *unicode;
2120
2121 assert(ch <= MAX_UNICODE);
2122
2123 if (ch < 256) {
2124 return get_latin1_char(ch);
2125 }
2126
2127 unicode = PyUnicode_New(1, ch);
2128 if (unicode == NULL)
2129 return NULL;
2130
2131 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2132 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
2133 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
2134 } else {
2135 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2136 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2137 }
2138 assert(_PyUnicode_CheckConsistency(unicode, 1));
2139 return unicode;
2140 }
2141
2142 PyObject *
PyUnicode_FromUnicode(const Py_UNICODE * u,Py_ssize_t size)2143 PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
2144 {
2145 if (u == NULL) {
2146 if (size > 0) {
2147 if (PyErr_WarnEx(PyExc_DeprecationWarning,
2148 "PyUnicode_FromUnicode(NULL, size) is deprecated; "
2149 "use PyUnicode_New() instead", 1) < 0) {
2150 return NULL;
2151 }
2152 }
2153 return (PyObject*)_PyUnicode_New(size);
2154 }
2155
2156 if (size < 0) {
2157 PyErr_BadInternalCall();
2158 return NULL;
2159 }
2160
2161 return PyUnicode_FromWideChar(u, size);
2162 }
2163
2164 PyObject *
PyUnicode_FromWideChar(const wchar_t * u,Py_ssize_t size)2165 PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2166 {
2167 PyObject *unicode;
2168 Py_UCS4 maxchar = 0;
2169 Py_ssize_t num_surrogates;
2170
2171 if (u == NULL && size != 0) {
2172 PyErr_BadInternalCall();
2173 return NULL;
2174 }
2175
2176 if (size == -1) {
2177 size = wcslen(u);
2178 }
2179
2180 /* If the Unicode data is known at construction time, we can apply
2181 some optimizations which share commonly used objects. */
2182
2183 /* Optimization for empty strings */
2184 if (size == 0)
2185 _Py_RETURN_UNICODE_EMPTY();
2186
2187 #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
2188 /* Oracle Solaris uses non-Unicode internal wchar_t form for
2189 non-Unicode locales and hence needs conversion to UCS-4 first. */
2190 if (_Py_LocaleUsesNonUnicodeWchar()) {
2191 wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
2192 if (!converted) {
2193 return NULL;
2194 }
2195 PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
2196 PyMem_Free(converted);
2197 return unicode;
2198 }
2199 #endif
2200
2201 /* Single character Unicode objects in the Latin-1 range are
2202 shared when using this constructor */
2203 if (size == 1 && (Py_UCS4)*u < 256)
2204 return get_latin1_char((unsigned char)*u);
2205
2206 /* If not empty and not single character, copy the Unicode data
2207 into the new object */
2208 if (find_maxchar_surrogates(u, u + size,
2209 &maxchar, &num_surrogates) == -1)
2210 return NULL;
2211
2212 unicode = PyUnicode_New(size - num_surrogates, maxchar);
2213 if (!unicode)
2214 return NULL;
2215
2216 switch (PyUnicode_KIND(unicode)) {
2217 case PyUnicode_1BYTE_KIND:
2218 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
2219 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2220 break;
2221 case PyUnicode_2BYTE_KIND:
2222 #if Py_UNICODE_SIZE == 2
2223 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2224 #else
2225 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
2226 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2227 #endif
2228 break;
2229 case PyUnicode_4BYTE_KIND:
2230 #if SIZEOF_WCHAR_T == 2
2231 /* This is the only case which has to process surrogates, thus
2232 a simple copy loop is not enough and we need a function. */
2233 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
2234 #else
2235 assert(num_surrogates == 0);
2236 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2237 #endif
2238 break;
2239 default:
2240 Py_UNREACHABLE();
2241 }
2242
2243 return unicode_result(unicode);
2244 }
2245
2246 PyObject *
PyUnicode_FromStringAndSize(const char * u,Py_ssize_t size)2247 PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2248 {
2249 if (size < 0) {
2250 PyErr_SetString(PyExc_SystemError,
2251 "Negative size passed to PyUnicode_FromStringAndSize");
2252 return NULL;
2253 }
2254 if (u != NULL) {
2255 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2256 }
2257 else {
2258 if (size > 0) {
2259 if (PyErr_WarnEx(PyExc_DeprecationWarning,
2260 "PyUnicode_FromStringAndSize(NULL, size) is deprecated; "
2261 "use PyUnicode_New() instead", 1) < 0) {
2262 return NULL;
2263 }
2264 }
2265 return (PyObject *)_PyUnicode_New(size);
2266 }
2267 }
2268
2269 PyObject *
PyUnicode_FromString(const char * u)2270 PyUnicode_FromString(const char *u)
2271 {
2272 size_t size = strlen(u);
2273 if (size > PY_SSIZE_T_MAX) {
2274 PyErr_SetString(PyExc_OverflowError, "input too long");
2275 return NULL;
2276 }
2277 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2278 }
2279
2280
2281 PyObject *
_PyUnicode_FromId(_Py_Identifier * id)2282 _PyUnicode_FromId(_Py_Identifier *id)
2283 {
2284 PyInterpreterState *interp = _PyInterpreterState_GET();
2285 struct _Py_unicode_ids *ids = &interp->unicode.ids;
2286
2287 Py_ssize_t index = _Py_atomic_size_get(&id->index);
2288 if (index < 0) {
2289 struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_ids;
2290
2291 PyThread_acquire_lock(rt_ids->lock, WAIT_LOCK);
2292 // Check again to detect concurrent access. Another thread can have
2293 // initialized the index while this thread waited for the lock.
2294 index = _Py_atomic_size_get(&id->index);
2295 if (index < 0) {
2296 assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2297 index = rt_ids->next_index;
2298 rt_ids->next_index++;
2299 _Py_atomic_size_set(&id->index, index);
2300 }
2301 PyThread_release_lock(rt_ids->lock);
2302 }
2303 assert(index >= 0);
2304
2305 PyObject *obj;
2306 if (index < ids->size) {
2307 obj = ids->array[index];
2308 if (obj) {
2309 // Return a borrowed reference
2310 return obj;
2311 }
2312 }
2313
2314 obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
2315 NULL, NULL);
2316 if (!obj) {
2317 return NULL;
2318 }
2319 PyUnicode_InternInPlace(&obj);
2320
2321 if (index >= ids->size) {
2322 // Overallocate to reduce the number of realloc
2323 Py_ssize_t new_size = Py_MAX(index * 2, 16);
2324 Py_ssize_t item_size = sizeof(ids->array[0]);
2325 PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
2326 if (new_array == NULL) {
2327 PyErr_NoMemory();
2328 return NULL;
2329 }
2330 memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
2331 ids->array = new_array;
2332 ids->size = new_size;
2333 }
2334
2335 // The array stores a strong reference
2336 ids->array[index] = obj;
2337
2338 // Return a borrowed reference
2339 return obj;
2340 }
2341
2342
2343 static void
unicode_clear_identifiers(struct _Py_unicode_state * state)2344 unicode_clear_identifiers(struct _Py_unicode_state *state)
2345 {
2346 struct _Py_unicode_ids *ids = &state->ids;
2347 for (Py_ssize_t i=0; i < ids->size; i++) {
2348 Py_XDECREF(ids->array[i]);
2349 }
2350 ids->size = 0;
2351 PyMem_Free(ids->array);
2352 ids->array = NULL;
2353 // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2354 // after Py_Finalize().
2355 }
2356
2357
2358 /* Internal function, doesn't check maximum character */
2359
2360 PyObject*
_PyUnicode_FromASCII(const char * buffer,Py_ssize_t size)2361 _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2362 {
2363 const unsigned char *s = (const unsigned char *)buffer;
2364 PyObject *unicode;
2365 if (size == 1) {
2366 #ifdef Py_DEBUG
2367 assert((unsigned char)s[0] < 128);
2368 #endif
2369 return get_latin1_char(s[0]);
2370 }
2371 unicode = PyUnicode_New(size, 127);
2372 if (!unicode)
2373 return NULL;
2374 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2375 assert(_PyUnicode_CheckConsistency(unicode, 1));
2376 return unicode;
2377 }
2378
2379 static Py_UCS4
kind_maxchar_limit(unsigned int kind)2380 kind_maxchar_limit(unsigned int kind)
2381 {
2382 switch (kind) {
2383 case PyUnicode_1BYTE_KIND:
2384 return 0x80;
2385 case PyUnicode_2BYTE_KIND:
2386 return 0x100;
2387 case PyUnicode_4BYTE_KIND:
2388 return 0x10000;
2389 default:
2390 Py_UNREACHABLE();
2391 }
2392 }
2393
2394 static PyObject*
_PyUnicode_FromUCS1(const Py_UCS1 * u,Py_ssize_t size)2395 _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2396 {
2397 PyObject *res;
2398 unsigned char max_char;
2399
2400 if (size == 0) {
2401 _Py_RETURN_UNICODE_EMPTY();
2402 }
2403 assert(size > 0);
2404 if (size == 1) {
2405 return get_latin1_char(u[0]);
2406 }
2407
2408 max_char = ucs1lib_find_max_char(u, u + size);
2409 res = PyUnicode_New(size, max_char);
2410 if (!res)
2411 return NULL;
2412 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2413 assert(_PyUnicode_CheckConsistency(res, 1));
2414 return res;
2415 }
2416
2417 static PyObject*
_PyUnicode_FromUCS2(const Py_UCS2 * u,Py_ssize_t size)2418 _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2419 {
2420 PyObject *res;
2421 Py_UCS2 max_char;
2422
2423 if (size == 0)
2424 _Py_RETURN_UNICODE_EMPTY();
2425 assert(size > 0);
2426 if (size == 1)
2427 return unicode_char(u[0]);
2428
2429 max_char = ucs2lib_find_max_char(u, u + size);
2430 res = PyUnicode_New(size, max_char);
2431 if (!res)
2432 return NULL;
2433 if (max_char >= 256)
2434 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2435 else {
2436 _PyUnicode_CONVERT_BYTES(
2437 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2438 }
2439 assert(_PyUnicode_CheckConsistency(res, 1));
2440 return res;
2441 }
2442
2443 static PyObject*
_PyUnicode_FromUCS4(const Py_UCS4 * u,Py_ssize_t size)2444 _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2445 {
2446 PyObject *res;
2447 Py_UCS4 max_char;
2448
2449 if (size == 0)
2450 _Py_RETURN_UNICODE_EMPTY();
2451 assert(size > 0);
2452 if (size == 1)
2453 return unicode_char(u[0]);
2454
2455 max_char = ucs4lib_find_max_char(u, u + size);
2456 res = PyUnicode_New(size, max_char);
2457 if (!res)
2458 return NULL;
2459 if (max_char < 256)
2460 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2461 PyUnicode_1BYTE_DATA(res));
2462 else if (max_char < 0x10000)
2463 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2464 PyUnicode_2BYTE_DATA(res));
2465 else
2466 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2467 assert(_PyUnicode_CheckConsistency(res, 1));
2468 return res;
2469 }
2470
2471 PyObject*
PyUnicode_FromKindAndData(int kind,const void * buffer,Py_ssize_t size)2472 PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2473 {
2474 if (size < 0) {
2475 PyErr_SetString(PyExc_ValueError, "size must be positive");
2476 return NULL;
2477 }
2478 switch (kind) {
2479 case PyUnicode_1BYTE_KIND:
2480 return _PyUnicode_FromUCS1(buffer, size);
2481 case PyUnicode_2BYTE_KIND:
2482 return _PyUnicode_FromUCS2(buffer, size);
2483 case PyUnicode_4BYTE_KIND:
2484 return _PyUnicode_FromUCS4(buffer, size);
2485 default:
2486 PyErr_SetString(PyExc_SystemError, "invalid kind");
2487 return NULL;
2488 }
2489 }
2490
2491 Py_UCS4
_PyUnicode_FindMaxChar(PyObject * unicode,Py_ssize_t start,Py_ssize_t end)2492 _PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2493 {
2494 enum PyUnicode_Kind kind;
2495 const void *startptr, *endptr;
2496
2497 assert(PyUnicode_IS_READY(unicode));
2498 assert(0 <= start);
2499 assert(end <= PyUnicode_GET_LENGTH(unicode));
2500 assert(start <= end);
2501
2502 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2503 return PyUnicode_MAX_CHAR_VALUE(unicode);
2504
2505 if (start == end)
2506 return 127;
2507
2508 if (PyUnicode_IS_ASCII(unicode))
2509 return 127;
2510
2511 kind = PyUnicode_KIND(unicode);
2512 startptr = PyUnicode_DATA(unicode);
2513 endptr = (char *)startptr + end * kind;
2514 startptr = (char *)startptr + start * kind;
2515 switch(kind) {
2516 case PyUnicode_1BYTE_KIND:
2517 return ucs1lib_find_max_char(startptr, endptr);
2518 case PyUnicode_2BYTE_KIND:
2519 return ucs2lib_find_max_char(startptr, endptr);
2520 case PyUnicode_4BYTE_KIND:
2521 return ucs4lib_find_max_char(startptr, endptr);
2522 default:
2523 Py_UNREACHABLE();
2524 }
2525 }
2526
2527 /* Ensure that a string uses the most efficient storage, if it is not the
2528 case: create a new string with of the right kind. Write NULL into *p_unicode
2529 on error. */
2530 static void
unicode_adjust_maxchar(PyObject ** p_unicode)2531 unicode_adjust_maxchar(PyObject **p_unicode)
2532 {
2533 PyObject *unicode, *copy;
2534 Py_UCS4 max_char;
2535 Py_ssize_t len;
2536 unsigned int kind;
2537
2538 assert(p_unicode != NULL);
2539 unicode = *p_unicode;
2540 assert(PyUnicode_IS_READY(unicode));
2541 if (PyUnicode_IS_ASCII(unicode))
2542 return;
2543
2544 len = PyUnicode_GET_LENGTH(unicode);
2545 kind = PyUnicode_KIND(unicode);
2546 if (kind == PyUnicode_1BYTE_KIND) {
2547 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2548 max_char = ucs1lib_find_max_char(u, u + len);
2549 if (max_char >= 128)
2550 return;
2551 }
2552 else if (kind == PyUnicode_2BYTE_KIND) {
2553 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2554 max_char = ucs2lib_find_max_char(u, u + len);
2555 if (max_char >= 256)
2556 return;
2557 }
2558 else if (kind == PyUnicode_4BYTE_KIND) {
2559 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2560 max_char = ucs4lib_find_max_char(u, u + len);
2561 if (max_char >= 0x10000)
2562 return;
2563 }
2564 else
2565 Py_UNREACHABLE();
2566
2567 copy = PyUnicode_New(len, max_char);
2568 if (copy != NULL)
2569 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2570 Py_DECREF(unicode);
2571 *p_unicode = copy;
2572 }
2573
2574 PyObject*
_PyUnicode_Copy(PyObject * unicode)2575 _PyUnicode_Copy(PyObject *unicode)
2576 {
2577 Py_ssize_t length;
2578 PyObject *copy;
2579
2580 if (!PyUnicode_Check(unicode)) {
2581 PyErr_BadInternalCall();
2582 return NULL;
2583 }
2584 if (PyUnicode_READY(unicode) == -1)
2585 return NULL;
2586
2587 length = PyUnicode_GET_LENGTH(unicode);
2588 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2589 if (!copy)
2590 return NULL;
2591 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2592
2593 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2594 length * PyUnicode_KIND(unicode));
2595 assert(_PyUnicode_CheckConsistency(copy, 1));
2596 return copy;
2597 }
2598
2599
2600 /* Widen Unicode objects to larger buffers. Don't write terminating null
2601 character. Return NULL on error. */
2602
2603 static void*
unicode_askind(unsigned int skind,void const * data,Py_ssize_t len,unsigned int kind)2604 unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
2605 {
2606 void *result;
2607
2608 assert(skind < kind);
2609 switch (kind) {
2610 case PyUnicode_2BYTE_KIND:
2611 result = PyMem_New(Py_UCS2, len);
2612 if (!result)
2613 return PyErr_NoMemory();
2614 assert(skind == PyUnicode_1BYTE_KIND);
2615 _PyUnicode_CONVERT_BYTES(
2616 Py_UCS1, Py_UCS2,
2617 (const Py_UCS1 *)data,
2618 ((const Py_UCS1 *)data) + len,
2619 result);
2620 return result;
2621 case PyUnicode_4BYTE_KIND:
2622 result = PyMem_New(Py_UCS4, len);
2623 if (!result)
2624 return PyErr_NoMemory();
2625 if (skind == PyUnicode_2BYTE_KIND) {
2626 _PyUnicode_CONVERT_BYTES(
2627 Py_UCS2, Py_UCS4,
2628 (const Py_UCS2 *)data,
2629 ((const Py_UCS2 *)data) + len,
2630 result);
2631 }
2632 else {
2633 assert(skind == PyUnicode_1BYTE_KIND);
2634 _PyUnicode_CONVERT_BYTES(
2635 Py_UCS1, Py_UCS4,
2636 (const Py_UCS1 *)data,
2637 ((const Py_UCS1 *)data) + len,
2638 result);
2639 }
2640 return result;
2641 default:
2642 Py_UNREACHABLE();
2643 return NULL;
2644 }
2645 }
2646
2647 static Py_UCS4*
as_ucs4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2648 as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2649 int copy_null)
2650 {
2651 int kind;
2652 const void *data;
2653 Py_ssize_t len, targetlen;
2654 if (PyUnicode_READY(string) == -1)
2655 return NULL;
2656 kind = PyUnicode_KIND(string);
2657 data = PyUnicode_DATA(string);
2658 len = PyUnicode_GET_LENGTH(string);
2659 targetlen = len;
2660 if (copy_null)
2661 targetlen++;
2662 if (!target) {
2663 target = PyMem_New(Py_UCS4, targetlen);
2664 if (!target) {
2665 PyErr_NoMemory();
2666 return NULL;
2667 }
2668 }
2669 else {
2670 if (targetsize < targetlen) {
2671 PyErr_Format(PyExc_SystemError,
2672 "string is longer than the buffer");
2673 if (copy_null && 0 < targetsize)
2674 target[0] = 0;
2675 return NULL;
2676 }
2677 }
2678 if (kind == PyUnicode_1BYTE_KIND) {
2679 const Py_UCS1 *start = (const Py_UCS1 *) data;
2680 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2681 }
2682 else if (kind == PyUnicode_2BYTE_KIND) {
2683 const Py_UCS2 *start = (const Py_UCS2 *) data;
2684 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2685 }
2686 else if (kind == PyUnicode_4BYTE_KIND) {
2687 memcpy(target, data, len * sizeof(Py_UCS4));
2688 }
2689 else {
2690 Py_UNREACHABLE();
2691 }
2692 if (copy_null)
2693 target[len] = 0;
2694 return target;
2695 }
2696
2697 Py_UCS4*
PyUnicode_AsUCS4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2698 PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2699 int copy_null)
2700 {
2701 if (target == NULL || targetsize < 0) {
2702 PyErr_BadInternalCall();
2703 return NULL;
2704 }
2705 return as_ucs4(string, target, targetsize, copy_null);
2706 }
2707
2708 Py_UCS4*
PyUnicode_AsUCS4Copy(PyObject * string)2709 PyUnicode_AsUCS4Copy(PyObject *string)
2710 {
2711 return as_ucs4(string, NULL, 0, 1);
2712 }
2713
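/* Maximum number of characters required for the output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256), and
   1 + (N*53-1)/22 is ceil(N*53/22) computed in integer arithmetic,
   which together with the sign gives the expression below. */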
2717 #define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2718
2719 static int
unicode_fromformat_write_str(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t width,Py_ssize_t precision)2720 unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2721 Py_ssize_t width, Py_ssize_t precision)
2722 {
2723 Py_ssize_t length, fill, arglen;
2724 Py_UCS4 maxchar;
2725
2726 if (PyUnicode_READY(str) == -1)
2727 return -1;
2728
2729 length = PyUnicode_GET_LENGTH(str);
2730 if ((precision == -1 || precision >= length)
2731 && width <= length)
2732 return _PyUnicodeWriter_WriteStr(writer, str);
2733
2734 if (precision != -1)
2735 length = Py_MIN(precision, length);
2736
2737 arglen = Py_MAX(length, width);
2738 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2739 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2740 else
2741 maxchar = writer->maxchar;
2742
2743 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2744 return -1;
2745
2746 if (width > length) {
2747 fill = width - length;
2748 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2749 return -1;
2750 writer->pos += fill;
2751 }
2752
2753 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2754 str, 0, length);
2755 writer->pos += length;
2756 return 0;
2757 }
2758
2759 static int
unicode_fromformat_write_cstr(_PyUnicodeWriter * writer,const char * str,Py_ssize_t width,Py_ssize_t precision)2760 unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2761 Py_ssize_t width, Py_ssize_t precision)
2762 {
2763 /* UTF-8 */
2764 Py_ssize_t length;
2765 PyObject *unicode;
2766 int res;
2767
2768 if (precision == -1) {
2769 length = strlen(str);
2770 }
2771 else {
2772 length = 0;
2773 while (length < precision && str[length]) {
2774 length++;
2775 }
2776 }
2777 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2778 if (unicode == NULL)
2779 return -1;
2780
2781 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2782 Py_DECREF(unicode);
2783 return res;
2784 }
2785
2786 static const char*
unicode_fromformat_arg(_PyUnicodeWriter * writer,const char * f,va_list * vargs)2787 unicode_fromformat_arg(_PyUnicodeWriter *writer,
2788 const char *f, va_list *vargs)
2789 {
2790 const char *p;
2791 Py_ssize_t len;
2792 int zeropad;
2793 Py_ssize_t width;
2794 Py_ssize_t precision;
2795 int longflag;
2796 int longlongflag;
2797 int size_tflag;
2798 Py_ssize_t fill;
2799
2800 p = f;
2801 f++;
2802 zeropad = 0;
2803 if (*f == '0') {
2804 zeropad = 1;
2805 f++;
2806 }
2807
2808 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2809 width = -1;
2810 if (Py_ISDIGIT((unsigned)*f)) {
2811 width = *f - '0';
2812 f++;
2813 while (Py_ISDIGIT((unsigned)*f)) {
2814 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2815 PyErr_SetString(PyExc_ValueError,
2816 "width too big");
2817 return NULL;
2818 }
2819 width = (width * 10) + (*f - '0');
2820 f++;
2821 }
2822 }
2823 precision = -1;
2824 if (*f == '.') {
2825 f++;
2826 if (Py_ISDIGIT((unsigned)*f)) {
2827 precision = (*f - '0');
2828 f++;
2829 while (Py_ISDIGIT((unsigned)*f)) {
2830 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2831 PyErr_SetString(PyExc_ValueError,
2832 "precision too big");
2833 return NULL;
2834 }
2835 precision = (precision * 10) + (*f - '0');
2836 f++;
2837 }
2838 }
2839 if (*f == '%') {
2840 /* "%.3%s" => f points to "3" */
2841 f--;
2842 }
2843 }
2844 if (*f == '\0') {
2845 /* bogus format "%.123" => go backward, f points to "3" */
2846 f--;
2847 }
2848
2849 /* Handle %ld, %lu, %lld and %llu. */
2850 longflag = 0;
2851 longlongflag = 0;
2852 size_tflag = 0;
2853 if (*f == 'l') {
2854 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
2855 longflag = 1;
2856 ++f;
2857 }
2858 else if (f[1] == 'l' &&
2859 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
2860 longlongflag = 1;
2861 f += 2;
2862 }
2863 }
2864 /* handle the size_t flag. */
2865 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
2866 size_tflag = 1;
2867 ++f;
2868 }
2869
2870 if (f[1] == '\0')
2871 writer->overallocate = 0;
2872
2873 switch (*f) {
2874 case 'c':
2875 {
2876 int ordinal = va_arg(*vargs, int);
2877 if (ordinal < 0 || ordinal > MAX_UNICODE) {
2878 PyErr_SetString(PyExc_OverflowError,
2879 "character argument not in range(0x110000)");
2880 return NULL;
2881 }
2882 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2883 return NULL;
2884 break;
2885 }
2886
2887 case 'i':
2888 case 'd':
2889 case 'u':
2890 case 'x':
2891 {
2892 /* used by sprintf */
2893 char buffer[MAX_LONG_LONG_CHARS];
2894 Py_ssize_t arglen;
2895
2896 if (*f == 'u') {
2897 if (longflag) {
2898 len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
2899 }
2900 else if (longlongflag) {
2901 len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
2902 }
2903 else if (size_tflag) {
2904 len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
2905 }
2906 else {
2907 len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
2908 }
2909 }
2910 else if (*f == 'x') {
2911 len = sprintf(buffer, "%x", va_arg(*vargs, int));
2912 }
2913 else {
2914 if (longflag) {
2915 len = sprintf(buffer, "%li", va_arg(*vargs, long));
2916 }
2917 else if (longlongflag) {
2918 len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
2919 }
2920 else if (size_tflag) {
2921 len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
2922 }
2923 else {
2924 len = sprintf(buffer, "%i", va_arg(*vargs, int));
2925 }
2926 }
2927 assert(len >= 0);
2928
2929 if (precision < len)
2930 precision = len;
2931
2932 arglen = Py_MAX(precision, width);
2933 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2934 return NULL;
2935
2936 if (width > precision) {
2937 Py_UCS4 fillchar;
2938 fill = width - precision;
2939 fillchar = zeropad?'0':' ';
2940 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2941 return NULL;
2942 writer->pos += fill;
2943 }
2944 if (precision > len) {
2945 fill = precision - len;
2946 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2947 return NULL;
2948 writer->pos += fill;
2949 }
2950
2951 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2952 return NULL;
2953 break;
2954 }
2955
2956 case 'p':
2957 {
2958 char number[MAX_LONG_LONG_CHARS];
2959
2960 len = sprintf(number, "%p", va_arg(*vargs, void*));
2961 assert(len >= 0);
2962
2963 /* %p is ill-defined: ensure leading 0x. */
2964 if (number[1] == 'X')
2965 number[1] = 'x';
2966 else if (number[1] != 'x') {
2967 memmove(number + 2, number,
2968 strlen(number) + 1);
2969 number[0] = '0';
2970 number[1] = 'x';
2971 len += 2;
2972 }
2973
2974 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
2975 return NULL;
2976 break;
2977 }
2978
2979 case 's':
2980 {
2981 /* UTF-8 */
2982 const char *s = va_arg(*vargs, const char*);
2983 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
2984 return NULL;
2985 break;
2986 }
2987
2988 case 'U':
2989 {
2990 PyObject *obj = va_arg(*vargs, PyObject *);
2991 assert(obj && _PyUnicode_CHECK(obj));
2992
2993 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2994 return NULL;
2995 break;
2996 }
2997
2998 case 'V':
2999 {
3000 PyObject *obj = va_arg(*vargs, PyObject *);
3001 const char *str = va_arg(*vargs, const char *);
3002 if (obj) {
3003 assert(_PyUnicode_CHECK(obj));
3004 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
3005 return NULL;
3006 }
3007 else {
3008 assert(str != NULL);
3009 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
3010 return NULL;
3011 }
3012 break;
3013 }
3014
3015 case 'S':
3016 {
3017 PyObject *obj = va_arg(*vargs, PyObject *);
3018 PyObject *str;
3019 assert(obj);
3020 str = PyObject_Str(obj);
3021 if (!str)
3022 return NULL;
3023 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
3024 Py_DECREF(str);
3025 return NULL;
3026 }
3027 Py_DECREF(str);
3028 break;
3029 }
3030
3031 case 'R':
3032 {
3033 PyObject *obj = va_arg(*vargs, PyObject *);
3034 PyObject *repr;
3035 assert(obj);
3036 repr = PyObject_Repr(obj);
3037 if (!repr)
3038 return NULL;
3039 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
3040 Py_DECREF(repr);
3041 return NULL;
3042 }
3043 Py_DECREF(repr);
3044 break;
3045 }
3046
3047 case 'A':
3048 {
3049 PyObject *obj = va_arg(*vargs, PyObject *);
3050 PyObject *ascii;
3051 assert(obj);
3052 ascii = PyObject_ASCII(obj);
3053 if (!ascii)
3054 return NULL;
3055 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
3056 Py_DECREF(ascii);
3057 return NULL;
3058 }
3059 Py_DECREF(ascii);
3060 break;
3061 }
3062
3063 case '%':
3064 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
3065 return NULL;
3066 break;
3067
3068 default:
3069 /* if we stumble upon an unknown formatting code, copy the rest
3070 of the format string to the output string. (we cannot just
3071 skip the code, since there's no way to know what's in the
3072 argument list) */
3073 len = strlen(p);
3074 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
3075 return NULL;
3076 f = p+len;
3077 return f;
3078 }
3079
3080 f++;
3081 return f;
3082 }
3083
3084 PyObject *
PyUnicode_FromFormatV(const char * format,va_list vargs)3085 PyUnicode_FromFormatV(const char *format, va_list vargs)
3086 {
3087 va_list vargs2;
3088 const char *f;
3089 _PyUnicodeWriter writer;
3090
3091 _PyUnicodeWriter_Init(&writer);
3092 writer.min_length = strlen(format) + 100;
3093 writer.overallocate = 1;
3094
3095 // Copy varags to be able to pass a reference to a subfunction.
3096 va_copy(vargs2, vargs);
3097
3098 for (f = format; *f; ) {
3099 if (*f == '%') {
3100 f = unicode_fromformat_arg(&writer, f, &vargs2);
3101 if (f == NULL)
3102 goto fail;
3103 }
3104 else {
3105 const char *p;
3106 Py_ssize_t len;
3107
3108 p = f;
3109 do
3110 {
3111 if ((unsigned char)*p > 127) {
3112 PyErr_Format(PyExc_ValueError,
3113 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3114 "string, got a non-ASCII byte: 0x%02x",
3115 (unsigned char)*p);
3116 goto fail;
3117 }
3118 p++;
3119 }
3120 while (*p != '\0' && *p != '%');
3121 len = p - f;
3122
3123 if (*p == '\0')
3124 writer.overallocate = 0;
3125
3126 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
3127 goto fail;
3128
3129 f = p;
3130 }
3131 }
3132 va_end(vargs2);
3133 return _PyUnicodeWriter_Finish(&writer);
3134
3135 fail:
3136 va_end(vargs2);
3137 _PyUnicodeWriter_Dealloc(&writer);
3138 return NULL;
3139 }
3140
3141 PyObject *
PyUnicode_FromFormat(const char * format,...)3142 PyUnicode_FromFormat(const char *format, ...)
3143 {
3144 PyObject* ret;
3145 va_list vargs;
3146
3147 #ifdef HAVE_STDARG_PROTOTYPES
3148 va_start(vargs, format);
3149 #else
3150 va_start(vargs);
3151 #endif
3152 ret = PyUnicode_FromFormatV(format, vargs);
3153 va_end(vargs);
3154 return ret;
3155 }
3156
3157 static Py_ssize_t
unicode_get_widechar_size(PyObject * unicode)3158 unicode_get_widechar_size(PyObject *unicode)
3159 {
3160 Py_ssize_t res;
3161
3162 assert(unicode != NULL);
3163 assert(_PyUnicode_CHECK(unicode));
3164
3165 #if USE_UNICODE_WCHAR_CACHE
3166 if (_PyUnicode_WSTR(unicode) != NULL) {
3167 return PyUnicode_WSTR_LENGTH(unicode);
3168 }
3169 #endif /* USE_UNICODE_WCHAR_CACHE */
3170 assert(PyUnicode_IS_READY(unicode));
3171
3172 res = _PyUnicode_LENGTH(unicode);
3173 #if SIZEOF_WCHAR_T == 2
3174 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3175 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3176 const Py_UCS4 *end = s + res;
3177 for (; s < end; ++s) {
3178 if (*s > 0xFFFF) {
3179 ++res;
3180 }
3181 }
3182 }
3183 #endif
3184 return res;
3185 }
3186
3187 static void
unicode_copy_as_widechar(PyObject * unicode,wchar_t * w,Py_ssize_t size)3188 unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3189 {
3190 assert(unicode != NULL);
3191 assert(_PyUnicode_CHECK(unicode));
3192
3193 #if USE_UNICODE_WCHAR_CACHE
3194 const wchar_t *wstr = _PyUnicode_WSTR(unicode);
3195 if (wstr != NULL) {
3196 memcpy(w, wstr, size * sizeof(wchar_t));
3197 return;
3198 }
3199 #else /* USE_UNICODE_WCHAR_CACHE */
3200 if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3201 memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3202 return;
3203 }
3204 #endif /* USE_UNICODE_WCHAR_CACHE */
3205 assert(PyUnicode_IS_READY(unicode));
3206
3207 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3208 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3209 for (; size--; ++s, ++w) {
3210 *w = *s;
3211 }
3212 }
3213 else {
3214 #if SIZEOF_WCHAR_T == 4
3215 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3216 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3217 for (; size--; ++s, ++w) {
3218 *w = *s;
3219 }
3220 #else
3221 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3222 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3223 for (; size--; ++s, ++w) {
3224 Py_UCS4 ch = *s;
3225 if (ch > 0xFFFF) {
3226 assert(ch <= MAX_UNICODE);
3227 /* encode surrogate pair in this case */
3228 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3229 if (!size--)
3230 break;
3231 *w = Py_UNICODE_LOW_SURROGATE(ch);
3232 }
3233 else {
3234 *w = ch;
3235 }
3236 }
3237 #endif
3238 }
3239 }
3240
3241 #ifdef HAVE_WCHAR_H
3242
3243 /* Convert a Unicode object to a wide character string.
3244
3245 - If w is NULL: return the number of wide characters (including the null
3246 character) required to convert the unicode object. Ignore size argument.
3247
3248 - Otherwise: return the number of wide characters (excluding the null
3249 character) written into w. Write at most size wide characters (including
3250 the null character). */
3251 Py_ssize_t
PyUnicode_AsWideChar(PyObject * unicode,wchar_t * w,Py_ssize_t size)3252 PyUnicode_AsWideChar(PyObject *unicode,
3253 wchar_t *w,
3254 Py_ssize_t size)
3255 {
3256 Py_ssize_t res;
3257
3258 if (unicode == NULL) {
3259 PyErr_BadInternalCall();
3260 return -1;
3261 }
3262 if (!PyUnicode_Check(unicode)) {
3263 PyErr_BadArgument();
3264 return -1;
3265 }
3266
3267 res = unicode_get_widechar_size(unicode);
3268 if (w == NULL) {
3269 return res + 1;
3270 }
3271
3272 if (size > res) {
3273 size = res + 1;
3274 }
3275 else {
3276 res = size;
3277 }
3278 unicode_copy_as_widechar(unicode, w, size);
3279
3280 #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3281 /* Oracle Solaris uses non-Unicode internal wchar_t form for
3282 non-Unicode locales and hence needs conversion first. */
3283 if (_Py_LocaleUsesNonUnicodeWchar()) {
3284 if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
3285 return -1;
3286 }
3287 }
3288 #endif
3289
3290 return res;
3291 }
3292
3293 wchar_t*
PyUnicode_AsWideCharString(PyObject * unicode,Py_ssize_t * size)3294 PyUnicode_AsWideCharString(PyObject *unicode,
3295 Py_ssize_t *size)
3296 {
3297 wchar_t *buffer;
3298 Py_ssize_t buflen;
3299
3300 if (unicode == NULL) {
3301 PyErr_BadInternalCall();
3302 return NULL;
3303 }
3304 if (!PyUnicode_Check(unicode)) {
3305 PyErr_BadArgument();
3306 return NULL;
3307 }
3308
3309 buflen = unicode_get_widechar_size(unicode);
3310 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
3311 if (buffer == NULL) {
3312 PyErr_NoMemory();
3313 return NULL;
3314 }
3315 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3316
3317 #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3318 /* Oracle Solaris uses non-Unicode internal wchar_t form for
3319 non-Unicode locales and hence needs conversion first. */
3320 if (_Py_LocaleUsesNonUnicodeWchar()) {
3321 if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
3322 return NULL;
3323 }
3324 }
3325 #endif
3326
3327 if (size != NULL) {
3328 *size = buflen;
3329 }
3330 else if (wcslen(buffer) != (size_t)buflen) {
3331 PyMem_Free(buffer);
3332 PyErr_SetString(PyExc_ValueError,
3333 "embedded null character");
3334 return NULL;
3335 }
3336 return buffer;
3337 }
3338
3339 #endif /* HAVE_WCHAR_H */
3340
3341 int
_PyUnicode_WideCharString_Converter(PyObject * obj,void * ptr)3342 _PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3343 {
3344 wchar_t **p = (wchar_t **)ptr;
3345 if (obj == NULL) {
3346 #if !USE_UNICODE_WCHAR_CACHE
3347 PyMem_Free(*p);
3348 #endif /* USE_UNICODE_WCHAR_CACHE */
3349 *p = NULL;
3350 return 1;
3351 }
3352 if (PyUnicode_Check(obj)) {
3353 #if USE_UNICODE_WCHAR_CACHE
3354 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3355 if (*p == NULL) {
3356 return 0;
3357 }
3358 return 1;
3359 #else /* USE_UNICODE_WCHAR_CACHE */
3360 *p = PyUnicode_AsWideCharString(obj, NULL);
3361 if (*p == NULL) {
3362 return 0;
3363 }
3364 return Py_CLEANUP_SUPPORTED;
3365 #endif /* USE_UNICODE_WCHAR_CACHE */
3366 }
3367 PyErr_Format(PyExc_TypeError,
3368 "argument must be str, not %.50s",
3369 Py_TYPE(obj)->tp_name);
3370 return 0;
3371 }
3372
3373 int
_PyUnicode_WideCharString_Opt_Converter(PyObject * obj,void * ptr)3374 _PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3375 {
3376 wchar_t **p = (wchar_t **)ptr;
3377 if (obj == NULL) {
3378 #if !USE_UNICODE_WCHAR_CACHE
3379 PyMem_Free(*p);
3380 #endif /* USE_UNICODE_WCHAR_CACHE */
3381 *p = NULL;
3382 return 1;
3383 }
3384 if (obj == Py_None) {
3385 *p = NULL;
3386 return 1;
3387 }
3388 if (PyUnicode_Check(obj)) {
3389 #if USE_UNICODE_WCHAR_CACHE
3390 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3391 if (*p == NULL) {
3392 return 0;
3393 }
3394 return 1;
3395 #else /* USE_UNICODE_WCHAR_CACHE */
3396 *p = PyUnicode_AsWideCharString(obj, NULL);
3397 if (*p == NULL) {
3398 return 0;
3399 }
3400 return Py_CLEANUP_SUPPORTED;
3401 #endif /* USE_UNICODE_WCHAR_CACHE */
3402 }
3403 PyErr_Format(PyExc_TypeError,
3404 "argument must be str or None, not %.50s",
3405 Py_TYPE(obj)->tp_name);
3406 return 0;
3407 }
3408
3409 PyObject *
PyUnicode_FromOrdinal(int ordinal)3410 PyUnicode_FromOrdinal(int ordinal)
3411 {
3412 if (ordinal < 0 || ordinal > MAX_UNICODE) {
3413 PyErr_SetString(PyExc_ValueError,
3414 "chr() arg not in range(0x110000)");
3415 return NULL;
3416 }
3417
3418 return unicode_char((Py_UCS4)ordinal);
3419 }
3420
3421 PyObject *
PyUnicode_FromObject(PyObject * obj)3422 PyUnicode_FromObject(PyObject *obj)
3423 {
3424 /* XXX Perhaps we should make this API an alias of
3425 PyObject_Str() instead ?! */
3426 if (PyUnicode_CheckExact(obj)) {
3427 if (PyUnicode_READY(obj) == -1)
3428 return NULL;
3429 Py_INCREF(obj);
3430 return obj;
3431 }
3432 if (PyUnicode_Check(obj)) {
3433 /* For a Unicode subtype that's not a Unicode object,
3434 return a true Unicode object with the same data. */
3435 return _PyUnicode_Copy(obj);
3436 }
3437 PyErr_Format(PyExc_TypeError,
3438 "Can't convert '%.100s' object to str implicitly",
3439 Py_TYPE(obj)->tp_name);
3440 return NULL;
3441 }
3442
3443 PyObject *
PyUnicode_FromEncodedObject(PyObject * obj,const char * encoding,const char * errors)3444 PyUnicode_FromEncodedObject(PyObject *obj,
3445 const char *encoding,
3446 const char *errors)
3447 {
3448 Py_buffer buffer;
3449 PyObject *v;
3450
3451 if (obj == NULL) {
3452 PyErr_BadInternalCall();
3453 return NULL;
3454 }
3455
3456 /* Decoding bytes objects is the most common case and should be fast */
3457 if (PyBytes_Check(obj)) {
3458 if (PyBytes_GET_SIZE(obj) == 0) {
3459 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3460 return NULL;
3461 }
3462 _Py_RETURN_UNICODE_EMPTY();
3463 }
3464 return PyUnicode_Decode(
3465 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3466 encoding, errors);
3467 }
3468
3469 if (PyUnicode_Check(obj)) {
3470 PyErr_SetString(PyExc_TypeError,
3471 "decoding str is not supported");
3472 return NULL;
3473 }
3474
3475 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3476 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3477 PyErr_Format(PyExc_TypeError,
3478 "decoding to str: need a bytes-like object, %.80s found",
3479 Py_TYPE(obj)->tp_name);
3480 return NULL;
3481 }
3482
3483 if (buffer.len == 0) {
3484 PyBuffer_Release(&buffer);
3485 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3486 return NULL;
3487 }
3488 _Py_RETURN_UNICODE_EMPTY();
3489 }
3490
3491 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
3492 PyBuffer_Release(&buffer);
3493 return v;
3494 }
3495
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   - encoding: null-terminated encoding name (must not be NULL).
   - lower: output buffer receiving the normalized, null-terminated name.
   - lower_len: size of `lower` in bytes, including room for the null byte.

   Alphanumeric characters and '.' are copied in lowercase; every run of other
   characters is collapsed into a single '_', except at the very start where
   it is dropped, and a trailing run is dropped as well.

   Return 1 on success, or 0 on error (encoding is longer than lower_len-1,
   which includes the case lower_len == 0). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    assert(encoding != NULL);

    if (lower_len == 0) {
        /* No room even for the null terminator; also avoids computing
           lower_len - 1 on an unsigned zero. */
        return 0;
    }

    char *out = lower;
    /* Last writable slot: reserved for the terminating null byte. */
    char *const out_last = &lower[lower_len - 1];
    int pending_sep = 0;

    for (const char *in = encoding; *in != '\0'; in++) {
        char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Punctuation: remember it, emit at most one '_' later. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == out_last) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_last) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }
    *out = '\0';
    return 1;
}
3544
3545 PyObject *
PyUnicode_Decode(const char * s,Py_ssize_t size,const char * encoding,const char * errors)3546 PyUnicode_Decode(const char *s,
3547 Py_ssize_t size,
3548 const char *encoding,
3549 const char *errors)
3550 {
3551 PyObject *buffer = NULL, *unicode;
3552 Py_buffer info;
3553 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3554
3555 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3556 return NULL;
3557 }
3558
3559 if (size == 0) {
3560 _Py_RETURN_UNICODE_EMPTY();
3561 }
3562
3563 if (encoding == NULL) {
3564 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3565 }
3566
3567 /* Shortcuts for common default encodings */
3568 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3569 char *lower = buflower;
3570
3571 /* Fast paths */
3572 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3573 lower += 3;
3574 if (*lower == '_') {
3575 /* Match "utf8" and "utf_8" */
3576 lower++;
3577 }
3578
3579 if (lower[0] == '8' && lower[1] == 0) {
3580 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3581 }
3582 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3583 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3584 }
3585 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3586 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3587 }
3588 }
3589 else {
3590 if (strcmp(lower, "ascii") == 0
3591 || strcmp(lower, "us_ascii") == 0) {
3592 return PyUnicode_DecodeASCII(s, size, errors);
3593 }
3594 #ifdef MS_WINDOWS
3595 else if (strcmp(lower, "mbcs") == 0) {
3596 return PyUnicode_DecodeMBCS(s, size, errors);
3597 }
3598 #endif
3599 else if (strcmp(lower, "latin1") == 0
3600 || strcmp(lower, "latin_1") == 0
3601 || strcmp(lower, "iso_8859_1") == 0
3602 || strcmp(lower, "iso8859_1") == 0) {
3603 return PyUnicode_DecodeLatin1(s, size, errors);
3604 }
3605 }
3606 }
3607
3608 /* Decode via the codec registry */
3609 buffer = NULL;
3610 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3611 goto onError;
3612 buffer = PyMemoryView_FromBuffer(&info);
3613 if (buffer == NULL)
3614 goto onError;
3615 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3616 if (unicode == NULL)
3617 goto onError;
3618 if (!PyUnicode_Check(unicode)) {
3619 PyErr_Format(PyExc_TypeError,
3620 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3621 "use codecs.decode() to decode to arbitrary types",
3622 encoding,
3623 Py_TYPE(unicode)->tp_name);
3624 Py_DECREF(unicode);
3625 goto onError;
3626 }
3627 Py_DECREF(buffer);
3628 return unicode_result(unicode);
3629
3630 onError:
3631 Py_XDECREF(buffer);
3632 return NULL;
3633 }
3634
3635 PyObject *
PyUnicode_AsDecodedObject(PyObject * unicode,const char * encoding,const char * errors)3636 PyUnicode_AsDecodedObject(PyObject *unicode,
3637 const char *encoding,
3638 const char *errors)
3639 {
3640 if (!PyUnicode_Check(unicode)) {
3641 PyErr_BadArgument();
3642 return NULL;
3643 }
3644
3645 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3646 "PyUnicode_AsDecodedObject() is deprecated; "
3647 "use PyCodec_Decode() to decode from str", 1) < 0)
3648 return NULL;
3649
3650 if (encoding == NULL)
3651 encoding = PyUnicode_GetDefaultEncoding();
3652
3653 /* Decode via the codec registry */
3654 return PyCodec_Decode(unicode, encoding, errors);
3655 }
3656
3657 PyObject *
PyUnicode_AsDecodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3658 PyUnicode_AsDecodedUnicode(PyObject *unicode,
3659 const char *encoding,
3660 const char *errors)
3661 {
3662 PyObject *v;
3663
3664 if (!PyUnicode_Check(unicode)) {
3665 PyErr_BadArgument();
3666 goto onError;
3667 }
3668
3669 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3670 "PyUnicode_AsDecodedUnicode() is deprecated; "
3671 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3672 return NULL;
3673
3674 if (encoding == NULL)
3675 encoding = PyUnicode_GetDefaultEncoding();
3676
3677 /* Decode via the codec registry */
3678 v = PyCodec_Decode(unicode, encoding, errors);
3679 if (v == NULL)
3680 goto onError;
3681 if (!PyUnicode_Check(v)) {
3682 PyErr_Format(PyExc_TypeError,
3683 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3684 "use codecs.decode() to decode to arbitrary types",
3685 encoding,
3686 Py_TYPE(unicode)->tp_name);
3687 Py_DECREF(v);
3688 goto onError;
3689 }
3690 return unicode_result(v);
3691
3692 onError:
3693 return NULL;
3694 }
3695
3696 PyObject *
PyUnicode_AsEncodedObject(PyObject * unicode,const char * encoding,const char * errors)3697 PyUnicode_AsEncodedObject(PyObject *unicode,
3698 const char *encoding,
3699 const char *errors)
3700 {
3701 PyObject *v;
3702
3703 if (!PyUnicode_Check(unicode)) {
3704 PyErr_BadArgument();
3705 goto onError;
3706 }
3707
3708 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3709 "PyUnicode_AsEncodedObject() is deprecated; "
3710 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3711 "or PyCodec_Encode() for generic encoding", 1) < 0)
3712 return NULL;
3713
3714 if (encoding == NULL)
3715 encoding = PyUnicode_GetDefaultEncoding();
3716
3717 /* Encode via the codec registry */
3718 v = PyCodec_Encode(unicode, encoding, errors);
3719 if (v == NULL)
3720 goto onError;
3721 return v;
3722
3723 onError:
3724 return NULL;
3725 }
3726
3727
3728 static PyObject *
unicode_encode_locale(PyObject * unicode,_Py_error_handler error_handler,int current_locale)3729 unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3730 int current_locale)
3731 {
3732 Py_ssize_t wlen;
3733 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3734 if (wstr == NULL) {
3735 return NULL;
3736 }
3737
3738 if ((size_t)wlen != wcslen(wstr)) {
3739 PyErr_SetString(PyExc_ValueError, "embedded null character");
3740 PyMem_Free(wstr);
3741 return NULL;
3742 }
3743
3744 char *str;
3745 size_t error_pos;
3746 const char *reason;
3747 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3748 current_locale, error_handler);
3749 PyMem_Free(wstr);
3750
3751 if (res != 0) {
3752 if (res == -2) {
3753 PyObject *exc;
3754 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3755 "locale", unicode,
3756 (Py_ssize_t)error_pos,
3757 (Py_ssize_t)(error_pos+1),
3758 reason);
3759 if (exc != NULL) {
3760 PyCodec_StrictErrors(exc);
3761 Py_DECREF(exc);
3762 }
3763 }
3764 else if (res == -3) {
3765 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3766 }
3767 else {
3768 PyErr_NoMemory();
3769 }
3770 return NULL;
3771 }
3772
3773 PyObject *bytes = PyBytes_FromString(str);
3774 PyMem_RawFree(str);
3775 return bytes;
3776 }
3777
3778 PyObject *
PyUnicode_EncodeLocale(PyObject * unicode,const char * errors)3779 PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3780 {
3781 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3782 return unicode_encode_locale(unicode, error_handler, 1);
3783 }
3784
3785 PyObject *
PyUnicode_EncodeFSDefault(PyObject * unicode)3786 PyUnicode_EncodeFSDefault(PyObject *unicode)
3787 {
3788 PyInterpreterState *interp = _PyInterpreterState_GET();
3789 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3790 if (fs_codec->utf8) {
3791 return unicode_encode_utf8(unicode,
3792 fs_codec->error_handler,
3793 fs_codec->errors);
3794 }
3795 #ifndef _Py_FORCE_UTF8_FS_ENCODING
3796 else if (fs_codec->encoding) {
3797 return PyUnicode_AsEncodedString(unicode,
3798 fs_codec->encoding,
3799 fs_codec->errors);
3800 }
3801 #endif
3802 else {
3803 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3804 machinery is not ready and so cannot be used:
3805 use wcstombs() in this case. */
3806 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3807 const wchar_t *filesystem_errors = config->filesystem_errors;
3808 assert(filesystem_errors != NULL);
3809 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3810 assert(errors != _Py_ERROR_UNKNOWN);
3811 #ifdef _Py_FORCE_UTF8_FS_ENCODING
3812 return unicode_encode_utf8(unicode, errors, NULL);
3813 #else
3814 return unicode_encode_locale(unicode, errors, 0);
3815 #endif
3816 }
3817 }
3818
3819 PyObject *
PyUnicode_AsEncodedString(PyObject * unicode,const char * encoding,const char * errors)3820 PyUnicode_AsEncodedString(PyObject *unicode,
3821 const char *encoding,
3822 const char *errors)
3823 {
3824 PyObject *v;
3825 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
3826
3827 if (!PyUnicode_Check(unicode)) {
3828 PyErr_BadArgument();
3829 return NULL;
3830 }
3831
3832 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3833 return NULL;
3834 }
3835
3836 if (encoding == NULL) {
3837 return _PyUnicode_AsUTF8String(unicode, errors);
3838 }
3839
3840 /* Shortcuts for common default encodings */
3841 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3842 char *lower = buflower;
3843
3844 /* Fast paths */
3845 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3846 lower += 3;
3847 if (*lower == '_') {
3848 /* Match "utf8" and "utf_8" */
3849 lower++;
3850 }
3851
3852 if (lower[0] == '8' && lower[1] == 0) {
3853 return _PyUnicode_AsUTF8String(unicode, errors);
3854 }
3855 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3856 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3857 }
3858 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3859 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3860 }
3861 }
3862 else {
3863 if (strcmp(lower, "ascii") == 0
3864 || strcmp(lower, "us_ascii") == 0) {
3865 return _PyUnicode_AsASCIIString(unicode, errors);
3866 }
3867 #ifdef MS_WINDOWS
3868 else if (strcmp(lower, "mbcs") == 0) {
3869 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3870 }
3871 #endif
3872 else if (strcmp(lower, "latin1") == 0 ||
3873 strcmp(lower, "latin_1") == 0 ||
3874 strcmp(lower, "iso_8859_1") == 0 ||
3875 strcmp(lower, "iso8859_1") == 0) {
3876 return _PyUnicode_AsLatin1String(unicode, errors);
3877 }
3878 }
3879 }
3880
3881 /* Encode via the codec registry */
3882 v = _PyCodec_EncodeText(unicode, encoding, errors);
3883 if (v == NULL)
3884 return NULL;
3885
3886 /* The normal path */
3887 if (PyBytes_Check(v))
3888 return v;
3889
3890 /* If the codec returns a buffer, raise a warning and convert to bytes */
3891 if (PyByteArray_Check(v)) {
3892 int error;
3893 PyObject *b;
3894
3895 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3896 "encoder %s returned bytearray instead of bytes; "
3897 "use codecs.encode() to encode to arbitrary types",
3898 encoding);
3899 if (error) {
3900 Py_DECREF(v);
3901 return NULL;
3902 }
3903
3904 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3905 PyByteArray_GET_SIZE(v));
3906 Py_DECREF(v);
3907 return b;
3908 }
3909
3910 PyErr_Format(PyExc_TypeError,
3911 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3912 "use codecs.encode() to encode to arbitrary types",
3913 encoding,
3914 Py_TYPE(v)->tp_name);
3915 Py_DECREF(v);
3916 return NULL;
3917 }
3918
3919 PyObject *
PyUnicode_AsEncodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3920 PyUnicode_AsEncodedUnicode(PyObject *unicode,
3921 const char *encoding,
3922 const char *errors)
3923 {
3924 PyObject *v;
3925
3926 if (!PyUnicode_Check(unicode)) {
3927 PyErr_BadArgument();
3928 goto onError;
3929 }
3930
3931 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3932 "PyUnicode_AsEncodedUnicode() is deprecated; "
3933 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3934 return NULL;
3935
3936 if (encoding == NULL)
3937 encoding = PyUnicode_GetDefaultEncoding();
3938
3939 /* Encode via the codec registry */
3940 v = PyCodec_Encode(unicode, encoding, errors);
3941 if (v == NULL)
3942 goto onError;
3943 if (!PyUnicode_Check(v)) {
3944 PyErr_Format(PyExc_TypeError,
3945 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3946 "use codecs.encode() to encode to arbitrary types",
3947 encoding,
3948 Py_TYPE(v)->tp_name);
3949 Py_DECREF(v);
3950 goto onError;
3951 }
3952 return v;
3953
3954 onError:
3955 return NULL;
3956 }
3957
3958 static PyObject*
unicode_decode_locale(const char * str,Py_ssize_t len,_Py_error_handler errors,int current_locale)3959 unicode_decode_locale(const char *str, Py_ssize_t len,
3960 _Py_error_handler errors, int current_locale)
3961 {
3962 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3963 PyErr_SetString(PyExc_ValueError, "embedded null byte");
3964 return NULL;
3965 }
3966
3967 wchar_t *wstr;
3968 size_t wlen;
3969 const char *reason;
3970 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3971 current_locale, errors);
3972 if (res != 0) {
3973 if (res == -2) {
3974 PyObject *exc;
3975 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3976 "locale", str, len,
3977 (Py_ssize_t)wlen,
3978 (Py_ssize_t)(wlen + 1),
3979 reason);
3980 if (exc != NULL) {
3981 PyCodec_StrictErrors(exc);
3982 Py_DECREF(exc);
3983 }
3984 }
3985 else if (res == -3) {
3986 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3987 }
3988 else {
3989 PyErr_NoMemory();
3990 }
3991 return NULL;
3992 }
3993
3994 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3995 PyMem_RawFree(wstr);
3996 return unicode;
3997 }
3998
3999 PyObject*
PyUnicode_DecodeLocaleAndSize(const char * str,Py_ssize_t len,const char * errors)4000 PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
4001 const char *errors)
4002 {
4003 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4004 return unicode_decode_locale(str, len, error_handler, 1);
4005 }
4006
4007 PyObject*
PyUnicode_DecodeLocale(const char * str,const char * errors)4008 PyUnicode_DecodeLocale(const char *str, const char *errors)
4009 {
4010 Py_ssize_t size = (Py_ssize_t)strlen(str);
4011 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4012 return unicode_decode_locale(str, size, error_handler, 1);
4013 }
4014
4015
4016 PyObject*
PyUnicode_DecodeFSDefault(const char * s)4017 PyUnicode_DecodeFSDefault(const char *s) {
4018 Py_ssize_t size = (Py_ssize_t)strlen(s);
4019 return PyUnicode_DecodeFSDefaultAndSize(s, size);
4020 }
4021
4022 PyObject*
PyUnicode_DecodeFSDefaultAndSize(const char * s,Py_ssize_t size)4023 PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
4024 {
4025 PyInterpreterState *interp = _PyInterpreterState_GET();
4026 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
4027 if (fs_codec->utf8) {
4028 return unicode_decode_utf8(s, size,
4029 fs_codec->error_handler,
4030 fs_codec->errors,
4031 NULL);
4032 }
4033 #ifndef _Py_FORCE_UTF8_FS_ENCODING
4034 else if (fs_codec->encoding) {
4035 return PyUnicode_Decode(s, size,
4036 fs_codec->encoding,
4037 fs_codec->errors);
4038 }
4039 #endif
4040 else {
4041 /* Before _PyUnicode_InitEncodings() is called, the Python codec
4042 machinery is not ready and so cannot be used:
4043 use mbstowcs() in this case. */
4044 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
4045 const wchar_t *filesystem_errors = config->filesystem_errors;
4046 assert(filesystem_errors != NULL);
4047 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
4048 assert(errors != _Py_ERROR_UNKNOWN);
4049 #ifdef _Py_FORCE_UTF8_FS_ENCODING
4050 return unicode_decode_utf8(s, size, errors, NULL, NULL);
4051 #else
4052 return unicode_decode_locale(s, size, errors, 0);
4053 #endif
4054 }
4055 }
4056
4057
4058 int
PyUnicode_FSConverter(PyObject * arg,void * addr)4059 PyUnicode_FSConverter(PyObject* arg, void* addr)
4060 {
4061 PyObject *path = NULL;
4062 PyObject *output = NULL;
4063 Py_ssize_t size;
4064 const char *data;
4065 if (arg == NULL) {
4066 Py_DECREF(*(PyObject**)addr);
4067 *(PyObject**)addr = NULL;
4068 return 1;
4069 }
4070 path = PyOS_FSPath(arg);
4071 if (path == NULL) {
4072 return 0;
4073 }
4074 if (PyBytes_Check(path)) {
4075 output = path;
4076 }
4077 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
4078 output = PyUnicode_EncodeFSDefault(path);
4079 Py_DECREF(path);
4080 if (!output) {
4081 return 0;
4082 }
4083 assert(PyBytes_Check(output));
4084 }
4085
4086 size = PyBytes_GET_SIZE(output);
4087 data = PyBytes_AS_STRING(output);
4088 if ((size_t)size != strlen(data)) {
4089 PyErr_SetString(PyExc_ValueError, "embedded null byte");
4090 Py_DECREF(output);
4091 return 0;
4092 }
4093 *(PyObject**)addr = output;
4094 return Py_CLEANUP_SUPPORTED;
4095 }
4096
4097
4098 int
PyUnicode_FSDecoder(PyObject * arg,void * addr)4099 PyUnicode_FSDecoder(PyObject* arg, void* addr)
4100 {
4101 int is_buffer = 0;
4102 PyObject *path = NULL;
4103 PyObject *output = NULL;
4104 if (arg == NULL) {
4105 Py_DECREF(*(PyObject**)addr);
4106 *(PyObject**)addr = NULL;
4107 return 1;
4108 }
4109
4110 is_buffer = PyObject_CheckBuffer(arg);
4111 if (!is_buffer) {
4112 path = PyOS_FSPath(arg);
4113 if (path == NULL) {
4114 return 0;
4115 }
4116 }
4117 else {
4118 path = arg;
4119 Py_INCREF(arg);
4120 }
4121
4122 if (PyUnicode_Check(path)) {
4123 output = path;
4124 }
4125 else if (PyBytes_Check(path) || is_buffer) {
4126 PyObject *path_bytes = NULL;
4127
4128 if (!PyBytes_Check(path) &&
4129 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
4130 "path should be string, bytes, or os.PathLike, not %.200s",
4131 Py_TYPE(arg)->tp_name)) {
4132 Py_DECREF(path);
4133 return 0;
4134 }
4135 path_bytes = PyBytes_FromObject(path);
4136 Py_DECREF(path);
4137 if (!path_bytes) {
4138 return 0;
4139 }
4140 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4141 PyBytes_GET_SIZE(path_bytes));
4142 Py_DECREF(path_bytes);
4143 if (!output) {
4144 return 0;
4145 }
4146 }
4147 else {
4148 PyErr_Format(PyExc_TypeError,
4149 "path should be string, bytes, or os.PathLike, not %.200s",
4150 Py_TYPE(arg)->tp_name);
4151 Py_DECREF(path);
4152 return 0;
4153 }
4154 if (PyUnicode_READY(output) == -1) {
4155 Py_DECREF(output);
4156 return 0;
4157 }
4158 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
4159 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
4160 PyErr_SetString(PyExc_ValueError, "embedded null character");
4161 Py_DECREF(output);
4162 return 0;
4163 }
4164 *(PyObject**)addr = output;
4165 return Py_CLEANUP_SUPPORTED;
4166 }
4167
4168
4169 static int unicode_fill_utf8(PyObject *unicode);
4170
4171 const char *
PyUnicode_AsUTF8AndSize(PyObject * unicode,Py_ssize_t * psize)4172 PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
4173 {
4174 if (!PyUnicode_Check(unicode)) {
4175 PyErr_BadArgument();
4176 return NULL;
4177 }
4178 if (PyUnicode_READY(unicode) == -1)
4179 return NULL;
4180
4181 if (PyUnicode_UTF8(unicode) == NULL) {
4182 if (unicode_fill_utf8(unicode) == -1) {
4183 return NULL;
4184 }
4185 }
4186
4187 if (psize)
4188 *psize = PyUnicode_UTF8_LENGTH(unicode);
4189 return PyUnicode_UTF8(unicode);
4190 }
4191
4192 const char *
PyUnicode_AsUTF8(PyObject * unicode)4193 PyUnicode_AsUTF8(PyObject *unicode)
4194 {
4195 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4196 }
4197
4198 Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject * unicode,Py_ssize_t * size)4199 PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4200 {
4201 if (!PyUnicode_Check(unicode)) {
4202 PyErr_BadArgument();
4203 return NULL;
4204 }
4205 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4206 if (w == NULL) {
4207 /* Non-ASCII compact unicode object */
4208 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
4209 assert(PyUnicode_IS_READY(unicode));
4210
4211 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4212 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4213 PyErr_NoMemory();
4214 return NULL;
4215 }
4216 w = (wchar_t *) PyObject_Malloc(sizeof(wchar_t) * (wlen + 1));
4217 if (w == NULL) {
4218 PyErr_NoMemory();
4219 return NULL;
4220 }
4221 unicode_copy_as_widechar(unicode, w, wlen + 1);
4222 _PyUnicode_WSTR(unicode) = w;
4223 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4224 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
4225 }
4226 }
4227 if (size != NULL)
4228 *size = PyUnicode_WSTR_LENGTH(unicode);
4229 return w;
4230 }
4231
4232 /* Deprecated APIs */
4233
4234 _Py_COMP_DIAG_PUSH
4235 _Py_COMP_DIAG_IGNORE_DEPR_DECLS
4236
4237 Py_UNICODE *
PyUnicode_AsUnicode(PyObject * unicode)4238 PyUnicode_AsUnicode(PyObject *unicode)
4239 {
4240 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
4241 }
4242
4243 const Py_UNICODE *
_PyUnicode_AsUnicode(PyObject * unicode)4244 _PyUnicode_AsUnicode(PyObject *unicode)
4245 {
4246 Py_ssize_t size;
4247 const Py_UNICODE *wstr;
4248
4249 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4250 if (wstr && wcslen(wstr) != (size_t)size) {
4251 PyErr_SetString(PyExc_ValueError, "embedded null character");
4252 return NULL;
4253 }
4254 return wstr;
4255 }
4256
4257
4258 Py_ssize_t
PyUnicode_GetSize(PyObject * unicode)4259 PyUnicode_GetSize(PyObject *unicode)
4260 {
4261 if (!PyUnicode_Check(unicode)) {
4262 PyErr_BadArgument();
4263 goto onError;
4264 }
4265 if (_PyUnicode_WSTR(unicode) == NULL) {
4266 if (PyUnicode_AsUnicode(unicode) == NULL)
4267 goto onError;
4268 }
4269 return PyUnicode_WSTR_LENGTH(unicode);
4270
4271 onError:
4272 return -1;
4273 }
4274
4275 _Py_COMP_DIAG_POP
4276
4277 Py_ssize_t
PyUnicode_GetLength(PyObject * unicode)4278 PyUnicode_GetLength(PyObject *unicode)
4279 {
4280 if (!PyUnicode_Check(unicode)) {
4281 PyErr_BadArgument();
4282 return -1;
4283 }
4284 if (PyUnicode_READY(unicode) == -1)
4285 return -1;
4286 return PyUnicode_GET_LENGTH(unicode);
4287 }
4288
4289 Py_UCS4
PyUnicode_ReadChar(PyObject * unicode,Py_ssize_t index)4290 PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4291 {
4292 const void *data;
4293 int kind;
4294
4295 if (!PyUnicode_Check(unicode)) {
4296 PyErr_BadArgument();
4297 return (Py_UCS4)-1;
4298 }
4299 if (PyUnicode_READY(unicode) == -1) {
4300 return (Py_UCS4)-1;
4301 }
4302 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4303 PyErr_SetString(PyExc_IndexError, "string index out of range");
4304 return (Py_UCS4)-1;
4305 }
4306 data = PyUnicode_DATA(unicode);
4307 kind = PyUnicode_KIND(unicode);
4308 return PyUnicode_READ(kind, data, index);
4309 }
4310
4311 int
PyUnicode_WriteChar(PyObject * unicode,Py_ssize_t index,Py_UCS4 ch)4312 PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4313 {
4314 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4315 PyErr_BadArgument();
4316 return -1;
4317 }
4318 assert(PyUnicode_IS_READY(unicode));
4319 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4320 PyErr_SetString(PyExc_IndexError, "string index out of range");
4321 return -1;
4322 }
4323 if (unicode_check_modifiable(unicode))
4324 return -1;
4325 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4326 PyErr_SetString(PyExc_ValueError, "character out of range");
4327 return -1;
4328 }
4329 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4330 index, ch);
4331 return 0;
4332 }
4333
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4339
4340 /* create or adjust a UnicodeDecodeError */
4341 static void
make_decode_exception(PyObject ** exceptionObject,const char * encoding,const char * input,Py_ssize_t length,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)4342 make_decode_exception(PyObject **exceptionObject,
4343 const char *encoding,
4344 const char *input, Py_ssize_t length,
4345 Py_ssize_t startpos, Py_ssize_t endpos,
4346 const char *reason)
4347 {
4348 if (*exceptionObject == NULL) {
4349 *exceptionObject = PyUnicodeDecodeError_Create(
4350 encoding, input, length, startpos, endpos, reason);
4351 }
4352 else {
4353 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4354 goto onError;
4355 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4356 goto onError;
4357 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4358 goto onError;
4359 }
4360 return;
4361
4362 onError:
4363 Py_CLEAR(*exceptionObject);
4364 }
4365
4366 #ifdef MS_WINDOWS
4367 static int
widechar_resize(wchar_t ** buf,Py_ssize_t * size,Py_ssize_t newsize)4368 widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4369 {
4370 if (newsize > *size) {
4371 wchar_t *newbuf = *buf;
4372 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4373 PyErr_NoMemory();
4374 return -1;
4375 }
4376 *buf = newbuf;
4377 }
4378 *size = newsize;
4379 return 0;
4380 }
4381
/* error handling callback helper (wchar_t output buffer variant):
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.

   errors, errorHandler: handler name and a cache for the callable; the
       callable is looked up with PyCodec_LookupError() when the cache is
       still NULL.
   encoding, reason: forwarded to the UnicodeDecodeError.
   input, inend: bounds of the input; both are reloaded from the exception's
       object after the callback ran.
   startinpos, endinpos: offsets of the offending bytes; on success
       *endinpos becomes the position returned by the handler.
   exceptionObject: cache for the UnicodeDecodeError (created or updated).
   inptr: on success, set to the input position to resume decoding at.
   buf, bufsize, outpos: output buffer, its size and the current write
       position, all counted in wchar_t items; the buffer is enlarged when
       needed and *outpos is advanced past the replacement.

   return 0 on success, -1 on error
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
{
    /* PyArg_ParseTuple() format.  The text after the "Un;" prefix is reused
       below (via &argparse[3]) as the TypeError message. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    Py_ssize_t repwlen;

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    /* make_decode_exception() signals failure by leaving the slot NULL */
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* a negative position counts from the end of the input */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    /* length of the replacement in wchar_t items */
#if USE_UNICODE_WCHAR_CACHE
_Py_COMP_DIAG_PUSH
_Py_COMP_DIAG_IGNORE_DEPR_DECLS
    repwlen = PyUnicode_GetSize(repunicode);
    if (repwlen < 0)
        goto onError;
_Py_COMP_DIAG_POP
#else /* USE_UNICODE_WCHAR_CACHE */
    repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
    if (repwlen < 0)
        goto onError;
    /* the value obtained with a NULL buffer is decremented by one before
       being used as the number of characters to copy */
    repwlen--;
#endif /* USE_UNICODE_WCHAR_CACHE */
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    /* each addition is checked against PY_SSIZE_T_MAX before it is made */
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    outsize = *bufsize;
    if (requiredsize > outsize) {
        /* grow to at least twice the current size when that cannot overflow */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (widechar_resize(buf, bufsize, requiredsize) < 0) {
            goto onError;
        }
    }
    PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");
    /* falls through into the common error exit */

  onError:
    Py_XDECREF(restuple);
    return -1;
}
4500 #endif /* MS_WINDOWS */
4501
4502 static int
unicode_decode_call_errorhandler_writer(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,const char ** input,const char ** inend,Py_ssize_t * startinpos,Py_ssize_t * endinpos,PyObject ** exceptionObject,const char ** inptr,_PyUnicodeWriter * writer)4503 unicode_decode_call_errorhandler_writer(
4504 const char *errors, PyObject **errorHandler,
4505 const char *encoding, const char *reason,
4506 const char **input, const char **inend, Py_ssize_t *startinpos,
4507 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4508 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4509 {
4510 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4511
4512 PyObject *restuple = NULL;
4513 PyObject *repunicode = NULL;
4514 Py_ssize_t insize;
4515 Py_ssize_t newpos;
4516 Py_ssize_t replen;
4517 Py_ssize_t remain;
4518 PyObject *inputobj = NULL;
4519 int need_to_grow = 0;
4520 const char *new_inptr;
4521
4522 if (*errorHandler == NULL) {
4523 *errorHandler = PyCodec_LookupError(errors);
4524 if (*errorHandler == NULL)
4525 goto onError;
4526 }
4527
4528 make_decode_exception(exceptionObject,
4529 encoding,
4530 *input, *inend - *input,
4531 *startinpos, *endinpos,
4532 reason);
4533 if (*exceptionObject == NULL)
4534 goto onError;
4535
4536 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4537 if (restuple == NULL)
4538 goto onError;
4539 if (!PyTuple_Check(restuple)) {
4540 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4541 goto onError;
4542 }
4543 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4544 goto onError;
4545
4546 /* Copy back the bytes variables, which might have been modified by the
4547 callback */
4548 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4549 if (!inputobj)
4550 goto onError;
4551 remain = *inend - *input - *endinpos;
4552 *input = PyBytes_AS_STRING(inputobj);
4553 insize = PyBytes_GET_SIZE(inputobj);
4554 *inend = *input + insize;
4555 /* we can DECREF safely, as the exception has another reference,
4556 so the object won't go away. */
4557 Py_DECREF(inputobj);
4558
4559 if (newpos<0)
4560 newpos = insize+newpos;
4561 if (newpos<0 || newpos>insize) {
4562 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4563 goto onError;
4564 }
4565
4566 replen = PyUnicode_GET_LENGTH(repunicode);
4567 if (replen > 1) {
4568 writer->min_length += replen - 1;
4569 need_to_grow = 1;
4570 }
4571 new_inptr = *input + newpos;
4572 if (*inend - new_inptr > remain) {
4573 /* We don't know the decoding algorithm here so we make the worst
4574 assumption that one byte decodes to one unicode character.
4575 If unfortunately one byte could decode to more unicode characters,
4576 the decoder may write out-of-bound then. Is it possible for the
4577 algorithms using this function? */
4578 writer->min_length += *inend - new_inptr - remain;
4579 need_to_grow = 1;
4580 }
4581 if (need_to_grow) {
4582 writer->overallocate = 1;
4583 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
4584 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4585 goto onError;
4586 }
4587 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4588 goto onError;
4589
4590 *endinpos = newpos;
4591 *inptr = new_inptr;
4592
4593 /* we made it! */
4594 Py_DECREF(restuple);
4595 return 0;
4596
4597 onError:
4598 Py_XDECREF(restuple);
4599 return -1;
4600 }
4601
4602 /* --- UTF-7 Codec -------------------------------------------------------- */
4603
4604 /* See RFC2152 for details. We encode conservatively and decode liberally. */
4605
/* Three simple macros defining base-64.

   All of them are function-like macros that expand their argument several
   times, so the argument must not have side effects. */

/* Is c a base-64 character?  True for A-Z, a-z, 0-9, '+' and '/'. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
   A-Z map to 0-25, a-z to 26-51, 0-9 to 52-61, '+' to 62; any other
   argument yields 63, so the result is only meaningful if IS_BASE64(c)
   holds. */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n?
   Higher bits of n are masked off. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string.  Values above 127 are never decoded directly. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
4636
4637 /* The UTF-7 encoder treats ASCII characters differently according to
4638 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
4639 * the above). See RFC2152. This array identifies these different
4640 * sets:
4641 * 0 : "Set D"
4642 * alphanumeric and '(),-./:?
4643 * 1 : "Set O"
4644 * !"#$%&*;<=>@[]^_`{|}
4645 * 2 : "whitespace"
4646 * ht nl cr sp
4647 * 3 : special (must be base64 encoded)
4648 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
4649 */
4650
/* Read-only lookup table, indexed by ASCII code (0-127), giving the UTF-7
   encoder category of each character: 0 = Set D, 1 = Set O, 2 = whitespace,
   3 = special (must be base64 encoded).  Declared const because the table
   is only ever read. */
static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4670
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c        - the character to test; only values 1..127 can be direct
 *              (NUL and everything >= 128 never are), which also keeps
 *              the utf7_category[] lookup in bounds.
 *   directO  - non-zero if "Set O" characters (category 1) are direct.
 *   directWS - non-zero if whitespace (category 2) is direct.
 *
 * "Set D" characters (category 0) are always direct.  The arguments are
 * expanded more than once, so they must not have side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
4682
4683 PyObject *
PyUnicode_DecodeUTF7(const char * s,Py_ssize_t size,const char * errors)4684 PyUnicode_DecodeUTF7(const char *s,
4685 Py_ssize_t size,
4686 const char *errors)
4687 {
4688 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4689 }
4690
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 *   s, size  - the UTF-7 input bytes.
 *   errors   - error handler name, passed to
 *              unicode_decode_call_errorhandler_writer().
 *   consumed - if non-NULL, receives the number of input bytes consumed;
 *              in that case an unfinished shift sequence at the end of the
 *              input is not an error but is left unconsumed.
 *
 * Returns the decoded string, or NULL on error. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;                    /* inside a "+..." base-64 section? */
    Py_ssize_t shiftOutStart;           /* writer.pos when the section began */
    unsigned int base64bits = 0;        /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0;     /* pending base-64 bits */
    Py_UCS4 surrogate = 0;              /* pending high surrogate, or 0 */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
      restart:
        /* also entered by goto from the end-of-string handling below */
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* unpaired: emit the lone high surrogate as is,
                               then handle outCh below */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* a pending high surrogate is only written out when the
                   terminating character decodes as itself */
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
            }
        }
        else if ( ch == '+' ) {
            /* remember where the sequence starts, for error reporting and
               for backing off in stateful mode */
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else if (s < e && !IS_BASE64(*s)) {
                s++;
                errmsg = "ill-formed sequence";
                goto utf7Error;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* common error exit of the loop body: startinpos and errmsg were
           set at the goto site; the handler may move s, starts and e */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* the handler may have moved s back before the end of the
               input: resume decoding inside the loop */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            /* report only the input before the unfinished shift sequence */
            *consumed = startinpos;
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                /* build the result from the output produced before the
                   shift sequence instead of finishing the writer */
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4894
4895
4896 PyObject *
_PyUnicode_EncodeUTF7(PyObject * str,int base64SetO,int base64WhiteSpace,const char * errors)4897 _PyUnicode_EncodeUTF7(PyObject *str,
4898 int base64SetO,
4899 int base64WhiteSpace,
4900 const char *errors)
4901 {
4902 int kind;
4903 const void *data;
4904 Py_ssize_t len;
4905 PyObject *v;
4906 int inShift = 0;
4907 Py_ssize_t i;
4908 unsigned int base64bits = 0;
4909 unsigned long base64buffer = 0;
4910 char * out;
4911 const char * start;
4912
4913 if (PyUnicode_READY(str) == -1)
4914 return NULL;
4915 kind = PyUnicode_KIND(str);
4916 data = PyUnicode_DATA(str);
4917 len = PyUnicode_GET_LENGTH(str);
4918
4919 if (len == 0)
4920 return PyBytes_FromStringAndSize(NULL, 0);
4921
4922 /* It might be possible to tighten this worst case */
4923 if (len > PY_SSIZE_T_MAX / 8)
4924 return PyErr_NoMemory();
4925 v = PyBytes_FromStringAndSize(NULL, len * 8);
4926 if (v == NULL)
4927 return NULL;
4928
4929 start = out = PyBytes_AS_STRING(v);
4930 for (i = 0; i < len; ++i) {
4931 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4932
4933 if (inShift) {
4934 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4935 /* shifting out */
4936 if (base64bits) { /* output remaining bits */
4937 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4938 base64buffer = 0;
4939 base64bits = 0;
4940 }
4941 inShift = 0;
4942 /* Characters not in the BASE64 set implicitly unshift the sequence
4943 so no '-' is required, except if the character is itself a '-' */
4944 if (IS_BASE64(ch) || ch == '-') {
4945 *out++ = '-';
4946 }
4947 *out++ = (char) ch;
4948 }
4949 else {
4950 goto encode_char;
4951 }
4952 }
4953 else { /* not in a shift sequence */
4954 if (ch == '+') {
4955 *out++ = '+';
4956 *out++ = '-';
4957 }
4958 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4959 *out++ = (char) ch;
4960 }
4961 else {
4962 *out++ = '+';
4963 inShift = 1;
4964 goto encode_char;
4965 }
4966 }
4967 continue;
4968 encode_char:
4969 if (ch >= 0x10000) {
4970 assert(ch <= MAX_UNICODE);
4971
4972 /* code first surrogate */
4973 base64bits += 16;
4974 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
4975 while (base64bits >= 6) {
4976 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4977 base64bits -= 6;
4978 }
4979 /* prepare second surrogate */
4980 ch = Py_UNICODE_LOW_SURROGATE(ch);
4981 }
4982 base64bits += 16;
4983 base64buffer = (base64buffer << 16) | ch;
4984 while (base64bits >= 6) {
4985 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4986 base64bits -= 6;
4987 }
4988 }
4989 if (base64bits)
4990 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4991 if (inShift)
4992 *out++ = '-';
4993 if (_PyBytes_Resize(&v, out - start) < 0)
4994 return NULL;
4995 return v;
4996 }
4997
4998 #undef IS_BASE64
4999 #undef FROM_BASE64
5000 #undef TO_BASE64
5001 #undef DECODE_DIRECT
5002 #undef ENCODE_DIRECT
5003
5004 /* --- UTF-8 Codec -------------------------------------------------------- */
5005
5006 PyObject *
PyUnicode_DecodeUTF8(const char * s,Py_ssize_t size,const char * errors)5007 PyUnicode_DecodeUTF8(const char *s,
5008 Py_ssize_t size,
5009 const char *errors)
5010 {
5011 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
5012 }
5013
5014 #include "stringlib/asciilib.h"
5015 #include "stringlib/codecs.h"
5016 #include "stringlib/undef.h"
5017
5018 #include "stringlib/ucs1lib.h"
5019 #include "stringlib/codecs.h"
5020 #include "stringlib/undef.h"
5021
5022 #include "stringlib/ucs2lib.h"
5023 #include "stringlib/codecs.h"
5024 #include "stringlib/undef.h"
5025
5026 #include "stringlib/ucs4lib.h"
5027 #include "stringlib/codecs.h"
5028 #include "stringlib/undef.h"
5029
5030 /* Mask to quickly check whether a C 'size_t' contains a
5031 non-ASCII, UTF8-encoded char. */
5032 #if (SIZEOF_SIZE_T == 8)
5033 # define ASCII_CHAR_MASK 0x8080808080808080ULL
5034 #elif (SIZEOF_SIZE_T == 4)
5035 # define ASCII_CHAR_MASK 0x80808080U
5036 #else
5037 # error C 'size_t' size should be either 4 or 8!
5038 #endif
5039
/* Copy the leading run of ASCII bytes (high bit clear) of [start, end)
   into dest and return its length.  Scanning stops at the first byte with
   bit 0x80 set, or at end.

   Where possible the input is examined one size_t word at a time: a word
   contains a non-ASCII byte iff (word & ASCII_CHAR_MASK) is non-zero. */
static Py_ssize_t
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
{
    const char *p = start;

#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
    /* dest is required to be size_t-aligned; if the input is aligned too,
       whole words can be copied directly from input to output */
    assert(_Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T));
    if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
        /* Fast path, see in STRINGLIB(utf8_decode) for
           an explanation. */
        /* Help allocation (local copies; presumably to help the compiler's
           register allocation) */
        const char *_p = p;
        Py_UCS1 * q = dest;
        /* copy word by word while a full word is left and it is all ASCII */
        while (_p + SIZEOF_SIZE_T <= end) {
            size_t value = *(const size_t *) _p;
            if (value & ASCII_CHAR_MASK)
                break;
            *((size_t *)q) = value;
            _p += SIZEOF_SIZE_T;
            q += SIZEOF_SIZE_T;
        }
        p = _p;
        /* finish byte by byte: the tail, or the word holding the first
           non-ASCII byte */
        while (p < end) {
            if ((unsigned char)*p & 0x80)
                break;
            *q++ = *p++;
        }
        return p - start;
    }
#endif
    /* General path: only scan here; the bytes are copied with a single
       memcpy() at the end */
    while (p < end) {
        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
           for an explanation. */
        if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
            /* Help allocation */
            const char *_p = p;
            while (_p + SIZEOF_SIZE_T <= end) {
                size_t value = *(const size_t *) _p;
                if (value & ASCII_CHAR_MASK)
                    break;
                _p += SIZEOF_SIZE_T;
            }
            p = _p;
            if (_p == end)
                break;
        }
        if ((unsigned char)*p & 0x80)
            break;
        ++p;
    }
    memcpy(dest, start, p - start);
    return p - start;
}
5093
5094 static PyObject *
unicode_decode_utf8(const char * s,Py_ssize_t size,_Py_error_handler error_handler,const char * errors,Py_ssize_t * consumed)5095 unicode_decode_utf8(const char *s, Py_ssize_t size,
5096 _Py_error_handler error_handler, const char *errors,
5097 Py_ssize_t *consumed)
5098 {
5099 if (size == 0) {
5100 if (consumed)
5101 *consumed = 0;
5102 _Py_RETURN_UNICODE_EMPTY();
5103 }
5104
5105 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5106 if (size == 1 && (unsigned char)s[0] < 128) {
5107 if (consumed) {
5108 *consumed = 1;
5109 }
5110 return get_latin1_char((unsigned char)s[0]);
5111 }
5112
5113 const char *starts = s;
5114 const char *end = s + size;
5115
5116 // fast path: try ASCII string.
5117 PyObject *u = PyUnicode_New(size, 127);
5118 if (u == NULL) {
5119 return NULL;
5120 }
5121 s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
5122 if (s == end) {
5123 return u;
5124 }
5125
5126 // Use _PyUnicodeWriter after fast path is failed.
5127 _PyUnicodeWriter writer;
5128 _PyUnicodeWriter_InitWithBuffer(&writer, u);
5129 writer.pos = s - starts;
5130
5131 Py_ssize_t startinpos, endinpos;
5132 const char *errmsg = "";
5133 PyObject *error_handler_obj = NULL;
5134 PyObject *exc = NULL;
5135
5136 while (s < end) {
5137 Py_UCS4 ch;
5138 int kind = writer.kind;
5139
5140 if (kind == PyUnicode_1BYTE_KIND) {
5141 if (PyUnicode_IS_ASCII(writer.buffer))
5142 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
5143 else
5144 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
5145 } else if (kind == PyUnicode_2BYTE_KIND) {
5146 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
5147 } else {
5148 assert(kind == PyUnicode_4BYTE_KIND);
5149 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
5150 }
5151
5152 switch (ch) {
5153 case 0:
5154 if (s == end || consumed)
5155 goto End;
5156 errmsg = "unexpected end of data";
5157 startinpos = s - starts;
5158 endinpos = end - starts;
5159 break;
5160 case 1:
5161 errmsg = "invalid start byte";
5162 startinpos = s - starts;
5163 endinpos = startinpos + 1;
5164 break;
5165 case 2:
5166 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5167 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5168 {
5169 /* Truncated surrogate code in range D800-DFFF */
5170 goto End;
5171 }
5172 /* fall through */
5173 case 3:
5174 case 4:
5175 errmsg = "invalid continuation byte";
5176 startinpos = s - starts;
5177 endinpos = startinpos + ch - 1;
5178 break;
5179 default:
5180 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5181 goto onError;
5182 continue;
5183 }
5184
5185 if (error_handler == _Py_ERROR_UNKNOWN)
5186 error_handler = _Py_GetErrorHandler(errors);
5187
5188 switch (error_handler) {
5189 case _Py_ERROR_IGNORE:
5190 s += (endinpos - startinpos);
5191 break;
5192
5193 case _Py_ERROR_REPLACE:
5194 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5195 goto onError;
5196 s += (endinpos - startinpos);
5197 break;
5198
5199 case _Py_ERROR_SURROGATEESCAPE:
5200 {
5201 Py_ssize_t i;
5202
5203 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5204 goto onError;
5205 for (i=startinpos; i<endinpos; i++) {
5206 ch = (Py_UCS4)(unsigned char)(starts[i]);
5207 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5208 ch + 0xdc00);
5209 writer.pos++;
5210 }
5211 s += (endinpos - startinpos);
5212 break;
5213 }
5214
5215 default:
5216 if (unicode_decode_call_errorhandler_writer(
5217 errors, &error_handler_obj,
5218 "utf-8", errmsg,
5219 &starts, &end, &startinpos, &endinpos, &exc, &s,
5220 &writer))
5221 goto onError;
5222 }
5223 }
5224
5225 End:
5226 if (consumed)
5227 *consumed = s - starts;
5228
5229 Py_XDECREF(error_handler_obj);
5230 Py_XDECREF(exc);
5231 return _PyUnicodeWriter_Finish(&writer);
5232
5233 onError:
5234 Py_XDECREF(error_handler_obj);
5235 Py_XDECREF(exc);
5236 _PyUnicodeWriter_Dealloc(&writer);
5237 return NULL;
5238 }
5239
5240
5241 PyObject *
PyUnicode_DecodeUTF8Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)5242 PyUnicode_DecodeUTF8Stateful(const char *s,
5243 Py_ssize_t size,
5244 const char *errors,
5245 Py_ssize_t *consumed)
5246 {
5247 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5248 }
5249
5250
5251 /* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5252 non-zero, use strict error handler otherwise.
5253
5254 On success, write a pointer to a newly allocated wide character string into
5255 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5256 (in number of wchar_t units) into *wlen (if wlen is set).
5257
5258 On memory allocation failure, return -1.
5259
5260 On decoding error (if surrogateescape is zero), return -2. If wlen is
5261 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5262 is not NULL, write the decoding error message into *reason. */
5263 int
_Py_DecodeUTF8Ex(const char * s,Py_ssize_t size,wchar_t ** wstr,size_t * wlen,const char ** reason,_Py_error_handler errors)5264 _Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
5265 const char **reason, _Py_error_handler errors)
5266 {
5267 const char *orig_s = s;
5268 const char *e;
5269 wchar_t *unicode;
5270 Py_ssize_t outpos;
5271
5272 int surrogateescape = 0;
5273 int surrogatepass = 0;
5274 switch (errors)
5275 {
5276 case _Py_ERROR_STRICT:
5277 break;
5278 case _Py_ERROR_SURROGATEESCAPE:
5279 surrogateescape = 1;
5280 break;
5281 case _Py_ERROR_SURROGATEPASS:
5282 surrogatepass = 1;
5283 break;
5284 default:
5285 return -3;
5286 }
5287
5288 /* Note: size will always be longer than the resulting Unicode
5289 character count */
5290 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1 < size) {
5291 return -1;
5292 }
5293
5294 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
5295 if (!unicode) {
5296 return -1;
5297 }
5298
5299 /* Unpack UTF-8 encoded data */
5300 e = s + size;
5301 outpos = 0;
5302 while (s < e) {
5303 Py_UCS4 ch;
5304 #if SIZEOF_WCHAR_T == 4
5305 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5306 #else
5307 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5308 #endif
5309 if (ch > 0xFF) {
5310 #if SIZEOF_WCHAR_T == 4
5311 Py_UNREACHABLE();
5312 #else
5313 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5314 /* write a surrogate pair */
5315 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5316 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5317 #endif
5318 }
5319 else {
5320 if (!ch && s == e) {
5321 break;
5322 }
5323
5324 if (surrogateescape) {
5325 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5326 }
5327 else {
5328 /* Is it a valid three-byte code? */
5329 if (surrogatepass
5330 && (e - s) >= 3
5331 && (s[0] & 0xf0) == 0xe0
5332 && (s[1] & 0xc0) == 0x80
5333 && (s[2] & 0xc0) == 0x80)
5334 {
5335 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5336 s += 3;
5337 unicode[outpos++] = ch;
5338 }
5339 else {
5340 PyMem_RawFree(unicode );
5341 if (reason != NULL) {
5342 switch (ch) {
5343 case 0:
5344 *reason = "unexpected end of data";
5345 break;
5346 case 1:
5347 *reason = "invalid start byte";
5348 break;
5349 /* 2, 3, 4 */
5350 default:
5351 *reason = "invalid continuation byte";
5352 break;
5353 }
5354 }
5355 if (wlen != NULL) {
5356 *wlen = s - orig_s;
5357 }
5358 return -2;
5359 }
5360 }
5361 }
5362 }
5363 unicode[outpos] = L'\0';
5364 if (wlen) {
5365 *wlen = outpos;
5366 }
5367 *wstr = unicode;
5368 return 0;
5369 }
5370
5371
5372 wchar_t*
_Py_DecodeUTF8_surrogateescape(const char * arg,Py_ssize_t arglen,size_t * wlen)5373 _Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5374 size_t *wlen)
5375 {
5376 wchar_t *wstr;
5377 int res = _Py_DecodeUTF8Ex(arg, arglen,
5378 &wstr, wlen,
5379 NULL, _Py_ERROR_SURROGATEESCAPE);
5380 if (res != 0) {
5381 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5382 assert(res != -3);
5383 if (wlen) {
5384 *wlen = (size_t)res;
5385 }
5386 return NULL;
5387 }
5388 return wstr;
5389 }
5390
5391
/* UTF-8 encoder for wchar_t strings.  `errors` selects the error handler
   and must be _Py_ERROR_STRICT, _Py_ERROR_SURROGATEESCAPE or
   _Py_ERROR_SURROGATEPASS; for any other value, return -3.

   On success, return 0 and write the newly allocated character string into
   *str.  Use PyMem_RawFree() to free the memory if raw_malloc is non-zero,
   PyMem_Free() otherwise.

   On encoding failure, return -2 and write the position of the invalid
   surrogate character into *error_pos (if error_pos is set) and the encoding
   error message into *reason (if reason is set).

   On memory allocation failure, return -1. */
5402 int
_Py_EncodeUTF8Ex(const wchar_t * text,char ** str,size_t * error_pos,const char ** reason,int raw_malloc,_Py_error_handler errors)5403 _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5404 const char **reason, int raw_malloc, _Py_error_handler errors)
5405 {
5406 const Py_ssize_t max_char_size = 4;
5407 Py_ssize_t len = wcslen(text);
5408
5409 assert(len >= 0);
5410
5411 int surrogateescape = 0;
5412 int surrogatepass = 0;
5413 switch (errors)
5414 {
5415 case _Py_ERROR_STRICT:
5416 break;
5417 case _Py_ERROR_SURROGATEESCAPE:
5418 surrogateescape = 1;
5419 break;
5420 case _Py_ERROR_SURROGATEPASS:
5421 surrogatepass = 1;
5422 break;
5423 default:
5424 return -3;
5425 }
5426
5427 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5428 return -1;
5429 }
5430 char *bytes;
5431 if (raw_malloc) {
5432 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5433 }
5434 else {
5435 bytes = PyMem_Malloc((len + 1) * max_char_size);
5436 }
5437 if (bytes == NULL) {
5438 return -1;
5439 }
5440
5441 char *p = bytes;
5442 Py_ssize_t i;
5443 for (i = 0; i < len; ) {
5444 Py_ssize_t ch_pos = i;
5445 Py_UCS4 ch = text[i];
5446 i++;
5447 #if Py_UNICODE_SIZE == 2
5448 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5449 && i < len
5450 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5451 {
5452 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5453 i++;
5454 }
5455 #endif
5456
5457 if (ch < 0x80) {
5458 /* Encode ASCII */
5459 *p++ = (char) ch;
5460
5461 }
5462 else if (ch < 0x0800) {
5463 /* Encode Latin-1 */
5464 *p++ = (char)(0xc0 | (ch >> 6));
5465 *p++ = (char)(0x80 | (ch & 0x3f));
5466 }
5467 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
5468 /* surrogateescape error handler */
5469 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5470 if (error_pos != NULL) {
5471 *error_pos = (size_t)ch_pos;
5472 }
5473 if (reason != NULL) {
5474 *reason = "encoding error";
5475 }
5476 if (raw_malloc) {
5477 PyMem_RawFree(bytes);
5478 }
5479 else {
5480 PyMem_Free(bytes);
5481 }
5482 return -2;
5483 }
5484 *p++ = (char)(ch & 0xff);
5485 }
5486 else if (ch < 0x10000) {
5487 *p++ = (char)(0xe0 | (ch >> 12));
5488 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5489 *p++ = (char)(0x80 | (ch & 0x3f));
5490 }
5491 else { /* ch >= 0x10000 */
5492 assert(ch <= MAX_UNICODE);
5493 /* Encode UCS4 Unicode ordinals */
5494 *p++ = (char)(0xf0 | (ch >> 18));
5495 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5496 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5497 *p++ = (char)(0x80 | (ch & 0x3f));
5498 }
5499 }
5500 *p++ = '\0';
5501
5502 size_t final_size = (p - bytes);
5503 char *bytes2;
5504 if (raw_malloc) {
5505 bytes2 = PyMem_RawRealloc(bytes, final_size);
5506 }
5507 else {
5508 bytes2 = PyMem_Realloc(bytes, final_size);
5509 }
5510 if (bytes2 == NULL) {
5511 if (error_pos != NULL) {
5512 *error_pos = (size_t)-1;
5513 }
5514 if (raw_malloc) {
5515 PyMem_RawFree(bytes);
5516 }
5517 else {
5518 PyMem_Free(bytes);
5519 }
5520 return -1;
5521 }
5522 *str = bytes2;
5523 return 0;
5524 }
5525
5526
5527 /* Primary internal function which creates utf8 encoded bytes objects.
5528
5529 Allocation strategy: if the string is short, convert into a stack buffer
5530 and allocate exactly as much space needed at the end. Else allocate the
5531 maximum possible needed (4 result bytes per Unicode character), and return
5532 the excess memory at the end.
5533 */
5534 static PyObject *
unicode_encode_utf8(PyObject * unicode,_Py_error_handler error_handler,const char * errors)5535 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5536 const char *errors)
5537 {
5538 if (!PyUnicode_Check(unicode)) {
5539 PyErr_BadArgument();
5540 return NULL;
5541 }
5542
5543 if (PyUnicode_READY(unicode) == -1)
5544 return NULL;
5545
5546 if (PyUnicode_UTF8(unicode))
5547 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5548 PyUnicode_UTF8_LENGTH(unicode));
5549
5550 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5551 const void *data = PyUnicode_DATA(unicode);
5552 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5553
5554 _PyBytesWriter writer;
5555 char *end;
5556
5557 switch (kind) {
5558 default:
5559 Py_UNREACHABLE();
5560 case PyUnicode_1BYTE_KIND:
5561 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5562 assert(!PyUnicode_IS_ASCII(unicode));
5563 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5564 break;
5565 case PyUnicode_2BYTE_KIND:
5566 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5567 break;
5568 case PyUnicode_4BYTE_KIND:
5569 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5570 break;
5571 }
5572
5573 if (end == NULL) {
5574 _PyBytesWriter_Dealloc(&writer);
5575 return NULL;
5576 }
5577 return _PyBytesWriter_Finish(&writer, end);
5578 }
5579
5580 static int
unicode_fill_utf8(PyObject * unicode)5581 unicode_fill_utf8(PyObject *unicode)
5582 {
5583 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5584 assert(!PyUnicode_IS_ASCII(unicode));
5585
5586 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5587 const void *data = PyUnicode_DATA(unicode);
5588 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5589
5590 _PyBytesWriter writer;
5591 char *end;
5592
5593 switch (kind) {
5594 default:
5595 Py_UNREACHABLE();
5596 case PyUnicode_1BYTE_KIND:
5597 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5598 _Py_ERROR_STRICT, NULL);
5599 break;
5600 case PyUnicode_2BYTE_KIND:
5601 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5602 _Py_ERROR_STRICT, NULL);
5603 break;
5604 case PyUnicode_4BYTE_KIND:
5605 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5606 _Py_ERROR_STRICT, NULL);
5607 break;
5608 }
5609 if (end == NULL) {
5610 _PyBytesWriter_Dealloc(&writer);
5611 return -1;
5612 }
5613
5614 const char *start = writer.use_small_buffer ? writer.small_buffer :
5615 PyBytes_AS_STRING(writer.buffer);
5616 Py_ssize_t len = end - start;
5617
5618 char *cache = PyObject_Malloc(len + 1);
5619 if (cache == NULL) {
5620 _PyBytesWriter_Dealloc(&writer);
5621 PyErr_NoMemory();
5622 return -1;
5623 }
5624 _PyUnicode_UTF8(unicode) = cache;
5625 _PyUnicode_UTF8_LENGTH(unicode) = len;
5626 memcpy(cache, start, len);
5627 cache[len] = '\0';
5628 _PyBytesWriter_Dealloc(&writer);
5629 return 0;
5630 }
5631
5632 PyObject *
_PyUnicode_AsUTF8String(PyObject * unicode,const char * errors)5633 _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5634 {
5635 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5636 }
5637
5638
5639 PyObject *
PyUnicode_AsUTF8String(PyObject * unicode)5640 PyUnicode_AsUTF8String(PyObject *unicode)
5641 {
5642 return _PyUnicode_AsUTF8String(unicode, NULL);
5643 }
5644
5645 /* --- UTF-32 Codec ------------------------------------------------------- */
5646
5647 PyObject *
PyUnicode_DecodeUTF32(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5648 PyUnicode_DecodeUTF32(const char *s,
5649 Py_ssize_t size,
5650 const char *errors,
5651 int *byteorder)
5652 {
5653 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5654 }
5655
/* Decode 'size' bytes starting at 's' as UTF-32 and return the resulting
   str object, or NULL on failure.

   errors:    error handler name, forwarded to
              unicode_decode_call_errorhandler_writer().
   byteorder: optional in/out parameter.  If non-NULL, *byteorder is the
              requested byte order: negative selects little-endian, positive
              selects big-endian, 0 means "look for a BOM, otherwise use the
              host's native order".  When the incoming value is 0 and at
              least 4 bytes are available, *byteorder is overwritten with
              the order found (-1, 1, or still 0 when there is no BOM).
   consumed:  optional out parameter.  If non-NULL, 1 to 3 trailing bytes
              are not reported as "truncated data"; decoding simply stops
              and *consumed receives the number of bytes processed. */
PyObject *
PyUnicode_DecodeUTF32Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const unsigned char *q, *e;     /* read cursor and end of input */
    int le, bo = 0;       /* assume native ordering by default */
    const char *encoding;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    q = (const unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0 && size >= 4) {
        /* The first four bytes are always assembled as little-endian here,
           so a big-endian BOM shows up byte-swapped as 0xFFFE0000. */
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
        if (bom == 0x0000FEFF) {
            bo = -1;
            q += 4;
        }
        else if (bom == 0xFFFE0000) {
            bo = 1;
            q += 4;
        }
        if (byteorder)
            *byteorder = bo;
    }

    /* Nothing left after the optional BOM: the result is the empty string. */
    if (q == e) {
        if (consumed)
            *consumed = size;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* bo == 0 at this point means "native": big-endian on a big-endian
       host, little-endian otherwise. */
#ifdef WORDS_BIGENDIAN
    le = bo < 0;
#else
    le = bo <= 0;
#endif
    encoding = le ? "utf-32-le" : "utf-32-be";

    /* One output character per 4 input bytes, rounding up. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = (e - q + 3) / 4;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    while (1) {
        /* ch stays 0 when fewer than 4 bytes remain, which routes the
           iteration to the "end of input / truncated data" branch below. */
        Py_UCS4 ch = 0;
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);

        if (e - q >= 4) {
            /* Fast path: write code points straight into the writer buffer
               for as long as they are <= maxch and (for non 1-byte buffers)
               are not surrogates.  On break, q still points at the code
               unit that stopped the loop and ch holds its value. */
            enum PyUnicode_Kind kind = writer.kind;
            void *data = writer.data;
            const unsigned char *last = e - 4;
            Py_ssize_t pos = writer.pos;
            if (le) {
                do {
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
                    if (ch > maxch)
                        break;
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            else {
                do {
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
                    if (ch > maxch)
                        break;
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            writer.pos = pos;
        }

        /* Classify why the fast path stopped (or never started). */
        if (Py_UNICODE_IS_SURROGATE(ch)) {
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }
        else if (ch <= maxch) {
            /* The last code unit read (if any) was written; we are at the
               end of the complete 4-byte units. */
            if (q == e || consumed)
                break;
            /* remaining bytes at the end? (size should be divisible by 4) */
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
        }
        else {
            /* ch > maxch: valid code points go through the general writer
               call (presumably widening the buffer as needed -- confirm in
               _PyUnicodeWriter_WriteCharInline), then the fast path is
               retried with the new maxch. */
            if (ch < 0x110000) {
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                    goto onError;
                q += 4;
                continue;
            }
            errmsg = "code point not in range(0x110000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }

        /* The remaining input chars are ignored if the callback
           chooses to skip the input */
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                encoding, errmsg,
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
                &writer))
            goto onError;
    }

    if (consumed)
        *consumed = (const char *)q-starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    /* Shared failure exit: drop the partial result and helper objects. */
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
5800
/* Encode the str object 'str' to a UTF-32 bytes object; return NULL on
   failure.

   errors:    error handler name used when the per-kind encoder stops before
              the end of the string (reported as "surrogates not allowed").
   byteorder: 0 writes a BOM (0xFEFF) first and then uses the host's native
              byte order; a negative value selects little-endian and a
              positive value big-endian, both without a BOM. */
PyObject *
_PyUnicode_EncodeUTF32(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    enum PyUnicode_Kind kind;
    const void *data;
    Py_ssize_t len;
    PyObject *v;            /* result bytes object */
    uint32_t *out;          /* write cursor inside v */
#if PY_LITTLE_ENDIAN
    int native_ordering = byteorder <= 0;
#else
    int native_ordering = byteorder >= 0;
#endif
    const char *encoding;
    Py_ssize_t nsize, pos;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;   /* replacement returned by the error handler */

    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (PyUnicode_READY(str) == -1)
        return NULL;
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);

    /* Initial size: 4 bytes per character plus 4 for the BOM when
       byteorder == 0; reject lengths whose byte count would overflow. */
    if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
        return PyErr_NoMemory();
    nsize = len + (byteorder == 0);
    v = PyBytes_FromStringAndSize(NULL, nsize * 4);
    if (v == NULL)
        return NULL;

    /* output buffer is 4-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
    out = (uint32_t *)PyBytes_AS_STRING(v);
    if (byteorder == 0)
        *out++ = 0xFEFF;
    if (len == 0)
        goto done;

    /* Codec name used in error reports. */
    if (byteorder == -1)
        encoding = "utf-32-le";
    else if (byteorder == 1)
        encoding = "utf-32-be";
    else
        encoding = "utf-32";

    /* 1-byte strings are encoded in one call, with no error handling. */
    if (kind == PyUnicode_1BYTE_KIND) {
        ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
        goto done;
    }

    pos = 0;
    while (pos < len) {
        Py_ssize_t newpos, repsize, moreunits;

        /* The encoders return how many characters they consumed; stopping
           short of len is treated as hitting an unencodable character
           (presumably a surrogate, per the error message below). */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len)
            break;

        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (!rep)
            goto error;

        /* Validate the replacement: bytes must be a whole number of 4-byte
           units, a str must be pure ASCII. */
        if (PyBytes_Check(rep)) {
            repsize = PyBytes_GET_SIZE(rep);
            if (repsize & 3) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
            moreunits = repsize / 4;
        }
        else {
            assert(PyUnicode_Check(rep));
            if (PyUnicode_READY(rep) < 0)
                goto error;
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
            if (!PyUnicode_IS_ASCII(rep)) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
        }
        /* Units needed by the replacement minus the units already reserved
           for the source characters the handler moved past. */
        moreunits += pos - newpos;
        pos = newpos;

        /* four bytes are reserved for each surrogate */
        if (moreunits > 0) {
            /* Remember the cursor as an offset: resizing may move v. */
            Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
            if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
                /* integer overflow */
                PyErr_NoMemory();
                goto error;
            }
            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * moreunits) < 0)
                goto error;
            out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
        }

        if (PyBytes_Check(rep)) {
            /* Replacement bytes are copied verbatim. */
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 4;
        } else /* rep is unicode */ {
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    /* Cut back to size actually needed. This is necessary for, for example,
       encoding of a string containing isolated surrogates and the 'ignore'
       handler is used. */
    nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
    /* NOTE(review): the resize result is not checked; this relies on
       _PyBytes_Resize() setting v to NULL on failure so that NULL is
       returned below -- confirm against its documentation. */
    if (nsize != PyBytes_GET_SIZE(v))
      _PyBytes_Resize(&v, nsize);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
  done:
    return v;
  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}
5949
5950 PyObject *
PyUnicode_AsUTF32String(PyObject * unicode)5951 PyUnicode_AsUTF32String(PyObject *unicode)
5952 {
5953 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5954 }
5955
5956 /* --- UTF-16 Codec ------------------------------------------------------- */
5957
5958 PyObject *
PyUnicode_DecodeUTF16(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5959 PyUnicode_DecodeUTF16(const char *s,
5960 Py_ssize_t size,
5961 const char *errors,
5962 int *byteorder)
5963 {
5964 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5965 }
5966
/* Decode 'size' bytes starting at 's' as UTF-16 and return the resulting
   str object, or NULL on failure.

   errors:    error handler name, forwarded to
              unicode_decode_call_errorhandler_writer().
   byteorder: optional in/out parameter.  If non-NULL, *byteorder is the
              requested byte order: negative selects little-endian, positive
              selects big-endian, 0 means "look for a BOM, otherwise use the
              host's native order".  When the incoming value is 0 and at
              least 2 bytes are available, *byteorder is overwritten with
              the order found (-1, 1, or still 0 when there is no BOM).
   consumed:  optional out parameter.  If non-NULL, an incomplete sequence
              at the end of the input is not an error; decoding stops in
              front of it and *consumed receives the number of bytes
              processed. */
PyObject *
PyUnicode_DecodeUTF16Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const unsigned char *q, *e;     /* read cursor and end of input */
    int bo = 0;       /* assume native ordering by default */
    int native_ordering;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    const char *encoding;

    q = (const unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0 && size >= 2) {
        /* The first two bytes are always assembled as little-endian here,
           so a big-endian BOM shows up byte-swapped as 0xFFFE. */
        const Py_UCS4 bom = (q[1] << 8) | q[0];
        if (bom == 0xFEFF) {
            q += 2;
            bo = -1;
        }
        else if (bom == 0xFFFE) {
            q += 2;
            bo = 1;
        }
        if (byteorder)
            *byteorder = bo;
    }

    /* Nothing left after the optional BOM: the result is the empty string. */
    if (q == e) {
        if (consumed)
            *consumed = size;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* bo == 0 at this point means "native byte order". */
#if PY_LITTLE_ENDIAN
    native_ordering = bo <= 0;
    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
#else
    native_ordering = bo >= 0;
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
#endif

    /* Note: size will always be longer than the resulting Unicode
       character count normally.  Error handler will take care of
       resizing when needed. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = (e - q + 1) / 2;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    while (1) {
        /* ch stays 0 when fewer than 2 bytes remain.  Otherwise the
           kind-specific decoder advances q and writer.pos and returns a
           status that the switch below interprets: 0..3 are conditions,
           anything else is a character to write through the writer. */
        Py_UCS4 ch = 0;
        if (e - q >= 2) {
            int kind = writer.kind;
            if (kind == PyUnicode_1BYTE_KIND) {
                if (PyUnicode_IS_ASCII(writer.buffer))
                    ch = asciilib_utf16_decode(&q, e,
                            (Py_UCS1*)writer.data, &writer.pos,
                            native_ordering);
                else
                    ch = ucs1lib_utf16_decode(&q, e,
                            (Py_UCS1*)writer.data, &writer.pos,
                            native_ordering);
            } else if (kind == PyUnicode_2BYTE_KIND) {
                ch = ucs2lib_utf16_decode(&q, e,
                        (Py_UCS2*)writer.data, &writer.pos,
                        native_ordering);
            } else {
                assert(kind == PyUnicode_4BYTE_KIND);
                ch = ucs4lib_utf16_decode(&q, e,
                        (Py_UCS4*)writer.data, &writer.pos,
                        native_ordering);
            }
        }

        switch (ch)
        {
        case 0:
            /* remaining byte at the end? (size should be even) */
            if (q == e || consumed)
                goto End;
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            break;
            /* The remaining input chars are ignored if the callback
               chooses to skip the input */
        case 1:
            /* Input ended in the middle of a sequence: step back 2 bytes so
               that q points at the start of the incomplete sequence. */
            q -= 2;
            if (consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            break;
        case 2:
            /* The offending 2-byte unit is the one just before q. */
            errmsg = "illegal encoding";
            startinpos = ((const char *)q) - 2 - starts;
            endinpos = startinpos + 2;
            break;
        case 3:
            /* The offending 2-byte unit starts 4 bytes before q. */
            errmsg = "illegal UTF-16 surrogate";
            startinpos = ((const char *)q) - 4 - starts;
            endinpos = startinpos + 2;
            break;
        default:
            /* A decoded character the helper returned instead of storing
               it itself; write it through the general writer call. */
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
            continue;
        }

        if (unicode_decode_call_errorhandler_writer(
                errors,
                &errorHandler,
                encoding, errmsg,
                &starts,
                (const char **)&e,
                &startinpos,
                &endinpos,
                &exc,
                (const char **)&q,
                &writer))
            goto onError;
    }

End:
    if (consumed)
        *consumed = (const char *)q-starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    /* Shared failure exit: drop the partial result and helper objects. */
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
6121
/* Encode the str object 'str' to a UTF-16 bytes object; return NULL on
   failure.

   errors:    error handler name used when the per-kind encoder stops before
              the end of the string (reported as "surrogates not allowed").
   byteorder: 0 writes a BOM (0xFEFF) first and then uses the host's native
              byte order; a negative value selects little-endian and a
              positive value big-endian, both without a BOM. */
PyObject *
_PyUnicode_EncodeUTF16(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    enum PyUnicode_Kind kind;
    const void *data;
    Py_ssize_t len;
    PyObject *v;            /* result bytes object */
    unsigned short *out;    /* write cursor inside v */
    Py_ssize_t pairs;       /* characters that need two 16-bit units */
#if PY_BIG_ENDIAN
    int native_ordering = byteorder >= 0;
#else
    int native_ordering = byteorder <= 0;
#endif
    const char *encoding;
    Py_ssize_t nsize, pos;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;   /* replacement returned by the error handler */

    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (PyUnicode_READY(str) == -1)
        return NULL;
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);

    /* Only 4-byte strings are scanned for code points >= 0x10000; each of
       them is counted as one extra output unit. */
    pairs = 0;
    if (kind == PyUnicode_4BYTE_KIND) {
        const Py_UCS4 *in = (const Py_UCS4 *)data;
        const Py_UCS4 *end = in + len;
        while (in < end) {
            if (*in++ >= 0x10000) {
                pairs++;
            }
        }
    }
    /* Initial size: 2 bytes per unit (characters + pairs + optional BOM);
       reject lengths whose byte count would overflow. */
    if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
        return PyErr_NoMemory();
    }
    nsize = len + pairs + (byteorder == 0);
    v = PyBytes_FromStringAndSize(NULL, nsize * 2);
    if (v == NULL) {
        return NULL;
    }

    /* output buffer is 2-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
    out = (unsigned short *)PyBytes_AS_STRING(v);
    if (byteorder == 0) {
        *out++ = 0xFEFF;
    }
    if (len == 0) {
        goto done;
    }

    /* 1-byte strings are encoded in one call, with no error handling. */
    if (kind == PyUnicode_1BYTE_KIND) {
        ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
        goto done;
    }

    /* Codec name used in error reports. */
    if (byteorder < 0) {
        encoding = "utf-16-le";
    }
    else if (byteorder > 0) {
        encoding = "utf-16-be";
    }
    else {
        encoding = "utf-16";
    }

    pos = 0;
    while (pos < len) {
        Py_ssize_t newpos, repsize, moreunits;

        /* The encoders return how many characters they consumed; stopping
           short of len is treated as hitting an unencodable character
           (presumably a surrogate, per the error message below). */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len)
            break;

        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (!rep)
            goto error;

        /* Validate the replacement: bytes must be a whole number of 2-byte
           units, a str must be pure ASCII. */
        if (PyBytes_Check(rep)) {
            repsize = PyBytes_GET_SIZE(rep);
            if (repsize & 1) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
            moreunits = repsize / 2;
        }
        else {
            assert(PyUnicode_Check(rep));
            if (PyUnicode_READY(rep) < 0)
                goto error;
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
            if (!PyUnicode_IS_ASCII(rep)) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
        }
        /* Units needed by the replacement minus the units already reserved
           for the source characters the handler moved past. */
        moreunits += pos - newpos;
        pos = newpos;

        /* two bytes are reserved for each surrogate */
        if (moreunits > 0) {
            /* Remember the cursor as an offset: resizing may move v. */
            Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
            if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
                /* integer overflow */
                PyErr_NoMemory();
                goto error;
            }
            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * moreunits) < 0)
                goto error;
            out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
        }

        if (PyBytes_Check(rep)) {
            /* Replacement bytes are copied verbatim. */
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 2;
        } else /* rep is unicode */ {
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    /* Cut back to size actually needed. This is necessary for, for example,
    encoding of a string containing isolated surrogates and the 'ignore' handler
    is used. */
    nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
    /* NOTE(review): the resize result is not checked; this relies on
       _PyBytes_Resize() setting v to NULL on failure so that NULL is
       returned below -- confirm against its documentation. */
    if (nsize != PyBytes_GET_SIZE(v))
      _PyBytes_Resize(&v, nsize);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
  done:
    return v;
  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
/* NOTE(review): no STORECHAR definition is visible in this function; this
   #undef looks like a leftover -- confirm before removing it. */
#undef STORECHAR
}
6289
6290 PyObject *
PyUnicode_AsUTF16String(PyObject * unicode)6291 PyUnicode_AsUTF16String(PyObject *unicode)
6292 {
6293 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
6294 }
6295
6296 /* --- Unicode Escape Codec ----------------------------------------------- */
6297
/* C API of the unicodedata module, used to resolve \N{name} escapes.
   Starts as NULL and is imported through PyCapsule_Import() the first time
   the unicode-escape decoder meets a \N escape. */
static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
6299
/* Decode 'size' bytes starting at 's' with the "unicodeescape" codec and
   return the resulting str object, or NULL on failure.

   errors:    error handler name, forwarded to
              unicode_decode_call_errorhandler_writer().
   consumed:  optional out parameter.  If non-NULL, an escape sequence that
              is cut off by the end of the input is not an error: decoding
              stops and *consumed receives the offset of its backslash.
              It is set to 0 for empty input.
              NOTE(review): when the loop finishes without an incomplete
              escape, *consumed is not written by this function -- callers
              presumably pre-initialize it; confirm.
   first_invalid_escape_char:
              out parameter (must not be NULL).  Set to -1, then to the
              value of the first invalid escape met: either the character
              following the backslash, or the value (> 0377) of an
              out-of-range octal escape.
   first_invalid_escape_ptr:
              out parameter (must not be NULL).  Set to NULL, then to the
              position of that first invalid escape inside the input, but
              only while 'starts' still equals the original input pointer
              (i.e. the error handler has not replaced the input buffer). */
PyObject *
_PyUnicode_DecodeUnicodeEscapeInternal2(const char *s,
                               Py_ssize_t size,
                               const char *errors,
                               Py_ssize_t *consumed,
                               int *first_invalid_escape_char,
                               const char **first_invalid_escape_ptr)
{
    const char *starts = s;
    const char *initial_starts = starts;
    _PyUnicodeWriter writer;
    const char *end;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    // so we can remember if we've seen an invalid escape char or not
    *first_invalid_escape_char = -1;
    *first_invalid_escape_ptr = NULL;

    if (size == 0) {
        if (consumed) {
            *consumed = 0;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }
    /* Escaped strings will always be longer than the resulting
       Unicode string, so we start with size here and then reduce the
       length after conversion to the true value.
       (but if the error callback returns a long replacement string
       we'll have to allocate more space) */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
        goto onError;
    }

    end = s + size;
    while (s < end) {
        unsigned char c = (unsigned char) *s++;
        Py_UCS4 ch;
        int count;
        const char *message;

/* Store an ASCII character directly; the buffer was pre-sized, so only
   assertions guard the write. */
#define WRITE_ASCII_CHAR(ch)                                                  \
            do {                                                              \
                assert(ch <= 127);                                            \
                assert(writer.pos < writer.size);                             \
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
            } while(0)

/* Store any character: directly when it fits the writer's current maxchar,
   otherwise through _PyUnicodeWriter_WriteCharInline() (jumping to onError
   on failure). */
#define WRITE_CHAR(ch)                                                        \
            do {                                                              \
                if (ch <= writer.maxchar) {                                   \
                    assert(writer.pos < writer.size);                         \
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
                }                                                             \
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
                    goto onError;                                             \
                }                                                             \
            } while(0)

        /* Non-escape characters are interpreted as Unicode ordinals */
        if (c != '\\') {
            WRITE_CHAR(c);
            continue;
        }

        /* Offset of the backslash, for error reporting and *consumed. */
        Py_ssize_t startinpos = s - starts - 1;
        /* \ - Escapes */
        if (s >= end) {
            message = "\\ at end of string";
            goto incomplete;
        }
        c = (unsigned char) *s++;

        assert(writer.pos < writer.size);
        switch (c) {

            /* single-character escapes; backslash-newline produces
               no output */
        case '\n': continue;
        case '\\': WRITE_ASCII_CHAR('\\'); continue;
        case '\'': WRITE_ASCII_CHAR('\''); continue;
        case '\"': WRITE_ASCII_CHAR('\"'); continue;
        case 'b': WRITE_ASCII_CHAR('\b'); continue;
        /* FF */
        case 'f': WRITE_ASCII_CHAR('\014'); continue;
        case 't': WRITE_ASCII_CHAR('\t'); continue;
        case 'n': WRITE_ASCII_CHAR('\n'); continue;
        case 'r': WRITE_ASCII_CHAR('\r'); continue;
        /* VT */
        case 'v': WRITE_ASCII_CHAR('\013'); continue;
        /* BEL, not classic C */
        case 'a': WRITE_ASCII_CHAR('\007'); continue;

            /* \OOO (octal) escapes: one to three octal digits */
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
            ch = c - '0';
            if (s < end && '0' <= *s && *s <= '7') {
                ch = (ch<<3) + *s++ - '0';
                if (s < end && '0' <= *s && *s <= '7') {
                    ch = (ch<<3) + *s++ - '0';
                }
            }
            /* Values above 0377 are still written, but recorded as the
               first invalid escape if none was seen before. */
            if (ch > 0377) {
                if (*first_invalid_escape_char == -1) {
                    *first_invalid_escape_char = ch;
                    if (starts == initial_starts) {
                        /* Back up 3 chars, since we've already incremented s. */
                        *first_invalid_escape_ptr = s - 3;
                    }
                }
            }
            WRITE_CHAR(ch);
            continue;

            /* hex escapes */
            /* \xXX */
        case 'x':
            count = 2;
            message = "truncated \\xXX escape";
            goto hexescape;

            /* \uXXXX */
        case 'u':
            count = 4;
            message = "truncated \\uXXXX escape";
            goto hexescape;

            /* \UXXXXXXXX */
        case 'U':
            count = 8;
            message = "truncated \\UXXXXXXXX escape";
        hexescape:
            /* Read exactly 'count' hex digits; running out of input is
               "incomplete", a non-hex character is an error. */
            for (ch = 0; count; ++s, --count) {
                if (s >= end) {
                    goto incomplete;
                }
                c = (unsigned char)*s;
                ch <<= 4;
                if (c >= '0' && c <= '9') {
                    ch += c - '0';
                }
                else if (c >= 'a' && c <= 'f') {
                    ch += c - ('a' - 10);
                }
                else if (c >= 'A' && c <= 'F') {
                    ch += c - ('A' - 10);
                }
                else {
                    goto error;
                }
            }

            /* when we get here, ch is a 32-bit unicode character */
            if (ch > MAX_UNICODE) {
                message = "illegal Unicode character";
                goto error;
            }

            WRITE_CHAR(ch);
            continue;

            /* \N{name} */
        case 'N':
            if (ucnhash_capi == NULL) {
                /* load the unicode data module */
                ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
                                                PyUnicodeData_CAPSULE_NAME, 1);
                if (ucnhash_capi == NULL) {
                    PyErr_SetString(
                        PyExc_UnicodeError,
                        "\\N escapes not supported (can't load unicodedata module)"
                        );
                    goto onError;
                }
            }

            message = "malformed \\N character escape";
            if (s >= end) {
                goto incomplete;
            }
            if (*s == '{') {
                const char *start = ++s;
                size_t namelen;
                /* look for the closing brace */
                while (s < end && *s != '}')
                    s++;
                if (s >= end) {
                    goto incomplete;
                }
                namelen = s - start;
                if (namelen) {
                    /* found a name.  look it up in the unicode database */
                    s++;
                    ch = 0xffffffff; /* in case 'getcode' messes up */
                    if (namelen <= INT_MAX &&
                        ucnhash_capi->getcode(start, (int)namelen,
                                              &ch, 0)) {
                        assert(ch <= MAX_UNICODE);
                        WRITE_CHAR(ch);
                        continue;
                    }
                    message = "unknown Unicode character name";
                }
            }
            /* Missing '{', empty name, or failed lookup. */
            goto error;

        default:
            /* Unknown escape: record the first occurrence and keep both
               the backslash and the character in the output. */
            if (*first_invalid_escape_char == -1) {
                *first_invalid_escape_char = c;
                if (starts == initial_starts) {
                    /* Back up one char, since we've already incremented s. */
                    *first_invalid_escape_ptr = s - 1;
                }
            }
            WRITE_ASCII_CHAR('\\');
            WRITE_CHAR(c);
            continue;
        }

      incomplete:
        /* The input ended inside an escape.  In stateful mode, report the
           offset of its backslash and stop; otherwise fall through and
           treat it as a decoding error. */
        if (consumed) {
            *consumed = startinpos;
            break;
        }
      error:;
        Py_ssize_t endinpos = s-starts;
        /* Remaining input plus what has been written so far. */
        writer.min_length = end - s + writer.pos;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "unicodeescape", message,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer)) {
            goto onError;
        }
        assert(end - s <= writer.size - writer.pos);

#undef WRITE_ASCII_CHAR
#undef WRITE_CHAR
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    /* Shared failure exit: drop the partial result and helper objects. */
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
6552
6553 // Export for binary compatibility.
6554 PyObject *
_PyUnicode_DecodeUnicodeEscapeInternal(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed,const char ** first_invalid_escape)6555 _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
6556 Py_ssize_t size,
6557 const char *errors,
6558 Py_ssize_t *consumed,
6559 const char **first_invalid_escape)
6560 {
6561 int first_invalid_escape_char;
6562 return _PyUnicode_DecodeUnicodeEscapeInternal2(
6563 s, size, errors, consumed,
6564 &first_invalid_escape_char,
6565 first_invalid_escape);
6566 }
6567
6568 PyObject *
_PyUnicode_DecodeUnicodeEscapeStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)6569 _PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
6570 Py_ssize_t size,
6571 const char *errors,
6572 Py_ssize_t *consumed)
6573 {
6574 int first_invalid_escape_char;
6575 const char *first_invalid_escape_ptr;
6576 PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal2(s, size, errors,
6577 consumed,
6578 &first_invalid_escape_char,
6579 &first_invalid_escape_ptr);
6580 if (result == NULL)
6581 return NULL;
6582 if (first_invalid_escape_char != -1) {
6583 if (first_invalid_escape_char > 0xff) {
6584 char buf[12] = "";
6585 snprintf(buf, sizeof buf, "%o", first_invalid_escape_char);
6586 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6587 "invalid octal escape sequence '\\%s'",
6588 buf) < 0)
6589 {
6590 Py_DECREF(result);
6591 return NULL;
6592 }
6593 }
6594 else {
6595 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6596 "invalid escape sequence '\\%c'",
6597 first_invalid_escape_char) < 0)
6598 {
6599 Py_DECREF(result);
6600 return NULL;
6601 }
6602 }
6603 }
6604 return result;
6605 }
6606
/* Decode the unicode-escape encoded buffer s of length size.

   Thin wrapper around _PyUnicode_DecodeUnicodeEscapeStateful() that passes
   NULL for the consumed output. */
PyObject *
PyUnicode_DecodeUnicodeEscape(const char *s,
                              Py_ssize_t size,
                              const char *errors)
{
    return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
}
6614
6615 /* Return a Unicode-Escape string version of the Unicode object. */
6616
6617 PyObject *
PyUnicode_AsUnicodeEscapeString(PyObject * unicode)6618 PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6619 {
6620 Py_ssize_t i, len;
6621 PyObject *repr;
6622 char *p;
6623 enum PyUnicode_Kind kind;
6624 const void *data;
6625 Py_ssize_t expandsize;
6626
6627 /* Initial allocation is based on the longest-possible character
6628 escape.
6629
6630 For UCS1 strings it's '\xxx', 4 bytes per source character.
6631 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6632 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
6633 */
6634
6635 if (!PyUnicode_Check(unicode)) {
6636 PyErr_BadArgument();
6637 return NULL;
6638 }
6639 if (PyUnicode_READY(unicode) == -1) {
6640 return NULL;
6641 }
6642
6643 len = PyUnicode_GET_LENGTH(unicode);
6644 if (len == 0) {
6645 return PyBytes_FromStringAndSize(NULL, 0);
6646 }
6647
6648 kind = PyUnicode_KIND(unicode);
6649 data = PyUnicode_DATA(unicode);
6650 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6651 bytes, and 1 byte characters 4. */
6652 expandsize = kind * 2 + 2;
6653 if (len > PY_SSIZE_T_MAX / expandsize) {
6654 return PyErr_NoMemory();
6655 }
6656 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6657 if (repr == NULL) {
6658 return NULL;
6659 }
6660
6661 p = PyBytes_AS_STRING(repr);
6662 for (i = 0; i < len; i++) {
6663 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6664
6665 /* U+0000-U+00ff range */
6666 if (ch < 0x100) {
6667 if (ch >= ' ' && ch < 127) {
6668 if (ch != '\\') {
6669 /* Copy printable US ASCII as-is */
6670 *p++ = (char) ch;
6671 }
6672 /* Escape backslashes */
6673 else {
6674 *p++ = '\\';
6675 *p++ = '\\';
6676 }
6677 }
6678
6679 /* Map special whitespace to '\t', \n', '\r' */
6680 else if (ch == '\t') {
6681 *p++ = '\\';
6682 *p++ = 't';
6683 }
6684 else if (ch == '\n') {
6685 *p++ = '\\';
6686 *p++ = 'n';
6687 }
6688 else if (ch == '\r') {
6689 *p++ = '\\';
6690 *p++ = 'r';
6691 }
6692
6693 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6694 else {
6695 *p++ = '\\';
6696 *p++ = 'x';
6697 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6698 *p++ = Py_hexdigits[ch & 0x000F];
6699 }
6700 }
6701 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6702 else if (ch < 0x10000) {
6703 *p++ = '\\';
6704 *p++ = 'u';
6705 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6706 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6707 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6708 *p++ = Py_hexdigits[ch & 0x000F];
6709 }
6710 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6711 else {
6712
6713 /* Make sure that the first two digits are zero */
6714 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6715 *p++ = '\\';
6716 *p++ = 'U';
6717 *p++ = '0';
6718 *p++ = '0';
6719 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6720 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6721 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6722 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6723 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6724 *p++ = Py_hexdigits[ch & 0x0000000F];
6725 }
6726 }
6727
6728 assert(p - PyBytes_AS_STRING(repr) > 0);
6729 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6730 return NULL;
6731 }
6732 return repr;
6733 }
6734
6735 /* --- Raw Unicode Escape Codec ------------------------------------------- */
6736
/* Decode a raw-unicode-escape encoded buffer.

   Every byte other than a backslash is written out as the code point with
   the same value.  A backslash followed by 'u' or 'U' introduces 4 or 8 hex
   digits giving the code point; a backslash followed by any other byte is
   written out as the backslash and that byte.

   s, size   input buffer and its length in bytes.
   errors    error handler name, passed on to
             unicode_decode_call_errorhandler_writer().
   consumed  may be NULL.  When non-NULL and the input ends inside an escape
             (a trailing backslash or too few hex digits), decoding stops
             and *consumed is set to the offset of that escape's backslash.
             This function writes *consumed only in that case and for
             size == 0; presumably the caller pre-initializes it for the
             fully-consumed case -- TODO confirm against callers.
             When NULL, a trailing lone backslash is written out literally
             and a truncated \u / \U escape goes to the error handler.

   Returns the result of _PyUnicodeWriter_Finish(), or NULL on failure. */
PyObject *
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
                                          Py_ssize_t size,
                                          const char *errors,
                                          Py_ssize_t *consumed)
{
    const char *starts = s;
    _PyUnicodeWriter writer;
    const char *end;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed) {
            *consumed = 0;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Escaped strings will always be longer than the resulting
       Unicode string, so we start with size here and then reduce the
       length after conversion to the true value. (But decoding error
       handler might have to resize the string) */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
        goto onError;
    }

    end = s + size;
    while (s < end) {
        unsigned char c = (unsigned char) *s++;
        Py_UCS4 ch;
        int count;
        const char *message;

        /* Write directly into the buffer when the character fits the
           writer's current maxchar, otherwise go through
           _PyUnicodeWriter_WriteCharInline(). */
#define WRITE_CHAR(ch)                                                        \
            do {                                                              \
                if (ch <= writer.maxchar) {                                   \
                    assert(writer.pos < writer.size);                         \
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
                }                                                             \
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
                    goto onError;                                             \
                }                                                             \
            } while(0)

        /* Non-escape characters are interpreted as Unicode ordinals.
           A backslash that is the very last byte is also written literally
           when the caller did not ask for a consumed count. */
        if (c != '\\' || (s >= end && !consumed)) {
            WRITE_CHAR(c);
            continue;
        }

        /* Offset of the backslash that starts this escape. */
        Py_ssize_t startinpos = s - starts - 1;
        /* \ - Escapes */
        if (s >= end) {
            assert(consumed);
            // Set message to silence a compiler warning.
            // It is never actually used on this path.
            message = "\\ at end of string";
            goto incomplete;
        }

        c = (unsigned char) *s++;
        if (c == 'u') {
            count = 4;
            message = "truncated \\uXXXX escape";
        }
        else if (c == 'U') {
            count = 8;
            message = "truncated \\UXXXXXXXX escape";
        }
        else {
            /* Not a \u or \U escape: emit the backslash and the byte. */
            assert(writer.pos < writer.size);
            PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
            WRITE_CHAR(c);
            continue;
        }

        /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
        for (ch = 0; count; ++s, --count) {
            if (s >= end) {
                goto incomplete;
            }
            c = (unsigned char)*s;
            ch <<= 4;
            if (c >= '0' && c <= '9') {
                ch += c - '0';
            }
            else if (c >= 'a' && c <= 'f') {
                ch += c - ('a' - 10);
            }
            else if (c >= 'A' && c <= 'F') {
                ch += c - ('A' - 10);
            }
            else {
                /* Non-hex byte: reported with the "truncated" message. */
                goto error;
            }
        }
        if (ch > MAX_UNICODE) {
            message = "\\Uxxxxxxxx out of range";
            goto error;
        }
        WRITE_CHAR(ch);
        continue;

      incomplete:
        /* Input ended inside an escape.  With a consumed pointer, report
           where the escape began and stop; otherwise fall through and
           treat it as an error. */
        if (consumed) {
            *consumed = startinpos;
            break;
        }
      error:;
        Py_ssize_t endinpos = s-starts;
        /* Remaining input plus what has been written so far. */
        writer.min_length = end - s + writer.pos;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "rawunicodeescape", message,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer)) {
            goto onError;
        }
        assert(end - s <= writer.size - writer.pos);

#undef WRITE_CHAR
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
6872
/* Decode the raw-unicode-escape encoded buffer s of length size.

   Thin wrapper around _PyUnicode_DecodeRawUnicodeEscapeStateful() that
   passes NULL for the consumed output. */
PyObject *
PyUnicode_DecodeRawUnicodeEscape(const char *s,
                                 Py_ssize_t size,
                                 const char *errors)
{
    return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
}
6880
6881
6882 PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject * unicode)6883 PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6884 {
6885 PyObject *repr;
6886 char *p;
6887 Py_ssize_t expandsize, pos;
6888 int kind;
6889 const void *data;
6890 Py_ssize_t len;
6891
6892 if (!PyUnicode_Check(unicode)) {
6893 PyErr_BadArgument();
6894 return NULL;
6895 }
6896 if (PyUnicode_READY(unicode) == -1) {
6897 return NULL;
6898 }
6899 kind = PyUnicode_KIND(unicode);
6900 data = PyUnicode_DATA(unicode);
6901 len = PyUnicode_GET_LENGTH(unicode);
6902 if (kind == PyUnicode_1BYTE_KIND) {
6903 return PyBytes_FromStringAndSize(data, len);
6904 }
6905
6906 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6907 bytes, and 1 byte characters 4. */
6908 expandsize = kind * 2 + 2;
6909
6910 if (len > PY_SSIZE_T_MAX / expandsize) {
6911 return PyErr_NoMemory();
6912 }
6913 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6914 if (repr == NULL) {
6915 return NULL;
6916 }
6917 if (len == 0) {
6918 return repr;
6919 }
6920
6921 p = PyBytes_AS_STRING(repr);
6922 for (pos = 0; pos < len; pos++) {
6923 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6924
6925 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6926 if (ch < 0x100) {
6927 *p++ = (char) ch;
6928 }
6929 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6930 else if (ch < 0x10000) {
6931 *p++ = '\\';
6932 *p++ = 'u';
6933 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6934 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6935 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6936 *p++ = Py_hexdigits[ch & 15];
6937 }
6938 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6939 else {
6940 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6941 *p++ = '\\';
6942 *p++ = 'U';
6943 *p++ = '0';
6944 *p++ = '0';
6945 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6946 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6947 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6948 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6949 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6950 *p++ = Py_hexdigits[ch & 15];
6951 }
6952 }
6953
6954 assert(p > PyBytes_AS_STRING(repr));
6955 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6956 return NULL;
6957 }
6958 return repr;
6959 }
6960
6961 /* --- Latin-1 Codec ------------------------------------------------------ */
6962
/* Decode size bytes of Latin-1 data at s.

   Each byte maps directly to the code point with the same value, so this
   simply builds a string from the UCS1 buffer.  The errors argument is not
   used by this function. */
PyObject *
PyUnicode_DecodeLatin1(const char *s,
                       Py_ssize_t size,
                       const char *errors)
{
    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
    return _PyUnicode_FromUCS1((const unsigned char*)s, size);
}
6971
6972 /* create or adjust a UnicodeEncodeError */
6973 static void
make_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6974 make_encode_exception(PyObject **exceptionObject,
6975 const char *encoding,
6976 PyObject *unicode,
6977 Py_ssize_t startpos, Py_ssize_t endpos,
6978 const char *reason)
6979 {
6980 if (*exceptionObject == NULL) {
6981 *exceptionObject = PyObject_CallFunction(
6982 PyExc_UnicodeEncodeError, "sOnns",
6983 encoding, unicode, startpos, endpos, reason);
6984 }
6985 else {
6986 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6987 goto onError;
6988 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6989 goto onError;
6990 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6991 goto onError;
6992 return;
6993 onError:
6994 Py_CLEAR(*exceptionObject);
6995 }
6996 }
6997
6998 /* raises a UnicodeEncodeError */
6999 static void
raise_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)7000 raise_encode_exception(PyObject **exceptionObject,
7001 const char *encoding,
7002 PyObject *unicode,
7003 Py_ssize_t startpos, Py_ssize_t endpos,
7004 const char *reason)
7005 {
7006 make_encode_exception(exceptionObject,
7007 encoding, unicode, startpos, endpos, reason);
7008 if (*exceptionObject != NULL)
7009 PyCodec_StrictErrors(*exceptionObject);
7010 }
7011
7012 /* error handling callback helper:
7013 build arguments, call the callback and check the arguments,
7014 put the result into newpos and return the replacement string, which
7015 has to be freed by the caller */
7016 static PyObject *
unicode_encode_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)7017 unicode_encode_call_errorhandler(const char *errors,
7018 PyObject **errorHandler,
7019 const char *encoding, const char *reason,
7020 PyObject *unicode, PyObject **exceptionObject,
7021 Py_ssize_t startpos, Py_ssize_t endpos,
7022 Py_ssize_t *newpos)
7023 {
7024 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
7025 Py_ssize_t len;
7026 PyObject *restuple;
7027 PyObject *resunicode;
7028
7029 if (*errorHandler == NULL) {
7030 *errorHandler = PyCodec_LookupError(errors);
7031 if (*errorHandler == NULL)
7032 return NULL;
7033 }
7034
7035 if (PyUnicode_READY(unicode) == -1)
7036 return NULL;
7037 len = PyUnicode_GET_LENGTH(unicode);
7038
7039 make_encode_exception(exceptionObject,
7040 encoding, unicode, startpos, endpos, reason);
7041 if (*exceptionObject == NULL)
7042 return NULL;
7043
7044 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
7045 if (restuple == NULL)
7046 return NULL;
7047 if (!PyTuple_Check(restuple)) {
7048 PyErr_SetString(PyExc_TypeError, &argparse[3]);
7049 Py_DECREF(restuple);
7050 return NULL;
7051 }
7052 if (!PyArg_ParseTuple(restuple, argparse,
7053 &resunicode, newpos)) {
7054 Py_DECREF(restuple);
7055 return NULL;
7056 }
7057 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7058 PyErr_SetString(PyExc_TypeError, &argparse[3]);
7059 Py_DECREF(restuple);
7060 return NULL;
7061 }
7062 if (*newpos<0)
7063 *newpos = len + *newpos;
7064 if (*newpos<0 || *newpos>len) {
7065 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7066 Py_DECREF(restuple);
7067 return NULL;
7068 }
7069 Py_INCREF(resunicode);
7070 Py_DECREF(restuple);
7071 return resunicode;
7072 }
7073
/* Encode `unicode` into bytes, one byte per character, accepting only
   characters below `limit`.

   limit == 256 selects the "latin-1" encoding name and error reason; any
   other value selects "ascii" (callers in this file pass 256 or 128).

   Characters >= limit are collected into maximal runs [collstart, collend)
   and passed to the error handler named by `errors`.  The handlers
   recognised by _Py_GetErrorHandler() are implemented inline; anything
   else goes through unicode_encode_call_errorhandler().

   Returns the result of _PyBytesWriter_Finish(), or NULL on error. */
static PyObject *
unicode_encode_ucs1(PyObject *unicode,
                    const char *errors,
                    const Py_UCS4 limit)
{
    /* input state */
    Py_ssize_t pos=0, size;
    int kind;
    const void *data;
    /* pointer into the output */
    char *str;
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
    PyObject *rep = NULL;
    /* output object */
    _PyBytesWriter writer;

    if (PyUnicode_READY(unicode) == -1)
        return NULL;
    size = PyUnicode_GET_LENGTH(unicode);
    kind = PyUnicode_KIND(unicode);
    data = PyUnicode_DATA(unicode);
    /* allocate enough for a simple encoding without
       replacements, if we need more, we'll resize */
    if (size == 0)
        return PyBytes_FromStringAndSize(NULL, 0);

    _PyBytesWriter_Init(&writer);
    str = _PyBytesWriter_Alloc(&writer, size);
    if (str == NULL)
        return NULL;

    while (pos < size) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        /* can we encode this? */
        if (ch < limit) {
            /* no overflow check, because we know that the space is enough */
            *str++ = (char)ch;
            ++pos;
        }
        else {
            Py_ssize_t newpos, i;
            /* startpos for collecting unencodable chars */
            Py_ssize_t collstart = pos;
            Py_ssize_t collend = collstart + 1;
            /* find all unencodable characters */

            while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
                ++collend;

            /* Only overallocate the buffer if it's not the last write */
            writer.overallocate = (collend < size);

            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
            if (error_handler == _Py_ERROR_UNKNOWN)
                error_handler = _Py_GetErrorHandler(errors);

            switch (error_handler) {
            case _Py_ERROR_STRICT:
                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
                goto onError;

            case _Py_ERROR_REPLACE:
                /* One '?' per unencodable character; this uses exactly the
                   one byte per character that was allocated up front. */
                memset(str, '?', collend - collstart);
                str += (collend - collstart);
                /* fall through */
            case _Py_ERROR_IGNORE:
                /* Skip the whole run. */
                pos = collend;
                break;

            case _Py_ERROR_BACKSLASHREPLACE:
                /* subtract preallocated bytes */
                writer.min_size -= (collend - collstart);
                str = backslashreplace(&writer, str,
                                       unicode, collstart, collend);
                if (str == NULL)
                    goto onError;
                pos = collend;
                break;

            case _Py_ERROR_XMLCHARREFREPLACE:
                /* subtract preallocated bytes */
                writer.min_size -= (collend - collstart);
                str = xmlcharrefreplace(&writer, str,
                                        unicode, collstart, collend);
                if (str == NULL)
                    goto onError;
                pos = collend;
                break;

            case _Py_ERROR_SURROGATEESCAPE:
                /* Characters U+DC80..U+DCFF are written back as the bytes
                   0x80..0xFF; stop at the first character outside that
                   range. */
                for (i = collstart; i < collend; ++i) {
                    ch = PyUnicode_READ(kind, data, i);
                    if (ch < 0xdc80 || 0xdcff < ch) {
                        /* Not a UTF-8b surrogate */
                        break;
                    }
                    *str++ = (char)(ch - 0xdc00);
                    ++pos;
                }
                /* The whole run was handled. */
                if (i >= collend)
                    break;
                /* Otherwise hand the rest of the run, starting at the first
                   character that was not handled, to the generic path. */
                collstart = pos;
                assert(collstart != collend);
                /* fall through */

            default:
                rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
                                                       encoding, reason, unicode, &exc,
                                                       collstart, collend, &newpos);
                if (rep == NULL)
                    goto onError;

                if (newpos < collstart) {
                    /* The handler asked to resume before the failing run:
                       reserve collstart - newpos additional bytes
                       (presumably because those characters will be encoded
                       again). */
                    writer.overallocate = 1;
                    str = _PyBytesWriter_Prepare(&writer, str,
                                                 collstart - newpos);
                    if (str == NULL)
                        goto onError;
                }
                else {
                    /* subtract preallocated bytes */
                    writer.min_size -= newpos - collstart;
                    /* Only overallocate the buffer if it's not the last write */
                    writer.overallocate = (newpos < size);
                }

                if (PyBytes_Check(rep)) {
                    /* Directly copy bytes result to output. */
                    str = _PyBytesWriter_WriteBytes(&writer, str,
                                                    PyBytes_AS_STRING(rep),
                                                    PyBytes_GET_SIZE(rep));
                }
                else {
                    assert(PyUnicode_Check(rep));

                    if (PyUnicode_READY(rep) < 0)
                        goto onError;

                    /* The replacement string itself must be encodable:
                       1-byte kind for latin-1, pure ASCII otherwise. */
                    if (limit == 256 ?
                        PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
                        !PyUnicode_IS_ASCII(rep))
                    {
                        /* Not all characters are smaller than limit */
                        raise_encode_exception(&exc, encoding, unicode,
                                               collstart, collend, reason);
                        goto onError;
                    }
                    assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
                    str = _PyBytesWriter_WriteBytes(&writer, str,
                                                    PyUnicode_DATA(rep),
                                                    PyUnicode_GET_LENGTH(rep));
                }
                if (str == NULL)
                    goto onError;

                pos = newpos;
                Py_CLEAR(rep);
            }

            /* If overallocation was disabled, ensure that it was the last
               write. Otherwise, we missed an optimization */
            assert(writer.overallocate || pos == size);
        }
    }

    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return _PyBytesWriter_Finish(&writer, str);

  onError:
    Py_XDECREF(rep);
    _PyBytesWriter_Dealloc(&writer);
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return NULL;
}
7255
7256 PyObject *
_PyUnicode_AsLatin1String(PyObject * unicode,const char * errors)7257 _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
7258 {
7259 if (!PyUnicode_Check(unicode)) {
7260 PyErr_BadArgument();
7261 return NULL;
7262 }
7263 if (PyUnicode_READY(unicode) == -1)
7264 return NULL;
7265 /* Fast path: if it is a one-byte string, construct
7266 bytes object directly. */
7267 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7268 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7269 PyUnicode_GET_LENGTH(unicode));
7270 /* Non-Latin-1 characters present. Defer to above function to
7271 raise the exception. */
7272 return unicode_encode_ucs1(unicode, errors, 256);
7273 }
7274
/* Encode `unicode` as Latin-1.

   Thin wrapper around _PyUnicode_AsLatin1String() that passes NULL as the
   error handler name. */
PyObject*
PyUnicode_AsLatin1String(PyObject *unicode)
{
    return _PyUnicode_AsLatin1String(unicode, NULL);
}
7280
7281 /* --- 7-bit ASCII Codec -------------------------------------------------- */
7282
/* Decode size bytes of ASCII data at s.

   Bytes below 128 map to the code point with the same value.  Any other
   byte is handled by the error handler named by `errors`: "replace",
   "surrogateescape" and "ignore" are implemented inline, everything else
   goes through unicode_decode_call_errorhandler_writer().

   Returns a str object, or NULL on failure. */
PyObject *
PyUnicode_DecodeASCII(const char *s,
                      Py_ssize_t size,
                      const char *errors)
{
    const char *starts = s;
    const char *e = s + size;
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;

    if (size == 0)
        _Py_RETURN_UNICODE_EMPTY();

    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    if (size == 1 && (unsigned char)s[0] < 128) {
        return get_latin1_char((unsigned char)s[0]);
    }

    // Shortcut for simple case: decode straight into a new ASCII string;
    // if every byte was consumed there is nothing left to do.
    PyObject *u = PyUnicode_New(size, 127);
    if (u == NULL) {
        return NULL;
    }
    Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
    if (outpos == size) {
        return u;
    }

    /* Slow path: continue from where ascii_decode() stopped, with a writer
       that takes over the buffer allocated above. */
    _PyUnicodeWriter writer;
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
    writer.pos = outpos;

    s += outpos;
    /* Local copies of the writer's kind and data; refreshed after every
       call that may change them. */
    int kind = writer.kind;
    void *data = writer.data;
    Py_ssize_t startinpos, endinpos;

    while (s < e) {
        unsigned char c = (unsigned char)*s;
        if (c < 128) {
            PyUnicode_WRITE(kind, data, writer.pos, c);
            writer.pos++;
            ++s;
            continue;
        }

        /* byte outside range 0x00..0x7f: call the error handler */

        if (error_handler == _Py_ERROR_UNKNOWN)
            error_handler = _Py_GetErrorHandler(errors);

        switch (error_handler)
        {
        case _Py_ERROR_REPLACE:
        case _Py_ERROR_SURROGATEESCAPE:
            /* Fast-path: the error handler only writes one character,
               but we may switch to UCS2 at the first write */
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
                goto onError;
            kind = writer.kind;
            data = writer.data;

            /* "replace" writes U+FFFD; "surrogateescape" writes the byte
               offset by 0xdc00. */
            if (error_handler == _Py_ERROR_REPLACE)
                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
            else
                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
            writer.pos++;
            ++s;
            break;

        case _Py_ERROR_IGNORE:
            /* Drop the offending byte. */
            ++s;
            break;

        default:
            /* Report a one-byte error range to the generic handler. */
            startinpos = s-starts;
            endinpos = startinpos + 1;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &error_handler_obj,
                    "ascii", "ordinal not in range(128)",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            kind = writer.kind;
            data = writer.data;
        }
    }
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return NULL;
}
7381
7382 PyObject *
_PyUnicode_AsASCIIString(PyObject * unicode,const char * errors)7383 _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
7384 {
7385 if (!PyUnicode_Check(unicode)) {
7386 PyErr_BadArgument();
7387 return NULL;
7388 }
7389 if (PyUnicode_READY(unicode) == -1)
7390 return NULL;
7391 /* Fast path: if it is an ASCII-only string, construct bytes object
7392 directly. Else defer to above function to raise the exception. */
7393 if (PyUnicode_IS_ASCII(unicode))
7394 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7395 PyUnicode_GET_LENGTH(unicode));
7396 return unicode_encode_ucs1(unicode, errors, 128);
7397 }
7398
/* Encode `unicode` as ASCII.

   Thin wrapper around _PyUnicode_AsASCIIString() that passes NULL as the
   error handler name. */
PyObject *
PyUnicode_AsASCIIString(PyObject *unicode)
{
    return _PyUnicode_AsASCIIString(unicode, NULL);
}
7404
7405 #ifdef MS_WINDOWS
7406
7407 /* --- MBCS codecs for Windows -------------------------------------------- */
7408
/* NEED_RETRY is defined when int is narrower than size_t.  In that case
   decode_code_page_stateful() splits inputs larger than DECODING_CHUNK_SIZE
   into several chunks, since it passes chunk sizes around as int. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif

/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows. */
#define DECODING_CHUNK_SIZE (INT_MAX/4)

/* Fallback definition for when the included headers do not provide
   WC_ERR_INVALID_CHARS. */
#ifndef WC_ERR_INVALID_CHARS
# define WC_ERR_INVALID_CHARS 0x0080
#endif
7422
7423 static const char*
code_page_name(UINT code_page,PyObject ** obj)7424 code_page_name(UINT code_page, PyObject **obj)
7425 {
7426 *obj = NULL;
7427 if (code_page == CP_ACP)
7428 return "mbcs";
7429 if (code_page == CP_UTF7)
7430 return "CP_UTF7";
7431 if (code_page == CP_UTF8)
7432 return "CP_UTF8";
7433
7434 *obj = PyBytes_FromFormat("cp%u", code_page);
7435 if (*obj == NULL)
7436 return NULL;
7437 return PyBytes_AS_STRING(*obj);
7438 }
7439
7440 static DWORD
decode_code_page_flags(UINT code_page)7441 decode_code_page_flags(UINT code_page)
7442 {
7443 if (code_page == CP_UTF7) {
7444 /* The CP_UTF7 decoder only supports flags=0 */
7445 return 0;
7446 }
7447 else
7448 return MB_ERR_INVALID_CHARS;
7449 }
7450
7451 /*
7452 * Decode a byte string from a Windows code page into unicode object in strict
7453 * mode.
7454 *
7455 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7456 * OSError and returns -1 on other error.
7457 */
7458 static int
decode_code_page_strict(UINT code_page,wchar_t ** buf,Py_ssize_t * bufsize,const char * in,int insize)7459 decode_code_page_strict(UINT code_page,
7460 wchar_t **buf,
7461 Py_ssize_t *bufsize,
7462 const char *in,
7463 int insize)
7464 {
7465 DWORD flags = MB_ERR_INVALID_CHARS;
7466 wchar_t *out;
7467 DWORD outsize;
7468
7469 /* First get the size of the result */
7470 assert(insize > 0);
7471 while ((outsize = MultiByteToWideChar(code_page, flags,
7472 in, insize, NULL, 0)) <= 0)
7473 {
7474 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7475 goto error;
7476 }
7477 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7478 flags = 0;
7479 }
7480
7481 /* Extend a wchar_t* buffer */
7482 Py_ssize_t n = *bufsize; /* Get the current length */
7483 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7484 return -1;
7485 }
7486 out = *buf + n;
7487
7488 /* Do the conversion */
7489 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7490 if (outsize <= 0)
7491 goto error;
7492 return insize;
7493
7494 error:
7495 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7496 return -2;
7497 PyErr_SetFromWindowsErr(0);
7498 return -1;
7499 }
7500
/*
 * Decode a byte string from a code page into unicode object with an error
 * handler.
 *
 * The input is decoded one character at a time, appending to the wchar_t
 * buffer *buf (current length *bufsize).  Bytes that cannot be decoded are
 * passed to the error handler named by `errors`.  When `final` is false and
 * the undecodable bytes are at the very end of the input, they are left
 * unconsumed instead.
 *
 * With a strict error handler (errors NULL or "strict") and `final` set, a
 * UnicodeDecodeError is raised immediately, without decoding anything.
 *
 * Returns consumed size if succeed, or raise an OSError or
 * UnicodeDecodeError exception and returns -1 on error.
 */
static int
decode_code_page_errors(UINT code_page,
                        wchar_t **buf,
                        Py_ssize_t *bufsize,
                        const char *in, const int size,
                        const char *errors, int final)
{
    const char *startin = in;
    const char *endin = in + size;
    DWORD flags = MB_ERR_INVALID_CHARS;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "No mapping for the Unicode character exists "
                         "in the target code page.";
    /* each step cannot decode more than 1 character, but a character can be
       represented as a surrogate pair */
    wchar_t buffer[2], *out;
    int insize;
    Py_ssize_t outsize;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    const char *encoding;
    DWORD err;
    int ret = -1;

    assert(size > 0);

    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
           UnicodeDecodeError. */
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_CLEAR(exc);
        }
        goto error;
    }

    /* Extend a wchar_t* buffer: reserve room for two wchar_t (the size of
       `buffer`) per input byte, checking for overflow first. */
    Py_ssize_t n = *bufsize;   /* Get the current length */
    if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
        PyErr_NoMemory();
        goto error;
    }
    if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
        goto error;
    }
    out = *buf + n;

    /* Decode the byte string character per character */
    while (in < endin)
    {
        /* Decode a character: try 1 byte, then 2, ... until the conversion
           succeeds or the length limit / end of input is reached. */
        insize = 1;
        do
        {
            outsize = MultiByteToWideChar(code_page, flags,
                                          in, insize,
                                          buffer, Py_ARRAY_LENGTH(buffer));
            if (outsize > 0)
                break;
            err = GetLastError();
            if (err == ERROR_INVALID_FLAGS && flags) {
                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
                flags = 0;
                /* `continue` jumps to the loop condition without changing
                   insize, so the same bytes are retried without flags. */
                continue;
            }
            if (err != ERROR_NO_UNICODE_TRANSLATION
                && err != ERROR_INSUFFICIENT_BUFFER)
            {
                PyErr_SetFromWindowsErr(0);
                goto error;
            }
            insize++;
        }
        /* 4=maximum length of a UTF-8 sequence */
        while (insize <= 4 && (in + insize) <= endin);

        if (outsize <= 0) {
            Py_ssize_t startinpos, endinpos, outpos;

            /* last character in partial decode? */
            if (in + insize >= endin && !final)
                break;

            /* Report a one-byte error range; the handler updates `in`
               and may reallocate *buf, so recompute `out` afterwards. */
            startinpos = in - startin;
            endinpos = startinpos + 1;
            outpos = out - *buf;
            if (unicode_decode_call_errorhandler_wchar(
                    errors, &errorHandler,
                    encoding, reason,
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
                    buf, bufsize, &outpos))
            {
                goto error;
            }
            out = *buf + outpos;
        }
        else {
            in += insize;
            memcpy(out, buffer, outsize * sizeof(wchar_t));
            out += outsize;
        }
    }

    /* Shrink the buffer */
    assert(out - *buf <= *bufsize);
    *bufsize = out - *buf;
    /* (in - startin) <= size and size is an int */
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);

error:
    /* Shared cleanup for both the success and failure paths. */
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
7630
/* Decode size bytes at s from the Windows code page `code_page`.

   The input is first decoded in strict mode; if that reports a decode
   error (-2), the same chunk is decoded again with the error handler named
   by `errors`.  When NEED_RETRY is defined, inputs larger than
   DECODING_CHUNK_SIZE are processed in several chunks.

   consumed  may be NULL.  When non-NULL it receives the number of input
             bytes consumed, and the last chunk is decoded as non-final.

   Returns a str object, or NULL on error (negative code page, negative
   size, or a decoding failure). */
static PyObject *
decode_code_page_stateful(int code_page,
                          const char *s, Py_ssize_t size,
                          const char *errors, Py_ssize_t *consumed)
{
    wchar_t *buf = NULL;
    Py_ssize_t bufsize = 0;
    int chunk_size, final, converted, done;

    if (code_page < 0) {
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
        return NULL;
    }
    if (size < 0) {
        PyErr_BadInternalCall();
        return NULL;
    }

    if (consumed)
        *consumed = 0;

    do
    {
#ifdef NEED_RETRY
        if (size > DECODING_CHUNK_SIZE) {
            /* More input remains than fits in one chunk: decode a
               non-final chunk and loop again. */
            chunk_size = DECODING_CHUNK_SIZE;
            final = 0;
            done = 0;
        }
        else
#endif
        {
            /* Last (or only) chunk.  It is final only when the caller did
               not ask for a consumed count. */
            chunk_size = (int)size;
            final = (consumed == NULL);
            done = 1;
        }

        if (chunk_size == 0 && done) {
            /* Nothing left to decode: finish with what has been decoded so
               far, or return the empty string if there is no buffer. */
            if (buf != NULL)
                break;
            _Py_RETURN_UNICODE_EMPTY();
        }

        converted = decode_code_page_strict(code_page, &buf, &bufsize,
                                            s, chunk_size);
        /* -2 means a decode error: retry with the error handler. */
        if (converted == -2)
            converted = decode_code_page_errors(code_page, &buf, &bufsize,
                                                s, chunk_size,
                                                errors, final);
        assert(converted != 0 || done);

        if (converted < 0) {
            PyMem_Free(buf);
            return NULL;
        }

        if (consumed)
            *consumed += converted;

        s += converted;
        size -= converted;
    } while (!done);

    PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
    PyMem_Free(buf);
    return v;
}
7698
7699 PyObject *
PyUnicode_DecodeCodePageStateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7700 PyUnicode_DecodeCodePageStateful(int code_page,
7701 const char *s,
7702 Py_ssize_t size,
7703 const char *errors,
7704 Py_ssize_t *consumed)
7705 {
7706 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7707 }
7708
7709 PyObject *
PyUnicode_DecodeMBCSStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7710 PyUnicode_DecodeMBCSStateful(const char *s,
7711 Py_ssize_t size,
7712 const char *errors,
7713 Py_ssize_t *consumed)
7714 {
7715 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7716 }
7717
7718 PyObject *
PyUnicode_DecodeMBCS(const char * s,Py_ssize_t size,const char * errors)7719 PyUnicode_DecodeMBCS(const char *s,
7720 Py_ssize_t size,
7721 const char *errors)
7722 {
7723 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7724 }
7725
7726 static DWORD
encode_code_page_flags(UINT code_page,const char * errors)7727 encode_code_page_flags(UINT code_page, const char *errors)
7728 {
7729 if (code_page == CP_UTF8) {
7730 return WC_ERR_INVALID_CHARS;
7731 }
7732 else if (code_page == CP_UTF7) {
7733 /* CP_UTF7 only supports flags=0 */
7734 return 0;
7735 }
7736 else {
7737 if (errors != NULL && strcmp(errors, "replace") == 0)
7738 return 0;
7739 else
7740 return WC_NO_BEST_FIT_CHARS;
7741 }
7742 }
7743
7744 /*
7745 * Encode a Unicode string to a Windows code page into a byte string in strict
7746 * mode.
7747 *
7748 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7749 * an OSError and returns -1 on other error.
7750 */
7751 static int
encode_code_page_strict(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t offset,int len,const char * errors)7752 encode_code_page_strict(UINT code_page, PyObject **outbytes,
7753 PyObject *unicode, Py_ssize_t offset, int len,
7754 const char* errors)
7755 {
7756 BOOL usedDefaultChar = FALSE;
7757 BOOL *pusedDefaultChar = &usedDefaultChar;
7758 int outsize;
7759 wchar_t *p;
7760 Py_ssize_t size;
7761 const DWORD flags = encode_code_page_flags(code_page, NULL);
7762 char *out;
7763 /* Create a substring so that we can get the UTF-16 representation
7764 of just the slice under consideration. */
7765 PyObject *substring;
7766 int ret = -1;
7767
7768 assert(len > 0);
7769
7770 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7771 pusedDefaultChar = &usedDefaultChar;
7772 else
7773 pusedDefaultChar = NULL;
7774
7775 substring = PyUnicode_Substring(unicode, offset, offset+len);
7776 if (substring == NULL)
7777 return -1;
7778 #if USE_UNICODE_WCHAR_CACHE
7779 _Py_COMP_DIAG_PUSH
7780 _Py_COMP_DIAG_IGNORE_DEPR_DECLS
7781 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7782 if (p == NULL) {
7783 Py_DECREF(substring);
7784 return -1;
7785 }
7786 _Py_COMP_DIAG_POP
7787 #else /* USE_UNICODE_WCHAR_CACHE */
7788 p = PyUnicode_AsWideCharString(substring, &size);
7789 Py_CLEAR(substring);
7790 if (p == NULL) {
7791 return -1;
7792 }
7793 #endif /* USE_UNICODE_WCHAR_CACHE */
7794 assert(size <= INT_MAX);
7795
7796 /* First get the size of the result */
7797 outsize = WideCharToMultiByte(code_page, flags,
7798 p, (int)size,
7799 NULL, 0,
7800 NULL, pusedDefaultChar);
7801 if (outsize <= 0)
7802 goto error;
7803 /* If we used a default char, then we failed! */
7804 if (pusedDefaultChar && *pusedDefaultChar) {
7805 ret = -2;
7806 goto done;
7807 }
7808
7809 if (*outbytes == NULL) {
7810 /* Create string object */
7811 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7812 if (*outbytes == NULL) {
7813 goto done;
7814 }
7815 out = PyBytes_AS_STRING(*outbytes);
7816 }
7817 else {
7818 /* Extend string object */
7819 const Py_ssize_t n = PyBytes_Size(*outbytes);
7820 if (outsize > PY_SSIZE_T_MAX - n) {
7821 PyErr_NoMemory();
7822 goto done;
7823 }
7824 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7825 goto done;
7826 }
7827 out = PyBytes_AS_STRING(*outbytes) + n;
7828 }
7829
7830 /* Do the conversion */
7831 outsize = WideCharToMultiByte(code_page, flags,
7832 p, (int)size,
7833 out, outsize,
7834 NULL, pusedDefaultChar);
7835 if (outsize <= 0)
7836 goto error;
7837 if (pusedDefaultChar && *pusedDefaultChar) {
7838 ret = -2;
7839 goto done;
7840 }
7841 ret = 0;
7842
7843 done:
7844 #if USE_UNICODE_WCHAR_CACHE
7845 Py_DECREF(substring);
7846 #else /* USE_UNICODE_WCHAR_CACHE */
7847 PyMem_Free(p);
7848 #endif /* USE_UNICODE_WCHAR_CACHE */
7849 return ret;
7850
7851 error:
7852 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
7853 ret = -2;
7854 goto done;
7855 }
7856 PyErr_SetFromWindowsErr(0);
7857 goto done;
7858 }
7859
7860 /*
7861 * Encode a Unicode string to a Windows code page into a byte string using an
7862 * error handler.
7863 *
7864 * Returns consumed characters if succeed, or raise an OSError and returns
7865 * -1 on other error.
7866 */
7867 static int
encode_code_page_errors(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t unicode_offset,Py_ssize_t insize,const char * errors)7868 encode_code_page_errors(UINT code_page, PyObject **outbytes,
7869 PyObject *unicode, Py_ssize_t unicode_offset,
7870 Py_ssize_t insize, const char* errors)
7871 {
7872 const DWORD flags = encode_code_page_flags(code_page, errors);
7873 Py_ssize_t pos = unicode_offset;
7874 Py_ssize_t endin = unicode_offset + insize;
7875 /* Ideally, we should get reason from FormatMessage. This is the Windows
7876 2000 English version of the message. */
7877 const char *reason = "invalid character";
7878 /* 4=maximum length of a UTF-8 sequence */
7879 char buffer[4];
7880 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7881 Py_ssize_t outsize;
7882 char *out;
7883 PyObject *errorHandler = NULL;
7884 PyObject *exc = NULL;
7885 PyObject *encoding_obj = NULL;
7886 const char *encoding;
7887 Py_ssize_t newpos, newoutsize;
7888 PyObject *rep;
7889 int ret = -1;
7890
7891 assert(insize > 0);
7892
7893 encoding = code_page_name(code_page, &encoding_obj);
7894 if (encoding == NULL)
7895 return -1;
7896
7897 if (errors == NULL || strcmp(errors, "strict") == 0) {
7898 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7899 then we raise a UnicodeEncodeError. */
7900 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7901 if (exc != NULL) {
7902 PyCodec_StrictErrors(exc);
7903 Py_DECREF(exc);
7904 }
7905 Py_XDECREF(encoding_obj);
7906 return -1;
7907 }
7908
7909 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7910 pusedDefaultChar = &usedDefaultChar;
7911 else
7912 pusedDefaultChar = NULL;
7913
7914 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7915 PyErr_NoMemory();
7916 goto error;
7917 }
7918 outsize = insize * Py_ARRAY_LENGTH(buffer);
7919
7920 if (*outbytes == NULL) {
7921 /* Create string object */
7922 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7923 if (*outbytes == NULL)
7924 goto error;
7925 out = PyBytes_AS_STRING(*outbytes);
7926 }
7927 else {
7928 /* Extend string object */
7929 Py_ssize_t n = PyBytes_Size(*outbytes);
7930 if (n > PY_SSIZE_T_MAX - outsize) {
7931 PyErr_NoMemory();
7932 goto error;
7933 }
7934 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7935 goto error;
7936 out = PyBytes_AS_STRING(*outbytes) + n;
7937 }
7938
7939 /* Encode the string character per character */
7940 while (pos < endin)
7941 {
7942 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7943 wchar_t chars[2];
7944 int charsize;
7945 if (ch < 0x10000) {
7946 chars[0] = (wchar_t)ch;
7947 charsize = 1;
7948 }
7949 else {
7950 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7951 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
7952 charsize = 2;
7953 }
7954
7955 outsize = WideCharToMultiByte(code_page, flags,
7956 chars, charsize,
7957 buffer, Py_ARRAY_LENGTH(buffer),
7958 NULL, pusedDefaultChar);
7959 if (outsize > 0) {
7960 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7961 {
7962 pos++;
7963 memcpy(out, buffer, outsize);
7964 out += outsize;
7965 continue;
7966 }
7967 }
7968 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7969 PyErr_SetFromWindowsErr(0);
7970 goto error;
7971 }
7972
7973 rep = unicode_encode_call_errorhandler(
7974 errors, &errorHandler, encoding, reason,
7975 unicode, &exc,
7976 pos, pos + 1, &newpos);
7977 if (rep == NULL)
7978 goto error;
7979
7980 Py_ssize_t morebytes = pos - newpos;
7981 if (PyBytes_Check(rep)) {
7982 outsize = PyBytes_GET_SIZE(rep);
7983 morebytes += outsize;
7984 if (morebytes > 0) {
7985 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7986 newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
7987 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7988 Py_DECREF(rep);
7989 goto error;
7990 }
7991 out = PyBytes_AS_STRING(*outbytes) + offset;
7992 }
7993 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7994 out += outsize;
7995 }
7996 else {
7997 Py_ssize_t i;
7998 enum PyUnicode_Kind kind;
7999 const void *data;
8000
8001 if (PyUnicode_READY(rep) == -1) {
8002 Py_DECREF(rep);
8003 goto error;
8004 }
8005
8006 outsize = PyUnicode_GET_LENGTH(rep);
8007 morebytes += outsize;
8008 if (morebytes > 0) {
8009 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
8010 newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
8011 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
8012 Py_DECREF(rep);
8013 goto error;
8014 }
8015 out = PyBytes_AS_STRING(*outbytes) + offset;
8016 }
8017 kind = PyUnicode_KIND(rep);
8018 data = PyUnicode_DATA(rep);
8019 for (i=0; i < outsize; i++) {
8020 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8021 if (ch > 127) {
8022 raise_encode_exception(&exc,
8023 encoding, unicode,
8024 pos, pos + 1,
8025 "unable to encode error handler result to ASCII");
8026 Py_DECREF(rep);
8027 goto error;
8028 }
8029 *out = (unsigned char)ch;
8030 out++;
8031 }
8032 }
8033 pos = newpos;
8034 Py_DECREF(rep);
8035 }
8036 /* write a NUL byte */
8037 *out = 0;
8038 outsize = out - PyBytes_AS_STRING(*outbytes);
8039 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
8040 if (_PyBytes_Resize(outbytes, outsize) < 0)
8041 goto error;
8042 ret = 0;
8043
8044 error:
8045 Py_XDECREF(encoding_obj);
8046 Py_XDECREF(errorHandler);
8047 Py_XDECREF(exc);
8048 return ret;
8049 }
8050
8051 static PyObject *
encode_code_page(int code_page,PyObject * unicode,const char * errors)8052 encode_code_page(int code_page,
8053 PyObject *unicode,
8054 const char *errors)
8055 {
8056 Py_ssize_t len;
8057 PyObject *outbytes = NULL;
8058 Py_ssize_t offset;
8059 int chunk_len, ret, done;
8060
8061 if (!PyUnicode_Check(unicode)) {
8062 PyErr_BadArgument();
8063 return NULL;
8064 }
8065
8066 if (PyUnicode_READY(unicode) == -1)
8067 return NULL;
8068 len = PyUnicode_GET_LENGTH(unicode);
8069
8070 if (code_page < 0) {
8071 PyErr_SetString(PyExc_ValueError, "invalid code page number");
8072 return NULL;
8073 }
8074
8075 if (len == 0)
8076 return PyBytes_FromStringAndSize(NULL, 0);
8077
8078 offset = 0;
8079 do
8080 {
8081 #ifdef NEED_RETRY
8082 if (len > DECODING_CHUNK_SIZE) {
8083 chunk_len = DECODING_CHUNK_SIZE;
8084 done = 0;
8085 }
8086 else
8087 #endif
8088 {
8089 chunk_len = (int)len;
8090 done = 1;
8091 }
8092
8093 ret = encode_code_page_strict(code_page, &outbytes,
8094 unicode, offset, chunk_len,
8095 errors);
8096 if (ret == -2)
8097 ret = encode_code_page_errors(code_page, &outbytes,
8098 unicode, offset,
8099 chunk_len, errors);
8100 if (ret < 0) {
8101 Py_XDECREF(outbytes);
8102 return NULL;
8103 }
8104
8105 offset += chunk_len;
8106 len -= chunk_len;
8107 } while (!done);
8108
8109 return outbytes;
8110 }
8111
8112 PyObject *
PyUnicode_EncodeCodePage(int code_page,PyObject * unicode,const char * errors)8113 PyUnicode_EncodeCodePage(int code_page,
8114 PyObject *unicode,
8115 const char *errors)
8116 {
8117 return encode_code_page(code_page, unicode, errors);
8118 }
8119
8120 PyObject *
PyUnicode_AsMBCSString(PyObject * unicode)8121 PyUnicode_AsMBCSString(PyObject *unicode)
8122 {
8123 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
8124 }
8125
8126 #undef NEED_RETRY
8127
8128 #endif /* MS_WINDOWS */
8129
8130 /* --- Character Mapping Codec -------------------------------------------- */
8131
8132 static int
charmap_decode_string(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors,_PyUnicodeWriter * writer)8133 charmap_decode_string(const char *s,
8134 Py_ssize_t size,
8135 PyObject *mapping,
8136 const char *errors,
8137 _PyUnicodeWriter *writer)
8138 {
8139 const char *starts = s;
8140 const char *e;
8141 Py_ssize_t startinpos, endinpos;
8142 PyObject *errorHandler = NULL, *exc = NULL;
8143 Py_ssize_t maplen;
8144 enum PyUnicode_Kind mapkind;
8145 const void *mapdata;
8146 Py_UCS4 x;
8147 unsigned char ch;
8148
8149 if (PyUnicode_READY(mapping) == -1)
8150 return -1;
8151
8152 maplen = PyUnicode_GET_LENGTH(mapping);
8153 mapdata = PyUnicode_DATA(mapping);
8154 mapkind = PyUnicode_KIND(mapping);
8155
8156 e = s + size;
8157
8158 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8159 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8160 * is disabled in encoding aliases, latin1 is preferred because
8161 * its implementation is faster. */
8162 const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
8163 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8164 Py_UCS4 maxchar = writer->maxchar;
8165
8166 assert (writer->kind == PyUnicode_1BYTE_KIND);
8167 while (s < e) {
8168 ch = *s;
8169 x = mapdata_ucs1[ch];
8170 if (x > maxchar) {
8171 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8172 goto onError;
8173 maxchar = writer->maxchar;
8174 outdata = (Py_UCS1 *)writer->data;
8175 }
8176 outdata[writer->pos] = x;
8177 writer->pos++;
8178 ++s;
8179 }
8180 return 0;
8181 }
8182
8183 while (s < e) {
8184 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8185 enum PyUnicode_Kind outkind = writer->kind;
8186 const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
8187 if (outkind == PyUnicode_1BYTE_KIND) {
8188 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8189 Py_UCS4 maxchar = writer->maxchar;
8190 while (s < e) {
8191 ch = *s;
8192 x = mapdata_ucs2[ch];
8193 if (x > maxchar)
8194 goto Error;
8195 outdata[writer->pos] = x;
8196 writer->pos++;
8197 ++s;
8198 }
8199 break;
8200 }
8201 else if (outkind == PyUnicode_2BYTE_KIND) {
8202 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8203 while (s < e) {
8204 ch = *s;
8205 x = mapdata_ucs2[ch];
8206 if (x == 0xFFFE)
8207 goto Error;
8208 outdata[writer->pos] = x;
8209 writer->pos++;
8210 ++s;
8211 }
8212 break;
8213 }
8214 }
8215 ch = *s;
8216
8217 if (ch < maplen)
8218 x = PyUnicode_READ(mapkind, mapdata, ch);
8219 else
8220 x = 0xfffe; /* invalid value */
8221 Error:
8222 if (x == 0xfffe)
8223 {
8224 /* undefined mapping */
8225 startinpos = s-starts;
8226 endinpos = startinpos+1;
8227 if (unicode_decode_call_errorhandler_writer(
8228 errors, &errorHandler,
8229 "charmap", "character maps to <undefined>",
8230 &starts, &e, &startinpos, &endinpos, &exc, &s,
8231 writer)) {
8232 goto onError;
8233 }
8234 continue;
8235 }
8236
8237 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8238 goto onError;
8239 ++s;
8240 }
8241 Py_XDECREF(errorHandler);
8242 Py_XDECREF(exc);
8243 return 0;
8244
8245 onError:
8246 Py_XDECREF(errorHandler);
8247 Py_XDECREF(exc);
8248 return -1;
8249 }
8250
8251 static int
charmap_decode_mapping(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors,_PyUnicodeWriter * writer)8252 charmap_decode_mapping(const char *s,
8253 Py_ssize_t size,
8254 PyObject *mapping,
8255 const char *errors,
8256 _PyUnicodeWriter *writer)
8257 {
8258 const char *starts = s;
8259 const char *e;
8260 Py_ssize_t startinpos, endinpos;
8261 PyObject *errorHandler = NULL, *exc = NULL;
8262 unsigned char ch;
8263 PyObject *key, *item = NULL;
8264
8265 e = s + size;
8266
8267 while (s < e) {
8268 ch = *s;
8269
8270 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8271 key = PyLong_FromLong((long)ch);
8272 if (key == NULL)
8273 goto onError;
8274
8275 item = PyObject_GetItem(mapping, key);
8276 Py_DECREF(key);
8277 if (item == NULL) {
8278 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8279 /* No mapping found means: mapping is undefined. */
8280 PyErr_Clear();
8281 goto Undefined;
8282 } else
8283 goto onError;
8284 }
8285
8286 /* Apply mapping */
8287 if (item == Py_None)
8288 goto Undefined;
8289 if (PyLong_Check(item)) {
8290 long value = PyLong_AS_LONG(item);
8291 if (value == 0xFFFE)
8292 goto Undefined;
8293 if (value < 0 || value > MAX_UNICODE) {
8294 PyErr_Format(PyExc_TypeError,
8295 "character mapping must be in range(0x%x)",
8296 (unsigned long)MAX_UNICODE + 1);
8297 goto onError;
8298 }
8299
8300 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8301 goto onError;
8302 }
8303 else if (PyUnicode_Check(item)) {
8304 if (PyUnicode_READY(item) == -1)
8305 goto onError;
8306 if (PyUnicode_GET_LENGTH(item) == 1) {
8307 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8308 if (value == 0xFFFE)
8309 goto Undefined;
8310 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8311 goto onError;
8312 }
8313 else {
8314 writer->overallocate = 1;
8315 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8316 goto onError;
8317 }
8318 }
8319 else {
8320 /* wrong return value */
8321 PyErr_SetString(PyExc_TypeError,
8322 "character mapping must return integer, None or str");
8323 goto onError;
8324 }
8325 Py_CLEAR(item);
8326 ++s;
8327 continue;
8328
8329 Undefined:
8330 /* undefined mapping */
8331 Py_CLEAR(item);
8332 startinpos = s-starts;
8333 endinpos = startinpos+1;
8334 if (unicode_decode_call_errorhandler_writer(
8335 errors, &errorHandler,
8336 "charmap", "character maps to <undefined>",
8337 &starts, &e, &startinpos, &endinpos, &exc, &s,
8338 writer)) {
8339 goto onError;
8340 }
8341 }
8342 Py_XDECREF(errorHandler);
8343 Py_XDECREF(exc);
8344 return 0;
8345
8346 onError:
8347 Py_XDECREF(item);
8348 Py_XDECREF(errorHandler);
8349 Py_XDECREF(exc);
8350 return -1;
8351 }
8352
8353 PyObject *
PyUnicode_DecodeCharmap(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors)8354 PyUnicode_DecodeCharmap(const char *s,
8355 Py_ssize_t size,
8356 PyObject *mapping,
8357 const char *errors)
8358 {
8359 _PyUnicodeWriter writer;
8360
8361 /* Default to Latin-1 */
8362 if (mapping == NULL)
8363 return PyUnicode_DecodeLatin1(s, size, errors);
8364
8365 if (size == 0)
8366 _Py_RETURN_UNICODE_EMPTY();
8367 _PyUnicodeWriter_Init(&writer);
8368 writer.min_length = size;
8369 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8370 goto onError;
8371
8372 if (PyUnicode_CheckExact(mapping)) {
8373 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8374 goto onError;
8375 }
8376 else {
8377 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8378 goto onError;
8379 }
8380 return _PyUnicodeWriter_Finish(&writer);
8381
8382 onError:
8383 _PyUnicodeWriter_Dealloc(&writer);
8384 return NULL;
8385 }
8386
8387 /* Charmap encoding: the lookup table */
8388
8389 /*[clinic input]
8390 class EncodingMap "struct encoding_map *" "&EncodingMapType"
8391 [clinic start generated code]*/
8392 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=14e46bbb6c522d22]*/
8393
8394 struct encoding_map {
8395 PyObject_HEAD
8396 unsigned char level1[32];
8397 int count2, count3;
8398 unsigned char level23[1];
8399 };
8400
8401 /*[clinic input]
8402 EncodingMap.size
8403
8404 Return the size (in bytes) of this object.
8405 [clinic start generated code]*/
8406
8407 static PyObject *
EncodingMap_size_impl(struct encoding_map * self)8408 EncodingMap_size_impl(struct encoding_map *self)
8409 /*[clinic end generated code: output=c4c969e4c99342a4 input=004ff13f26bb5366]*/
8410 {
8411 return PyLong_FromLong((sizeof(*self) - 1) + 16*self->count2 +
8412 128*self->count3);
8413 }
8414
8415 static PyMethodDef encoding_map_methods[] = {
8416 ENCODINGMAP_SIZE_METHODDEF
8417 {NULL, NULL}
8418 };
8419
8420 static PyTypeObject EncodingMapType = {
8421 PyVarObject_HEAD_INIT(NULL, 0)
8422 .tp_name = "EncodingMap",
8423 .tp_basicsize = sizeof(struct encoding_map),
8424 /* methods */
8425 .tp_flags = Py_TPFLAGS_DEFAULT,
8426 .tp_methods = encoding_map_methods,
8427 };
8428
8429 PyObject*
PyUnicode_BuildEncodingMap(PyObject * string)8430 PyUnicode_BuildEncodingMap(PyObject* string)
8431 {
8432 PyObject *result;
8433 struct encoding_map *mresult;
8434 int i;
8435 int need_dict = 0;
8436 unsigned char level1[32];
8437 unsigned char level2[512];
8438 unsigned char *mlevel1, *mlevel2, *mlevel3;
8439 int count2 = 0, count3 = 0;
8440 int kind;
8441 const void *data;
8442 Py_ssize_t length;
8443 Py_UCS4 ch;
8444
8445 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
8446 PyErr_BadArgument();
8447 return NULL;
8448 }
8449 kind = PyUnicode_KIND(string);
8450 data = PyUnicode_DATA(string);
8451 length = PyUnicode_GET_LENGTH(string);
8452 length = Py_MIN(length, 256);
8453 memset(level1, 0xFF, sizeof level1);
8454 memset(level2, 0xFF, sizeof level2);
8455
8456 /* If there isn't a one-to-one mapping of NULL to \0,
8457 or if there are non-BMP characters, we need to use
8458 a mapping dictionary. */
8459 if (PyUnicode_READ(kind, data, 0) != 0)
8460 need_dict = 1;
8461 for (i = 1; i < length; i++) {
8462 int l1, l2;
8463 ch = PyUnicode_READ(kind, data, i);
8464 if (ch == 0 || ch > 0xFFFF) {
8465 need_dict = 1;
8466 break;
8467 }
8468 if (ch == 0xFFFE)
8469 /* unmapped character */
8470 continue;
8471 l1 = ch >> 11;
8472 l2 = ch >> 7;
8473 if (level1[l1] == 0xFF)
8474 level1[l1] = count2++;
8475 if (level2[l2] == 0xFF)
8476 level2[l2] = count3++;
8477 }
8478
8479 if (count2 >= 0xFF || count3 >= 0xFF)
8480 need_dict = 1;
8481
8482 if (need_dict) {
8483 PyObject *result = PyDict_New();
8484 PyObject *key, *value;
8485 if (!result)
8486 return NULL;
8487 for (i = 0; i < length; i++) {
8488 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
8489 value = PyLong_FromLong(i);
8490 if (!key || !value)
8491 goto failed1;
8492 if (PyDict_SetItem(result, key, value) == -1)
8493 goto failed1;
8494 Py_DECREF(key);
8495 Py_DECREF(value);
8496 }
8497 return result;
8498 failed1:
8499 Py_XDECREF(key);
8500 Py_XDECREF(value);
8501 Py_DECREF(result);
8502 return NULL;
8503 }
8504
8505 /* Create a three-level trie */
8506 result = PyObject_Malloc(sizeof(struct encoding_map) +
8507 16*count2 + 128*count3 - 1);
8508 if (!result) {
8509 return PyErr_NoMemory();
8510 }
8511
8512 _PyObject_Init(result, &EncodingMapType);
8513 mresult = (struct encoding_map*)result;
8514 mresult->count2 = count2;
8515 mresult->count3 = count3;
8516 mlevel1 = mresult->level1;
8517 mlevel2 = mresult->level23;
8518 mlevel3 = mresult->level23 + 16*count2;
8519 memcpy(mlevel1, level1, 32);
8520 memset(mlevel2, 0xFF, 16*count2);
8521 memset(mlevel3, 0, 128*count3);
8522 count3 = 0;
8523 for (i = 1; i < length; i++) {
8524 int o1, o2, o3, i2, i3;
8525 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8526 if (ch == 0xFFFE)
8527 /* unmapped character */
8528 continue;
8529 o1 = ch>>11;
8530 o2 = (ch>>7) & 0xF;
8531 i2 = 16*mlevel1[o1] + o2;
8532 if (mlevel2[i2] == 0xFF)
8533 mlevel2[i2] = count3++;
8534 o3 = ch & 0x7F;
8535 i3 = 128*mlevel2[i2] + o3;
8536 mlevel3[i3] = i;
8537 }
8538 return result;
8539 }
8540
8541 static int
encoding_map_lookup(Py_UCS4 c,PyObject * mapping)8542 encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8543 {
8544 struct encoding_map *map = (struct encoding_map*)mapping;
8545 int l1 = c>>11;
8546 int l2 = (c>>7) & 0xF;
8547 int l3 = c & 0x7F;
8548 int i;
8549
8550 if (c > 0xFFFF)
8551 return -1;
8552 if (c == 0)
8553 return 0;
8554 /* level 1*/
8555 i = map->level1[l1];
8556 if (i == 0xFF) {
8557 return -1;
8558 }
8559 /* level 2*/
8560 i = map->level23[16*i+l2];
8561 if (i == 0xFF) {
8562 return -1;
8563 }
8564 /* level 3 */
8565 i = map->level23[16*map->count2 + 128*i + l3];
8566 if (i == 0) {
8567 return -1;
8568 }
8569 return i;
8570 }
8571
8572 /* Lookup the character ch in the mapping. If the character
8573 can't be found, Py_None is returned (or NULL, if another
8574 error occurred). */
8575 static PyObject *
charmapencode_lookup(Py_UCS4 c,PyObject * mapping)8576 charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
8577 {
8578 PyObject *w = PyLong_FromLong((long)c);
8579 PyObject *x;
8580
8581 if (w == NULL)
8582 return NULL;
8583 x = PyObject_GetItem(mapping, w);
8584 Py_DECREF(w);
8585 if (x == NULL) {
8586 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8587 /* No mapping found means: mapping is undefined. */
8588 PyErr_Clear();
8589 Py_RETURN_NONE;
8590 } else
8591 return NULL;
8592 }
8593 else if (x == Py_None)
8594 return x;
8595 else if (PyLong_Check(x)) {
8596 long value = PyLong_AS_LONG(x);
8597 if (value < 0 || value > 255) {
8598 PyErr_SetString(PyExc_TypeError,
8599 "character mapping must be in range(256)");
8600 Py_DECREF(x);
8601 return NULL;
8602 }
8603 return x;
8604 }
8605 else if (PyBytes_Check(x))
8606 return x;
8607 else {
8608 /* wrong return value */
8609 PyErr_Format(PyExc_TypeError,
8610 "character mapping must return integer, bytes or None, not %.400s",
8611 Py_TYPE(x)->tp_name);
8612 Py_DECREF(x);
8613 return NULL;
8614 }
8615 }
8616
8617 static int
charmapencode_resize(PyObject ** outobj,Py_ssize_t * outpos,Py_ssize_t requiredsize)8618 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8619 {
8620 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8621 /* exponentially overallocate to minimize reallocations */
8622 if (requiredsize < 2*outsize)
8623 requiredsize = 2*outsize;
8624 if (_PyBytes_Resize(outobj, requiredsize))
8625 return -1;
8626 return 0;
8627 }
8628
/* Outcome reported by charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* a lookup or resize error occurred */
} charmapencode_result;
8632 /* lookup the character, put the result in the output string and adjust
8633 various state variables. Resize the output bytes object if not enough
8634 space is available. Return a new reference to the object that
8635 was put in the output buffer, or Py_None, if the mapping was undefined
8636 (in which case no character was written) or NULL, if a
8637 reallocation error occurred. The caller must decref the result */
8638 static charmapencode_result
charmapencode_output(Py_UCS4 c,PyObject * mapping,PyObject ** outobj,Py_ssize_t * outpos)8639 charmapencode_output(Py_UCS4 c, PyObject *mapping,
8640 PyObject **outobj, Py_ssize_t *outpos)
8641 {
8642 PyObject *rep;
8643 char *outstart;
8644 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8645
8646 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8647 int res = encoding_map_lookup(c, mapping);
8648 Py_ssize_t requiredsize = *outpos+1;
8649 if (res == -1)
8650 return enc_FAILED;
8651 if (outsize<requiredsize)
8652 if (charmapencode_resize(outobj, outpos, requiredsize))
8653 return enc_EXCEPTION;
8654 outstart = PyBytes_AS_STRING(*outobj);
8655 outstart[(*outpos)++] = (char)res;
8656 return enc_SUCCESS;
8657 }
8658
8659 rep = charmapencode_lookup(c, mapping);
8660 if (rep==NULL)
8661 return enc_EXCEPTION;
8662 else if (rep==Py_None) {
8663 Py_DECREF(rep);
8664 return enc_FAILED;
8665 } else {
8666 if (PyLong_Check(rep)) {
8667 Py_ssize_t requiredsize = *outpos+1;
8668 if (outsize<requiredsize)
8669 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8670 Py_DECREF(rep);
8671 return enc_EXCEPTION;
8672 }
8673 outstart = PyBytes_AS_STRING(*outobj);
8674 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8675 }
8676 else {
8677 const char *repchars = PyBytes_AS_STRING(rep);
8678 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8679 Py_ssize_t requiredsize = *outpos+repsize;
8680 if (outsize<requiredsize)
8681 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8682 Py_DECREF(rep);
8683 return enc_EXCEPTION;
8684 }
8685 outstart = PyBytes_AS_STRING(*outobj);
8686 memcpy(outstart + *outpos, repchars, repsize);
8687 *outpos += repsize;
8688 }
8689 }
8690 Py_DECREF(rep);
8691 return enc_SUCCESS;
8692 }
8693
8694 /* handle an error in PyUnicode_EncodeCharmap
8695 Return 0 on success, -1 on error */
8696 static int
charmap_encoding_error(PyObject * unicode,Py_ssize_t * inpos,PyObject * mapping,PyObject ** exceptionObject,_Py_error_handler * error_handler,PyObject ** error_handler_obj,const char * errors,PyObject ** res,Py_ssize_t * respos)8697 charmap_encoding_error(
8698 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8699 PyObject **exceptionObject,
8700 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8701 PyObject **res, Py_ssize_t *respos)
8702 {
8703 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8704 Py_ssize_t size, repsize;
8705 Py_ssize_t newpos;
8706 enum PyUnicode_Kind kind;
8707 const void *data;
8708 Py_ssize_t index;
8709 /* startpos for collecting unencodable chars */
8710 Py_ssize_t collstartpos = *inpos;
8711 Py_ssize_t collendpos = *inpos+1;
8712 Py_ssize_t collpos;
8713 const char *encoding = "charmap";
8714 const char *reason = "character maps to <undefined>";
8715 charmapencode_result x;
8716 Py_UCS4 ch;
8717 int val;
8718
8719 if (PyUnicode_READY(unicode) == -1)
8720 return -1;
8721 size = PyUnicode_GET_LENGTH(unicode);
8722 /* find all unencodable characters */
8723 while (collendpos < size) {
8724 PyObject *rep;
8725 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8726 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8727 val = encoding_map_lookup(ch, mapping);
8728 if (val != -1)
8729 break;
8730 ++collendpos;
8731 continue;
8732 }
8733
8734 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8735 rep = charmapencode_lookup(ch, mapping);
8736 if (rep==NULL)
8737 return -1;
8738 else if (rep!=Py_None) {
8739 Py_DECREF(rep);
8740 break;
8741 }
8742 Py_DECREF(rep);
8743 ++collendpos;
8744 }
8745 /* cache callback name lookup
8746 * (if not done yet, i.e. it's the first error) */
8747 if (*error_handler == _Py_ERROR_UNKNOWN)
8748 *error_handler = _Py_GetErrorHandler(errors);
8749
8750 switch (*error_handler) {
8751 case _Py_ERROR_STRICT:
8752 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8753 return -1;
8754
8755 case _Py_ERROR_REPLACE:
8756 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8757 x = charmapencode_output('?', mapping, res, respos);
8758 if (x==enc_EXCEPTION) {
8759 return -1;
8760 }
8761 else if (x==enc_FAILED) {
8762 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8763 return -1;
8764 }
8765 }
8766 /* fall through */
8767 case _Py_ERROR_IGNORE:
8768 *inpos = collendpos;
8769 break;
8770
8771 case _Py_ERROR_XMLCHARREFREPLACE:
8772 /* generate replacement (temporarily (mis)uses p) */
8773 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8774 char buffer[2+29+1+1];
8775 char *cp;
8776 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8777 for (cp = buffer; *cp; ++cp) {
8778 x = charmapencode_output(*cp, mapping, res, respos);
8779 if (x==enc_EXCEPTION)
8780 return -1;
8781 else if (x==enc_FAILED) {
8782 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8783 return -1;
8784 }
8785 }
8786 }
8787 *inpos = collendpos;
8788 break;
8789
8790 default:
8791 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8792 encoding, reason, unicode, exceptionObject,
8793 collstartpos, collendpos, &newpos);
8794 if (repunicode == NULL)
8795 return -1;
8796 if (PyBytes_Check(repunicode)) {
8797 /* Directly copy bytes result to output. */
8798 Py_ssize_t outsize = PyBytes_Size(*res);
8799 Py_ssize_t requiredsize;
8800 repsize = PyBytes_Size(repunicode);
8801 requiredsize = *respos + repsize;
8802 if (requiredsize > outsize)
8803 /* Make room for all additional bytes. */
8804 if (charmapencode_resize(res, respos, requiredsize)) {
8805 Py_DECREF(repunicode);
8806 return -1;
8807 }
8808 memcpy(PyBytes_AsString(*res) + *respos,
8809 PyBytes_AsString(repunicode), repsize);
8810 *respos += repsize;
8811 *inpos = newpos;
8812 Py_DECREF(repunicode);
8813 break;
8814 }
8815 /* generate replacement */
8816 if (PyUnicode_READY(repunicode) == -1) {
8817 Py_DECREF(repunicode);
8818 return -1;
8819 }
8820 repsize = PyUnicode_GET_LENGTH(repunicode);
8821 data = PyUnicode_DATA(repunicode);
8822 kind = PyUnicode_KIND(repunicode);
8823 for (index = 0; index < repsize; index++) {
8824 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8825 x = charmapencode_output(repch, mapping, res, respos);
8826 if (x==enc_EXCEPTION) {
8827 Py_DECREF(repunicode);
8828 return -1;
8829 }
8830 else if (x==enc_FAILED) {
8831 Py_DECREF(repunicode);
8832 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8833 return -1;
8834 }
8835 }
8836 *inpos = newpos;
8837 Py_DECREF(repunicode);
8838 }
8839 return 0;
8840 }
8841
8842 PyObject *
_PyUnicode_EncodeCharmap(PyObject * unicode,PyObject * mapping,const char * errors)8843 _PyUnicode_EncodeCharmap(PyObject *unicode,
8844 PyObject *mapping,
8845 const char *errors)
8846 {
8847 /* output object */
8848 PyObject *res = NULL;
8849 /* current input position */
8850 Py_ssize_t inpos = 0;
8851 Py_ssize_t size;
8852 /* current output position */
8853 Py_ssize_t respos = 0;
8854 PyObject *error_handler_obj = NULL;
8855 PyObject *exc = NULL;
8856 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
8857 const void *data;
8858 int kind;
8859
8860 if (PyUnicode_READY(unicode) == -1)
8861 return NULL;
8862 size = PyUnicode_GET_LENGTH(unicode);
8863 data = PyUnicode_DATA(unicode);
8864 kind = PyUnicode_KIND(unicode);
8865
8866 /* Default to Latin-1 */
8867 if (mapping == NULL)
8868 return unicode_encode_ucs1(unicode, errors, 256);
8869
8870 /* allocate enough for a simple encoding without
8871 replacements, if we need more, we'll resize */
8872 res = PyBytes_FromStringAndSize(NULL, size);
8873 if (res == NULL)
8874 goto onError;
8875 if (size == 0)
8876 return res;
8877
8878 while (inpos<size) {
8879 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
8880 /* try to encode it */
8881 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8882 if (x==enc_EXCEPTION) /* error */
8883 goto onError;
8884 if (x==enc_FAILED) { /* unencodable character */
8885 if (charmap_encoding_error(unicode, &inpos, mapping,
8886 &exc,
8887 &error_handler, &error_handler_obj, errors,
8888 &res, &respos)) {
8889 goto onError;
8890 }
8891 }
8892 else
8893 /* done with this character => adjust input position */
8894 ++inpos;
8895 }
8896
8897 /* Resize if we allocated to much */
8898 if (respos<PyBytes_GET_SIZE(res))
8899 if (_PyBytes_Resize(&res, respos) < 0)
8900 goto onError;
8901
8902 Py_XDECREF(exc);
8903 Py_XDECREF(error_handler_obj);
8904 return res;
8905
8906 onError:
8907 Py_XDECREF(res);
8908 Py_XDECREF(exc);
8909 Py_XDECREF(error_handler_obj);
8910 return NULL;
8911 }
8912
8913 PyObject *
PyUnicode_AsCharmapString(PyObject * unicode,PyObject * mapping)8914 PyUnicode_AsCharmapString(PyObject *unicode,
8915 PyObject *mapping)
8916 {
8917 if (!PyUnicode_Check(unicode) || mapping == NULL) {
8918 PyErr_BadArgument();
8919 return NULL;
8920 }
8921 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8922 }
8923
8924 /* create or adjust a UnicodeTranslateError */
8925 static void
make_translate_exception(PyObject ** exceptionObject,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)8926 make_translate_exception(PyObject **exceptionObject,
8927 PyObject *unicode,
8928 Py_ssize_t startpos, Py_ssize_t endpos,
8929 const char *reason)
8930 {
8931 if (*exceptionObject == NULL) {
8932 *exceptionObject = _PyUnicodeTranslateError_Create(
8933 unicode, startpos, endpos, reason);
8934 }
8935 else {
8936 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8937 goto onError;
8938 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8939 goto onError;
8940 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8941 goto onError;
8942 return;
8943 onError:
8944 Py_CLEAR(*exceptionObject);
8945 }
8946 }
8947
8948 /* error handling callback helper:
8949 build arguments, call the callback and check the arguments,
8950 put the result into newpos and return the replacement string, which
8951 has to be freed by the caller */
8952 static PyObject *
unicode_translate_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)8953 unicode_translate_call_errorhandler(const char *errors,
8954 PyObject **errorHandler,
8955 const char *reason,
8956 PyObject *unicode, PyObject **exceptionObject,
8957 Py_ssize_t startpos, Py_ssize_t endpos,
8958 Py_ssize_t *newpos)
8959 {
8960 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
8961
8962 Py_ssize_t i_newpos;
8963 PyObject *restuple;
8964 PyObject *resunicode;
8965
8966 if (*errorHandler == NULL) {
8967 *errorHandler = PyCodec_LookupError(errors);
8968 if (*errorHandler == NULL)
8969 return NULL;
8970 }
8971
8972 make_translate_exception(exceptionObject,
8973 unicode, startpos, endpos, reason);
8974 if (*exceptionObject == NULL)
8975 return NULL;
8976
8977 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
8978 if (restuple == NULL)
8979 return NULL;
8980 if (!PyTuple_Check(restuple)) {
8981 PyErr_SetString(PyExc_TypeError, &argparse[3]);
8982 Py_DECREF(restuple);
8983 return NULL;
8984 }
8985 if (!PyArg_ParseTuple(restuple, argparse,
8986 &resunicode, &i_newpos)) {
8987 Py_DECREF(restuple);
8988 return NULL;
8989 }
8990 if (i_newpos<0)
8991 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8992 else
8993 *newpos = i_newpos;
8994 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8995 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8996 Py_DECREF(restuple);
8997 return NULL;
8998 }
8999 Py_INCREF(resunicode);
9000 Py_DECREF(restuple);
9001 return resunicode;
9002 }
9003
9004 /* Lookup the character ch in the mapping and put the result in result,
9005 which must be decrefed by the caller.
9006 Return 0 on success, -1 on error */
9007 static int
charmaptranslate_lookup(Py_UCS4 c,PyObject * mapping,PyObject ** result)9008 charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
9009 {
9010 PyObject *w = PyLong_FromLong((long)c);
9011 PyObject *x;
9012
9013 if (w == NULL)
9014 return -1;
9015 x = PyObject_GetItem(mapping, w);
9016 Py_DECREF(w);
9017 if (x == NULL) {
9018 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9019 /* No mapping found means: use 1:1 mapping. */
9020 PyErr_Clear();
9021 *result = NULL;
9022 return 0;
9023 } else
9024 return -1;
9025 }
9026 else if (x == Py_None) {
9027 *result = x;
9028 return 0;
9029 }
9030 else if (PyLong_Check(x)) {
9031 long value = PyLong_AS_LONG(x);
9032 if (value < 0 || value > MAX_UNICODE) {
9033 PyErr_Format(PyExc_ValueError,
9034 "character mapping must be in range(0x%x)",
9035 MAX_UNICODE+1);
9036 Py_DECREF(x);
9037 return -1;
9038 }
9039 *result = x;
9040 return 0;
9041 }
9042 else if (PyUnicode_Check(x)) {
9043 *result = x;
9044 return 0;
9045 }
9046 else {
9047 /* wrong return value */
9048 PyErr_SetString(PyExc_TypeError,
9049 "character mapping must return integer, None or str");
9050 Py_DECREF(x);
9051 return -1;
9052 }
9053 }
9054
9055 /* lookup the character, write the result into the writer.
9056 Return 1 if the result was written into the writer, return 0 if the mapping
9057 was undefined, raise an exception return -1 on error. */
9058 static int
charmaptranslate_output(Py_UCS4 ch,PyObject * mapping,_PyUnicodeWriter * writer)9059 charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9060 _PyUnicodeWriter *writer)
9061 {
9062 PyObject *item;
9063
9064 if (charmaptranslate_lookup(ch, mapping, &item))
9065 return -1;
9066
9067 if (item == NULL) {
9068 /* not found => default to 1:1 mapping */
9069 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9070 return -1;
9071 }
9072 return 1;
9073 }
9074
9075 if (item == Py_None) {
9076 Py_DECREF(item);
9077 return 0;
9078 }
9079
9080 if (PyLong_Check(item)) {
9081 long ch = (Py_UCS4)PyLong_AS_LONG(item);
9082 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9083 used it */
9084 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9085 Py_DECREF(item);
9086 return -1;
9087 }
9088 Py_DECREF(item);
9089 return 1;
9090 }
9091
9092 if (!PyUnicode_Check(item)) {
9093 Py_DECREF(item);
9094 return -1;
9095 }
9096
9097 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9098 Py_DECREF(item);
9099 return -1;
9100 }
9101
9102 Py_DECREF(item);
9103 return 1;
9104 }
9105
9106 static int
unicode_fast_translate_lookup(PyObject * mapping,Py_UCS1 ch,Py_UCS1 * translate)9107 unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9108 Py_UCS1 *translate)
9109 {
9110 PyObject *item = NULL;
9111 int ret = 0;
9112
9113 if (charmaptranslate_lookup(ch, mapping, &item)) {
9114 return -1;
9115 }
9116
9117 if (item == Py_None) {
9118 /* deletion */
9119 translate[ch] = 0xfe;
9120 }
9121 else if (item == NULL) {
9122 /* not found => default to 1:1 mapping */
9123 translate[ch] = ch;
9124 return 1;
9125 }
9126 else if (PyLong_Check(item)) {
9127 long replace = PyLong_AS_LONG(item);
9128 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9129 used it */
9130 if (127 < replace) {
9131 /* invalid character or character outside ASCII:
9132 skip the fast translate */
9133 goto exit;
9134 }
9135 translate[ch] = (Py_UCS1)replace;
9136 }
9137 else if (PyUnicode_Check(item)) {
9138 Py_UCS4 replace;
9139
9140 if (PyUnicode_READY(item) == -1) {
9141 Py_DECREF(item);
9142 return -1;
9143 }
9144 if (PyUnicode_GET_LENGTH(item) != 1)
9145 goto exit;
9146
9147 replace = PyUnicode_READ_CHAR(item, 0);
9148 if (replace > 127)
9149 goto exit;
9150 translate[ch] = (Py_UCS1)replace;
9151 }
9152 else {
9153 /* not None, NULL, long or unicode */
9154 goto exit;
9155 }
9156 ret = 1;
9157
9158 exit:
9159 Py_DECREF(item);
9160 return ret;
9161 }
9162
9163 /* Fast path for ascii => ascii translation. Return 1 if the whole string
9164 was translated into writer, return 0 if the input string was partially
9165 translated into writer, raise an exception and return -1 on error. */
9166 static int
unicode_fast_translate(PyObject * input,PyObject * mapping,_PyUnicodeWriter * writer,int ignore,Py_ssize_t * input_pos)9167 unicode_fast_translate(PyObject *input, PyObject *mapping,
9168 _PyUnicodeWriter *writer, int ignore,
9169 Py_ssize_t *input_pos)
9170 {
9171 Py_UCS1 ascii_table[128], ch, ch2;
9172 Py_ssize_t len;
9173 const Py_UCS1 *in, *end;
9174 Py_UCS1 *out;
9175 int res = 0;
9176
9177 len = PyUnicode_GET_LENGTH(input);
9178
9179 memset(ascii_table, 0xff, 128);
9180
9181 in = PyUnicode_1BYTE_DATA(input);
9182 end = in + len;
9183
9184 assert(PyUnicode_IS_ASCII(writer->buffer));
9185 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9186 out = PyUnicode_1BYTE_DATA(writer->buffer);
9187
9188 for (; in < end; in++) {
9189 ch = *in;
9190 ch2 = ascii_table[ch];
9191 if (ch2 == 0xff) {
9192 int translate = unicode_fast_translate_lookup(mapping, ch,
9193 ascii_table);
9194 if (translate < 0)
9195 return -1;
9196 if (translate == 0)
9197 goto exit;
9198 ch2 = ascii_table[ch];
9199 }
9200 if (ch2 == 0xfe) {
9201 if (ignore)
9202 continue;
9203 goto exit;
9204 }
9205 assert(ch2 < 128);
9206 *out = ch2;
9207 out++;
9208 }
9209 res = 1;
9210
9211 exit:
9212 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
9213 *input_pos = in - PyUnicode_1BYTE_DATA(input);
9214 return res;
9215 }
9216
9217 static PyObject *
_PyUnicode_TranslateCharmap(PyObject * input,PyObject * mapping,const char * errors)9218 _PyUnicode_TranslateCharmap(PyObject *input,
9219 PyObject *mapping,
9220 const char *errors)
9221 {
9222 /* input object */
9223 const void *data;
9224 Py_ssize_t size, i;
9225 int kind;
9226 /* output buffer */
9227 _PyUnicodeWriter writer;
9228 /* error handler */
9229 const char *reason = "character maps to <undefined>";
9230 PyObject *errorHandler = NULL;
9231 PyObject *exc = NULL;
9232 int ignore;
9233 int res;
9234
9235 if (mapping == NULL) {
9236 PyErr_BadArgument();
9237 return NULL;
9238 }
9239
9240 if (PyUnicode_READY(input) == -1)
9241 return NULL;
9242 data = PyUnicode_DATA(input);
9243 kind = PyUnicode_KIND(input);
9244 size = PyUnicode_GET_LENGTH(input);
9245
9246 if (size == 0)
9247 return PyUnicode_FromObject(input);
9248
9249 /* allocate enough for a simple 1:1 translation without
9250 replacements, if we need more, we'll resize */
9251 _PyUnicodeWriter_Init(&writer);
9252 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
9253 goto onError;
9254
9255 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9256
9257 if (PyUnicode_READY(input) == -1)
9258 return NULL;
9259 if (PyUnicode_IS_ASCII(input)) {
9260 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9261 if (res < 0) {
9262 _PyUnicodeWriter_Dealloc(&writer);
9263 return NULL;
9264 }
9265 if (res == 1)
9266 return _PyUnicodeWriter_Finish(&writer);
9267 }
9268 else {
9269 i = 0;
9270 }
9271
9272 while (i<size) {
9273 /* try to encode it */
9274 int translate;
9275 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9276 Py_ssize_t newpos;
9277 /* startpos for collecting untranslatable chars */
9278 Py_ssize_t collstart;
9279 Py_ssize_t collend;
9280 Py_UCS4 ch;
9281
9282 ch = PyUnicode_READ(kind, data, i);
9283 translate = charmaptranslate_output(ch, mapping, &writer);
9284 if (translate < 0)
9285 goto onError;
9286
9287 if (translate != 0) {
9288 /* it worked => adjust input pointer */
9289 ++i;
9290 continue;
9291 }
9292
9293 /* untranslatable character */
9294 collstart = i;
9295 collend = i+1;
9296
9297 /* find all untranslatable characters */
9298 while (collend < size) {
9299 PyObject *x;
9300 ch = PyUnicode_READ(kind, data, collend);
9301 if (charmaptranslate_lookup(ch, mapping, &x))
9302 goto onError;
9303 Py_XDECREF(x);
9304 if (x != Py_None)
9305 break;
9306 ++collend;
9307 }
9308
9309 if (ignore) {
9310 i = collend;
9311 }
9312 else {
9313 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9314 reason, input, &exc,
9315 collstart, collend, &newpos);
9316 if (repunicode == NULL)
9317 goto onError;
9318 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
9319 Py_DECREF(repunicode);
9320 goto onError;
9321 }
9322 Py_DECREF(repunicode);
9323 i = newpos;
9324 }
9325 }
9326 Py_XDECREF(exc);
9327 Py_XDECREF(errorHandler);
9328 return _PyUnicodeWriter_Finish(&writer);
9329
9330 onError:
9331 _PyUnicodeWriter_Dealloc(&writer);
9332 Py_XDECREF(exc);
9333 Py_XDECREF(errorHandler);
9334 return NULL;
9335 }
9336
9337 PyObject *
PyUnicode_Translate(PyObject * str,PyObject * mapping,const char * errors)9338 PyUnicode_Translate(PyObject *str,
9339 PyObject *mapping,
9340 const char *errors)
9341 {
9342 if (ensure_unicode(str) < 0)
9343 return NULL;
9344 return _PyUnicode_TranslateCharmap(str, mapping, errors);
9345 }
9346
9347 PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject * unicode)9348 _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9349 {
9350 if (!PyUnicode_Check(unicode)) {
9351 PyErr_BadInternalCall();
9352 return NULL;
9353 }
9354 if (PyUnicode_READY(unicode) == -1)
9355 return NULL;
9356 if (PyUnicode_IS_ASCII(unicode)) {
9357 /* If the string is already ASCII, just return the same string */
9358 Py_INCREF(unicode);
9359 return unicode;
9360 }
9361
9362 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9363 PyObject *result = PyUnicode_New(len, 127);
9364 if (result == NULL) {
9365 return NULL;
9366 }
9367
9368 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9369 int kind = PyUnicode_KIND(unicode);
9370 const void *data = PyUnicode_DATA(unicode);
9371 Py_ssize_t i;
9372 for (i = 0; i < len; ++i) {
9373 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9374 if (ch < 127) {
9375 out[i] = ch;
9376 }
9377 else if (Py_UNICODE_ISSPACE(ch)) {
9378 out[i] = ' ';
9379 }
9380 else {
9381 int decimal = Py_UNICODE_TODECIMAL(ch);
9382 if (decimal < 0) {
9383 out[i] = '?';
9384 out[i+1] = '\0';
9385 _PyUnicode_LENGTH(result) = i + 1;
9386 break;
9387 }
9388 out[i] = '0' + decimal;
9389 }
9390 }
9391
9392 assert(_PyUnicode_CheckConsistency(result, 1));
9393 return result;
9394 }
9395
9396 /* --- Helpers ------------------------------------------------------------ */
9397
/* Helper macro to fix up start/end slice values against a length:
     - `end` greater than `len` is clamped to `len`;
     - a negative `end` or `start` is taken relative to `len` and clamped
       to 0 if it is still negative afterwards.
   `start` is not clamped to `len` from above.

   `start` and `end` must be modifiable lvalues; `len` may be evaluated
   several times.  The body is wrapped in do { } while (0) so that the
   macro expands to exactly one statement and must be followed by a
   semicolon, which makes it safe inside unbraced if/else bodies. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9412
9413 static Py_ssize_t
any_find_slice(PyObject * s1,PyObject * s2,Py_ssize_t start,Py_ssize_t end,int direction)9414 any_find_slice(PyObject* s1, PyObject* s2,
9415 Py_ssize_t start,
9416 Py_ssize_t end,
9417 int direction)
9418 {
9419 int kind1, kind2;
9420 const void *buf1, *buf2;
9421 Py_ssize_t len1, len2, result;
9422
9423 kind1 = PyUnicode_KIND(s1);
9424 kind2 = PyUnicode_KIND(s2);
9425 if (kind1 < kind2)
9426 return -1;
9427
9428 len1 = PyUnicode_GET_LENGTH(s1);
9429 len2 = PyUnicode_GET_LENGTH(s2);
9430 ADJUST_INDICES(start, end, len1);
9431 if (end - start < len2)
9432 return -1;
9433
9434 buf1 = PyUnicode_DATA(s1);
9435 buf2 = PyUnicode_DATA(s2);
9436 if (len2 == 1) {
9437 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9438 result = findchar((const char *)buf1 + kind1*start,
9439 kind1, end - start, ch, direction);
9440 if (result == -1)
9441 return -1;
9442 else
9443 return start + result;
9444 }
9445
9446 if (kind2 != kind1) {
9447 buf2 = unicode_askind(kind2, buf2, len2, kind1);
9448 if (!buf2)
9449 return -2;
9450 }
9451
9452 if (direction > 0) {
9453 switch (kind1) {
9454 case PyUnicode_1BYTE_KIND:
9455 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9456 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9457 else
9458 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9459 break;
9460 case PyUnicode_2BYTE_KIND:
9461 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9462 break;
9463 case PyUnicode_4BYTE_KIND:
9464 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9465 break;
9466 default:
9467 Py_UNREACHABLE();
9468 }
9469 }
9470 else {
9471 switch (kind1) {
9472 case PyUnicode_1BYTE_KIND:
9473 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9474 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9475 else
9476 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9477 break;
9478 case PyUnicode_2BYTE_KIND:
9479 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9480 break;
9481 case PyUnicode_4BYTE_KIND:
9482 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9483 break;
9484 default:
9485 Py_UNREACHABLE();
9486 }
9487 }
9488
9489 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
9490 if (kind2 != kind1)
9491 PyMem_Free((void *)buf2);
9492
9493 return result;
9494 }
9495
9496 /* _PyUnicode_InsertThousandsGrouping() helper functions */
9497 #include "stringlib/localeutil.h"
9498
/**
 * InsertThousandsGrouping:
 * @writer: Unicode writer, or NULL to only count (see the modes below).
 * @n_buffer: Offset of the initial output position: relative to
 *            @writer->pos when filling, used as-is when counting.
 * @digits: String the digits are read from.  Must be non-NULL when
 *          filling and NULL when counting.
 * @d_pos: Start of digits string.
 * @n_digits: The number of digits in the string, in which we want
 *            to put the grouping chars.
 * @min_width: The minimum width of the digits in the output string.
 *             Output will be zero-padded on the left to fill.
 *             Negative values are treated as 0.
 * @grouping: see definition in localeconv().
 * @thousands_sep: see definition in localeconv().
 * @maxchar: Must be NULL when filling.  When counting it must be
 *           non-NULL; it is initialised to 127 here and then handed to
 *           InsertThousandsGrouping_fill().
 *
 * There are 2 modes: counting and filling. If @writer is NULL,
 * we are in counting mode, else filling mode.
 * If counting, the required buffer size is returned.
 * If filling, we know the buffer will be large enough, so we don't
 * need to pass in the buffer size.
 * Inserts thousand grouping characters (as defined by grouping and
 * thousands_sep) into @writer.
 *
 * Return value: -1 on error, number of characters otherwise.
 **/
Py_ssize_t
_PyUnicode_InsertThousandsGrouping(
    _PyUnicodeWriter *writer,
    Py_ssize_t n_buffer,
    PyObject *digits,
    Py_ssize_t d_pos,
    Py_ssize_t n_digits,
    Py_ssize_t min_width,
    const char *grouping,
    PyObject *thousands_sep,
    Py_UCS4 *maxchar)
{
    min_width = Py_MAX(0, min_width);
    /* The mode fixes which of digits/maxchar may be supplied. */
    if (writer) {
        assert(digits != NULL);
        assert(maxchar == NULL);
    }
    else {
        assert(digits == NULL);
        assert(maxchar != NULL);
    }
    assert(0 <= d_pos);
    assert(0 <= n_digits);
    assert(grouping != NULL);

    if (digits != NULL) {
        if (PyUnicode_READY(digits) == -1) {
            return -1;
        }
    }
    if (PyUnicode_READY(thousands_sep) == -1) {
        return -1;
    }

    Py_ssize_t count = 0;
    Py_ssize_t n_zeros;
    int loop_broken = 0;
    int use_separator = 0; /* First time through, don't append the
                              separator. They only go between
                              groups. */
    Py_ssize_t buffer_pos;
    Py_ssize_t digits_pos;
    Py_ssize_t len;
    Py_ssize_t n_chars;
    Py_ssize_t remaining = n_digits; /* Number of chars remaining to
                                        be looked at */
    /* A generator that returns all of the grouping widths, until it
       returns 0. */
    GroupGenerator groupgen;
    GroupGenerator_init(&groupgen, grouping);
    const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);

    /* if digits are not grouped, thousands separator
       should be an empty string */
    assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));

    /* Both positions start at the END of their ranges.
       NOTE(review): presumably InsertThousandsGrouping_fill() works
       right-to-left and moves them backwards -- confirm in
       stringlib/localeutil.h. */
    digits_pos = d_pos + n_digits;
    if (writer) {
        buffer_pos = writer->pos + n_buffer;
        assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
        assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
    }
    else {
        buffer_pos = n_buffer;
    }

    if (!writer) {
        *maxchar = 127;
    }

    /* One iteration per group width produced by the generator. */
    while ((len = GroupGenerator_next(&groupgen)) > 0) {
        /* Never use a group wider than what is still needed, but always
           emit at least one character. */
        len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
        n_zeros = Py_MAX(0, len - remaining);
        n_chars = Py_MAX(0, Py_MIN(remaining, len));

        /* Use n_zero zero's and n_chars chars */

        /* Account for this group (and its separator) in the total. */
        count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;

        /* Copy into the writer. */
        InsertThousandsGrouping_fill(writer, &buffer_pos,
                                     digits, &digits_pos,
                                     n_chars, n_zeros,
                                     use_separator ? thousands_sep : NULL,
                                     thousands_sep_len, maxchar);

        /* Use a separator next time. */
        use_separator = 1;

        remaining -= n_chars;
        min_width -= len;

        /* Stop once all digits are placed and the minimum width is met. */
        if (remaining <= 0 && min_width <= 0) {
            loop_broken = 1;
            break;
        }
        /* The separator written before the next group counts towards the
           minimum width as well. */
        min_width -= thousands_sep_len;
    }
    if (!loop_broken) {
        /* We left the loop without using a break statement: the generator
           stopped producing widths, so everything that is left goes into
           one final group. */

        len = Py_MAX(Py_MAX(remaining, min_width), 1);
        n_zeros = Py_MAX(0, len - remaining);
        n_chars = Py_MAX(0, Py_MIN(remaining, len));

        /* Use n_zero zero's and n_chars chars */
        count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;

        /* Copy into the writer. */
        InsertThousandsGrouping_fill(writer, &buffer_pos,
                                     digits, &digits_pos,
                                     n_chars, n_zeros,
                                     use_separator ? thousands_sep : NULL,
                                     thousands_sep_len, maxchar);
    }
    return count;
}
9640
9641
9642 Py_ssize_t
PyUnicode_Count(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end)9643 PyUnicode_Count(PyObject *str,
9644 PyObject *substr,
9645 Py_ssize_t start,
9646 Py_ssize_t end)
9647 {
9648 Py_ssize_t result;
9649 int kind1, kind2;
9650 const void *buf1 = NULL, *buf2 = NULL;
9651 Py_ssize_t len1, len2;
9652
9653 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9654 return -1;
9655
9656 kind1 = PyUnicode_KIND(str);
9657 kind2 = PyUnicode_KIND(substr);
9658 if (kind1 < kind2)
9659 return 0;
9660
9661 len1 = PyUnicode_GET_LENGTH(str);
9662 len2 = PyUnicode_GET_LENGTH(substr);
9663 ADJUST_INDICES(start, end, len1);
9664 if (end - start < len2)
9665 return 0;
9666
9667 buf1 = PyUnicode_DATA(str);
9668 buf2 = PyUnicode_DATA(substr);
9669 if (kind2 != kind1) {
9670 buf2 = unicode_askind(kind2, buf2, len2, kind1);
9671 if (!buf2)
9672 goto onError;
9673 }
9674
9675 switch (kind1) {
9676 case PyUnicode_1BYTE_KIND:
9677 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
9678 result = asciilib_count(
9679 ((const Py_UCS1*)buf1) + start, end - start,
9680 buf2, len2, PY_SSIZE_T_MAX
9681 );
9682 else
9683 result = ucs1lib_count(
9684 ((const Py_UCS1*)buf1) + start, end - start,
9685 buf2, len2, PY_SSIZE_T_MAX
9686 );
9687 break;
9688 case PyUnicode_2BYTE_KIND:
9689 result = ucs2lib_count(
9690 ((const Py_UCS2*)buf1) + start, end - start,
9691 buf2, len2, PY_SSIZE_T_MAX
9692 );
9693 break;
9694 case PyUnicode_4BYTE_KIND:
9695 result = ucs4lib_count(
9696 ((const Py_UCS4*)buf1) + start, end - start,
9697 buf2, len2, PY_SSIZE_T_MAX
9698 );
9699 break;
9700 default:
9701 Py_UNREACHABLE();
9702 }
9703
9704 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9705 if (kind2 != kind1)
9706 PyMem_Free((void *)buf2);
9707
9708 return result;
9709 onError:
9710 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9711 if (kind2 != kind1)
9712 PyMem_Free((void *)buf2);
9713 return -1;
9714 }
9715
9716 Py_ssize_t
PyUnicode_Find(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9717 PyUnicode_Find(PyObject *str,
9718 PyObject *substr,
9719 Py_ssize_t start,
9720 Py_ssize_t end,
9721 int direction)
9722 {
9723 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9724 return -2;
9725
9726 return any_find_slice(str, substr, start, end, direction);
9727 }
9728
9729 Py_ssize_t
PyUnicode_FindChar(PyObject * str,Py_UCS4 ch,Py_ssize_t start,Py_ssize_t end,int direction)9730 PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9731 Py_ssize_t start, Py_ssize_t end,
9732 int direction)
9733 {
9734 int kind;
9735 Py_ssize_t len, result;
9736 if (PyUnicode_READY(str) == -1)
9737 return -2;
9738 len = PyUnicode_GET_LENGTH(str);
9739 ADJUST_INDICES(start, end, len);
9740 if (end - start < 1)
9741 return -1;
9742 kind = PyUnicode_KIND(str);
9743 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9744 kind, end-start, ch, direction);
9745 if (result == -1)
9746 return -1;
9747 else
9748 return start + result;
9749 }
9750
9751 static int
tailmatch(PyObject * self,PyObject * substring,Py_ssize_t start,Py_ssize_t end,int direction)9752 tailmatch(PyObject *self,
9753 PyObject *substring,
9754 Py_ssize_t start,
9755 Py_ssize_t end,
9756 int direction)
9757 {
9758 int kind_self;
9759 int kind_sub;
9760 const void *data_self;
9761 const void *data_sub;
9762 Py_ssize_t offset;
9763 Py_ssize_t i;
9764 Py_ssize_t end_sub;
9765
9766 if (PyUnicode_READY(self) == -1 ||
9767 PyUnicode_READY(substring) == -1)
9768 return -1;
9769
9770 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9771 end -= PyUnicode_GET_LENGTH(substring);
9772 if (end < start)
9773 return 0;
9774
9775 if (PyUnicode_GET_LENGTH(substring) == 0)
9776 return 1;
9777
9778 kind_self = PyUnicode_KIND(self);
9779 data_self = PyUnicode_DATA(self);
9780 kind_sub = PyUnicode_KIND(substring);
9781 data_sub = PyUnicode_DATA(substring);
9782 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9783
9784 if (direction > 0)
9785 offset = end;
9786 else
9787 offset = start;
9788
9789 if (PyUnicode_READ(kind_self, data_self, offset) ==
9790 PyUnicode_READ(kind_sub, data_sub, 0) &&
9791 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9792 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9793 /* If both are of the same kind, memcmp is sufficient */
9794 if (kind_self == kind_sub) {
9795 return ! memcmp((char *)data_self +
9796 (offset * PyUnicode_KIND(substring)),
9797 data_sub,
9798 PyUnicode_GET_LENGTH(substring) *
9799 PyUnicode_KIND(substring));
9800 }
9801 /* otherwise we have to compare each character by first accessing it */
9802 else {
9803 /* We do not need to compare 0 and len(substring)-1 because
9804 the if statement above ensured already that they are equal
9805 when we end up here. */
9806 for (i = 1; i < end_sub; ++i) {
9807 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9808 PyUnicode_READ(kind_sub, data_sub, i))
9809 return 0;
9810 }
9811 return 1;
9812 }
9813 }
9814
9815 return 0;
9816 }
9817
9818 Py_ssize_t
PyUnicode_Tailmatch(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9819 PyUnicode_Tailmatch(PyObject *str,
9820 PyObject *substr,
9821 Py_ssize_t start,
9822 Py_ssize_t end,
9823 int direction)
9824 {
9825 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9826 return -1;
9827
9828 return tailmatch(str, substr, start, end, direction);
9829 }
9830
9831 static PyObject *
ascii_upper_or_lower(PyObject * self,int lower)9832 ascii_upper_or_lower(PyObject *self, int lower)
9833 {
9834 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9835 const char *data = PyUnicode_DATA(self);
9836 char *resdata;
9837 PyObject *res;
9838
9839 res = PyUnicode_New(len, 127);
9840 if (res == NULL)
9841 return NULL;
9842 resdata = PyUnicode_DATA(res);
9843 if (lower)
9844 _Py_bytes_lower(resdata, data, len);
9845 else
9846 _Py_bytes_upper(resdata, data, len);
9847 return res;
9848 }
9849
9850 static Py_UCS4
handle_capital_sigma(int kind,const void * data,Py_ssize_t length,Py_ssize_t i)9851 handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
9852 {
9853 Py_ssize_t j;
9854 int final_sigma;
9855 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
9856 /* U+03A3 is in the Final_Sigma context when, it is found like this:
9857
9858 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9859
9860 where ! is a negation and \p{xxx} is a character with property xxx.
9861 */
9862 for (j = i - 1; j >= 0; j--) {
9863 c = PyUnicode_READ(kind, data, j);
9864 if (!_PyUnicode_IsCaseIgnorable(c))
9865 break;
9866 }
9867 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9868 if (final_sigma) {
9869 for (j = i + 1; j < length; j++) {
9870 c = PyUnicode_READ(kind, data, j);
9871 if (!_PyUnicode_IsCaseIgnorable(c))
9872 break;
9873 }
9874 final_sigma = j == length || !_PyUnicode_IsCased(c);
9875 }
9876 return (final_sigma) ? 0x3C2 : 0x3C3;
9877 }
9878
9879 static int
lower_ucs4(int kind,const void * data,Py_ssize_t length,Py_ssize_t i,Py_UCS4 c,Py_UCS4 * mapped)9880 lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
9881 Py_UCS4 c, Py_UCS4 *mapped)
9882 {
9883 /* Obscure special case. */
9884 if (c == 0x3A3) {
9885 mapped[0] = handle_capital_sigma(kind, data, length, i);
9886 return 1;
9887 }
9888 return _PyUnicode_ToLowerFull(c, mapped);
9889 }
9890
9891 static Py_ssize_t
do_capitalize(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9892 do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9893 {
9894 Py_ssize_t i, k = 0;
9895 int n_res, j;
9896 Py_UCS4 c, mapped[3];
9897
9898 c = PyUnicode_READ(kind, data, 0);
9899 n_res = _PyUnicode_ToTitleFull(c, mapped);
9900 for (j = 0; j < n_res; j++) {
9901 *maxchar = Py_MAX(*maxchar, mapped[j]);
9902 res[k++] = mapped[j];
9903 }
9904 for (i = 1; i < length; i++) {
9905 c = PyUnicode_READ(kind, data, i);
9906 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9907 for (j = 0; j < n_res; j++) {
9908 *maxchar = Py_MAX(*maxchar, mapped[j]);
9909 res[k++] = mapped[j];
9910 }
9911 }
9912 return k;
9913 }
9914
9915 static Py_ssize_t
do_swapcase(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9916 do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9917 Py_ssize_t i, k = 0;
9918
9919 for (i = 0; i < length; i++) {
9920 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9921 int n_res, j;
9922 if (Py_UNICODE_ISUPPER(c)) {
9923 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9924 }
9925 else if (Py_UNICODE_ISLOWER(c)) {
9926 n_res = _PyUnicode_ToUpperFull(c, mapped);
9927 }
9928 else {
9929 n_res = 1;
9930 mapped[0] = c;
9931 }
9932 for (j = 0; j < n_res; j++) {
9933 *maxchar = Py_MAX(*maxchar, mapped[j]);
9934 res[k++] = mapped[j];
9935 }
9936 }
9937 return k;
9938 }
9939
9940 static Py_ssize_t
do_upper_or_lower(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar,int lower)9941 do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
9942 Py_UCS4 *maxchar, int lower)
9943 {
9944 Py_ssize_t i, k = 0;
9945
9946 for (i = 0; i < length; i++) {
9947 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9948 int n_res, j;
9949 if (lower)
9950 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9951 else
9952 n_res = _PyUnicode_ToUpperFull(c, mapped);
9953 for (j = 0; j < n_res; j++) {
9954 *maxchar = Py_MAX(*maxchar, mapped[j]);
9955 res[k++] = mapped[j];
9956 }
9957 }
9958 return k;
9959 }
9960
9961 static Py_ssize_t
do_upper(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9962 do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9963 {
9964 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9965 }
9966
9967 static Py_ssize_t
do_lower(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9968 do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9969 {
9970 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9971 }
9972
9973 static Py_ssize_t
do_casefold(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9974 do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9975 {
9976 Py_ssize_t i, k = 0;
9977
9978 for (i = 0; i < length; i++) {
9979 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9980 Py_UCS4 mapped[3];
9981 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9982 for (j = 0; j < n_res; j++) {
9983 *maxchar = Py_MAX(*maxchar, mapped[j]);
9984 res[k++] = mapped[j];
9985 }
9986 }
9987 return k;
9988 }
9989
9990 static Py_ssize_t
do_title(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9991 do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9992 {
9993 Py_ssize_t i, k = 0;
9994 int previous_is_cased;
9995
9996 previous_is_cased = 0;
9997 for (i = 0; i < length; i++) {
9998 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9999 Py_UCS4 mapped[3];
10000 int n_res, j;
10001
10002 if (previous_is_cased)
10003 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10004 else
10005 n_res = _PyUnicode_ToTitleFull(c, mapped);
10006
10007 for (j = 0; j < n_res; j++) {
10008 *maxchar = Py_MAX(*maxchar, mapped[j]);
10009 res[k++] = mapped[j];
10010 }
10011
10012 previous_is_cased = _PyUnicode_IsCased(c);
10013 }
10014 return k;
10015 }
10016
10017 static PyObject *
case_operation(PyObject * self,Py_ssize_t (* perform)(int,const void *,Py_ssize_t,Py_UCS4 *,Py_UCS4 *))10018 case_operation(PyObject *self,
10019 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
10020 {
10021 PyObject *res = NULL;
10022 Py_ssize_t length, newlength = 0;
10023 int kind, outkind;
10024 const void *data;
10025 void *outdata;
10026 Py_UCS4 maxchar = 0, *tmp, *tmpend;
10027
10028 assert(PyUnicode_IS_READY(self));
10029
10030 kind = PyUnicode_KIND(self);
10031 data = PyUnicode_DATA(self);
10032 length = PyUnicode_GET_LENGTH(self);
10033 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
10034 PyErr_SetString(PyExc_OverflowError, "string is too long");
10035 return NULL;
10036 }
10037 tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
10038 if (tmp == NULL)
10039 return PyErr_NoMemory();
10040 newlength = perform(kind, data, length, tmp, &maxchar);
10041 res = PyUnicode_New(newlength, maxchar);
10042 if (res == NULL)
10043 goto leave;
10044 tmpend = tmp + newlength;
10045 outdata = PyUnicode_DATA(res);
10046 outkind = PyUnicode_KIND(res);
10047 switch (outkind) {
10048 case PyUnicode_1BYTE_KIND:
10049 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10050 break;
10051 case PyUnicode_2BYTE_KIND:
10052 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10053 break;
10054 case PyUnicode_4BYTE_KIND:
10055 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10056 break;
10057 default:
10058 Py_UNREACHABLE();
10059 }
10060 leave:
10061 PyMem_Free(tmp);
10062 return res;
10063 }
10064
10065 PyObject *
PyUnicode_Join(PyObject * separator,PyObject * seq)10066 PyUnicode_Join(PyObject *separator, PyObject *seq)
10067 {
10068 PyObject *res;
10069 PyObject *fseq;
10070 Py_ssize_t seqlen;
10071 PyObject **items;
10072
10073 fseq = PySequence_Fast(seq, "can only join an iterable");
10074 if (fseq == NULL) {
10075 return NULL;
10076 }
10077
10078 /* NOTE: the following code can't call back into Python code,
10079 * so we are sure that fseq won't be mutated.
10080 */
10081
10082 items = PySequence_Fast_ITEMS(fseq);
10083 seqlen = PySequence_Fast_GET_SIZE(fseq);
10084 res = _PyUnicode_JoinArray(separator, items, seqlen);
10085 Py_DECREF(fseq);
10086 return res;
10087 }
10088
/* Join the `seqlen` objects in `items` into one string, inserting
   `separator` between consecutive items.

   separator: the separator string, or NULL to use a single space.  It is
              not looked at when seqlen is 0 or 1.
   items:     array of objects; each one must be a str (or subclass).
   seqlen:    number of entries in `items`.

   Returns a new reference, or NULL with an exception set:
     - TypeError if the separator or any item is not a str,
     - OverflowError if the total length would exceed PY_SSIZE_T_MAX,
     - whatever PyUnicode_READY(), PyUnicode_FromOrdinal() or
       PyUnicode_New() raised.

   The work is done in two passes: the first computes the result length and
   maximum character (and decides whether a raw memcpy is possible), the
   second copies separator and items into the freshly allocated result. */
PyObject *
_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
{
    PyObject *res = NULL; /* the result */
    PyObject *sep = NULL; /* owned reference to the separator in use */
    Py_ssize_t seplen;
    PyObject *item;
    Py_ssize_t sz, i, res_offset;
    Py_UCS4 maxchar;
    Py_UCS4 item_maxchar;
    int use_memcpy;
    unsigned char *res_data = NULL, *sep_data = NULL;
    PyObject *last_obj; /* previous object seen by the kind comparison */
    unsigned int kind = 0;

    /* If empty sequence, return u"". */
    if (seqlen == 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* If singleton sequence with an exact Unicode, return that. */
    last_obj = NULL;
    if (seqlen == 1) {
        if (PyUnicode_CheckExact(items[0])) {
            res = items[0];
            Py_INCREF(res);
            return res;
        }
        /* A single non-exact item: no separator is needed, and `sep`
           stays NULL. */
        seplen = 0;
        maxchar = 0;
    }
    else {
        /* Set up sep and seplen */
        if (separator == NULL) {
            /* fall back to a blank space separator */
            sep = PyUnicode_FromOrdinal(' ');
            if (!sep)
                goto onError;
            seplen = 1;
            maxchar = 32;
        }
        else {
            if (!PyUnicode_Check(separator)) {
                PyErr_Format(PyExc_TypeError,
                             "separator: expected str instance,"
                             " %.80s found",
                             Py_TYPE(separator)->tp_name);
                goto onError;
            }
            if (PyUnicode_READY(separator))
                goto onError;
            sep = separator;
            seplen = PyUnicode_GET_LENGTH(separator);
            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
            /* inc refcount to keep this code path symmetric with the
               above case of a blank separator */
            Py_INCREF(sep);
        }
        /* The separator takes part in the kind comparison below. */
        last_obj = sep;
    }

    /* There are at least two things to join, or else we have a subclass
     * of str in the sequence.
     * Do a pre-pass to figure out the total amount of space we'll
     * need (sz), and see whether all arguments are strings.
     */
    sz = 0;
#ifdef Py_DEBUG
    /* Debug builds never take the memcpy path. */
    use_memcpy = 0;
#else
    use_memcpy = 1;
#endif
    for (i = 0; i < seqlen; i++) {
        size_t add_sz;
        item = items[i];
        if (!PyUnicode_Check(item)) {
            PyErr_Format(PyExc_TypeError,
                         "sequence item %zd: expected str instance,"
                         " %.80s found",
                         i, Py_TYPE(item)->tp_name);
            goto onError;
        }
        if (PyUnicode_READY(item) == -1)
            goto onError;
        add_sz = PyUnicode_GET_LENGTH(item);
        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
        maxchar = Py_MAX(maxchar, item_maxchar);
        /* Every item except the first is preceded by a separator. */
        if (i != 0) {
            add_sz += seplen;
        }
        /* Check before adding so that sz itself never overflows. */
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
            PyErr_SetString(PyExc_OverflowError,
                            "join() result is too long for a Python string");
            goto onError;
        }
        sz += add_sz;
        /* memcpy is only usable while every object seen so far has the
           same kind as its predecessor. */
        if (use_memcpy && last_obj != NULL) {
            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
                use_memcpy = 0;
        }
        last_obj = item;
    }

    res = PyUnicode_New(sz, maxchar);
    if (res == NULL)
        goto onError;

    /* Catenate everything. */
#ifdef Py_DEBUG
    use_memcpy = 0;
#else
    if (use_memcpy) {
        res_data = PyUnicode_1BYTE_DATA(res);
        kind = PyUnicode_KIND(res);
        if (seplen != 0)
            sep_data = PyUnicode_1BYTE_DATA(sep);
    }
#endif
    if (use_memcpy) {
        /* Fast path: res_data is a byte cursor into the result; `kind` is
           used as the number of bytes per character. */
        for (i = 0; i < seqlen; ++i) {
            Py_ssize_t itemlen;
            item = items[i];

            /* Copy item, and maybe the separator. */
            if (i && seplen != 0) {
                memcpy(res_data,
                          sep_data,
                          kind * seplen);
                res_data += kind * seplen;
            }

            itemlen = PyUnicode_GET_LENGTH(item);
            if (itemlen != 0) {
                memcpy(res_data,
                          PyUnicode_DATA(item),
                          kind * itemlen);
                res_data += kind * itemlen;
            }
        }
        assert(res_data == PyUnicode_1BYTE_DATA(res)
                           + kind * PyUnicode_GET_LENGTH(res));
    }
    else {
        /* General path: res_offset is a character index into the result
           and the copies go through _PyUnicode_FastCopyCharacters(). */
        for (i = 0, res_offset = 0; i < seqlen; ++i) {
            Py_ssize_t itemlen;
            item = items[i];

            /* Copy item, and maybe the separator. */
            if (i && seplen != 0) {
                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
                res_offset += seplen;
            }

            itemlen = PyUnicode_GET_LENGTH(item);
            if (itemlen != 0) {
                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
                res_offset += itemlen;
            }
        }
        assert(res_offset == PyUnicode_GET_LENGTH(res));
    }

    /* Drop the reference taken (or created) for the separator; sep is NULL
       in the single-item case. */
    Py_XDECREF(sep);
    assert(_PyUnicode_CheckConsistency(res, 1));
    return res;

  onError:
    Py_XDECREF(sep);
    Py_XDECREF(res);
    return NULL;
}
10260
10261 void
_PyUnicode_FastFill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10262 _PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10263 Py_UCS4 fill_char)
10264 {
10265 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10266 void *data = PyUnicode_DATA(unicode);
10267 assert(PyUnicode_IS_READY(unicode));
10268 assert(unicode_modifiable(unicode));
10269 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10270 assert(start >= 0);
10271 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10272 unicode_fill(kind, data, fill_char, start, length);
10273 }
10274
10275 Py_ssize_t
PyUnicode_Fill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10276 PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10277 Py_UCS4 fill_char)
10278 {
10279 Py_ssize_t maxlen;
10280
10281 if (!PyUnicode_Check(unicode)) {
10282 PyErr_BadInternalCall();
10283 return -1;
10284 }
10285 if (PyUnicode_READY(unicode) == -1)
10286 return -1;
10287 if (unicode_check_modifiable(unicode))
10288 return -1;
10289
10290 if (start < 0) {
10291 PyErr_SetString(PyExc_IndexError, "string index out of range");
10292 return -1;
10293 }
10294 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10295 PyErr_SetString(PyExc_ValueError,
10296 "fill character is bigger than "
10297 "the string maximum character");
10298 return -1;
10299 }
10300
10301 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10302 length = Py_MIN(maxlen, length);
10303 if (length <= 0)
10304 return 0;
10305
10306 _PyUnicode_FastFill(unicode, start, length, fill_char);
10307 return length;
10308 }
10309
10310 static PyObject *
pad(PyObject * self,Py_ssize_t left,Py_ssize_t right,Py_UCS4 fill)10311 pad(PyObject *self,
10312 Py_ssize_t left,
10313 Py_ssize_t right,
10314 Py_UCS4 fill)
10315 {
10316 PyObject *u;
10317 Py_UCS4 maxchar;
10318 int kind;
10319 void *data;
10320
10321 if (left < 0)
10322 left = 0;
10323 if (right < 0)
10324 right = 0;
10325
10326 if (left == 0 && right == 0)
10327 return unicode_result_unchanged(self);
10328
10329 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10330 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10331 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10332 return NULL;
10333 }
10334 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10335 maxchar = Py_MAX(maxchar, fill);
10336 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10337 if (!u)
10338 return NULL;
10339
10340 kind = PyUnicode_KIND(u);
10341 data = PyUnicode_DATA(u);
10342 if (left)
10343 unicode_fill(kind, data, fill, 0, left);
10344 if (right)
10345 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
10346 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10347 assert(_PyUnicode_CheckConsistency(u, 1));
10348 return u;
10349 }
10350
10351 PyObject *
PyUnicode_Splitlines(PyObject * string,int keepends)10352 PyUnicode_Splitlines(PyObject *string, int keepends)
10353 {
10354 PyObject *list;
10355
10356 if (ensure_unicode(string) < 0)
10357 return NULL;
10358
10359 switch (PyUnicode_KIND(string)) {
10360 case PyUnicode_1BYTE_KIND:
10361 if (PyUnicode_IS_ASCII(string))
10362 list = asciilib_splitlines(
10363 string, PyUnicode_1BYTE_DATA(string),
10364 PyUnicode_GET_LENGTH(string), keepends);
10365 else
10366 list = ucs1lib_splitlines(
10367 string, PyUnicode_1BYTE_DATA(string),
10368 PyUnicode_GET_LENGTH(string), keepends);
10369 break;
10370 case PyUnicode_2BYTE_KIND:
10371 list = ucs2lib_splitlines(
10372 string, PyUnicode_2BYTE_DATA(string),
10373 PyUnicode_GET_LENGTH(string), keepends);
10374 break;
10375 case PyUnicode_4BYTE_KIND:
10376 list = ucs4lib_splitlines(
10377 string, PyUnicode_4BYTE_DATA(string),
10378 PyUnicode_GET_LENGTH(string), keepends);
10379 break;
10380 default:
10381 Py_UNREACHABLE();
10382 }
10383 return list;
10384 }
10385
10386 static PyObject *
split(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10387 split(PyObject *self,
10388 PyObject *substring,
10389 Py_ssize_t maxcount)
10390 {
10391 int kind1, kind2;
10392 const void *buf1, *buf2;
10393 Py_ssize_t len1, len2;
10394 PyObject* out;
10395
10396 if (maxcount < 0)
10397 maxcount = PY_SSIZE_T_MAX;
10398
10399 if (PyUnicode_READY(self) == -1)
10400 return NULL;
10401
10402 if (substring == NULL)
10403 switch (PyUnicode_KIND(self)) {
10404 case PyUnicode_1BYTE_KIND:
10405 if (PyUnicode_IS_ASCII(self))
10406 return asciilib_split_whitespace(
10407 self, PyUnicode_1BYTE_DATA(self),
10408 PyUnicode_GET_LENGTH(self), maxcount
10409 );
10410 else
10411 return ucs1lib_split_whitespace(
10412 self, PyUnicode_1BYTE_DATA(self),
10413 PyUnicode_GET_LENGTH(self), maxcount
10414 );
10415 case PyUnicode_2BYTE_KIND:
10416 return ucs2lib_split_whitespace(
10417 self, PyUnicode_2BYTE_DATA(self),
10418 PyUnicode_GET_LENGTH(self), maxcount
10419 );
10420 case PyUnicode_4BYTE_KIND:
10421 return ucs4lib_split_whitespace(
10422 self, PyUnicode_4BYTE_DATA(self),
10423 PyUnicode_GET_LENGTH(self), maxcount
10424 );
10425 default:
10426 Py_UNREACHABLE();
10427 }
10428
10429 if (PyUnicode_READY(substring) == -1)
10430 return NULL;
10431
10432 kind1 = PyUnicode_KIND(self);
10433 kind2 = PyUnicode_KIND(substring);
10434 len1 = PyUnicode_GET_LENGTH(self);
10435 len2 = PyUnicode_GET_LENGTH(substring);
10436 if (kind1 < kind2 || len1 < len2) {
10437 out = PyList_New(1);
10438 if (out == NULL)
10439 return NULL;
10440 Py_INCREF(self);
10441 PyList_SET_ITEM(out, 0, self);
10442 return out;
10443 }
10444 buf1 = PyUnicode_DATA(self);
10445 buf2 = PyUnicode_DATA(substring);
10446 if (kind2 != kind1) {
10447 buf2 = unicode_askind(kind2, buf2, len2, kind1);
10448 if (!buf2)
10449 return NULL;
10450 }
10451
10452 switch (kind1) {
10453 case PyUnicode_1BYTE_KIND:
10454 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10455 out = asciilib_split(
10456 self, buf1, len1, buf2, len2, maxcount);
10457 else
10458 out = ucs1lib_split(
10459 self, buf1, len1, buf2, len2, maxcount);
10460 break;
10461 case PyUnicode_2BYTE_KIND:
10462 out = ucs2lib_split(
10463 self, buf1, len1, buf2, len2, maxcount);
10464 break;
10465 case PyUnicode_4BYTE_KIND:
10466 out = ucs4lib_split(
10467 self, buf1, len1, buf2, len2, maxcount);
10468 break;
10469 default:
10470 out = NULL;
10471 }
10472 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10473 if (kind2 != kind1)
10474 PyMem_Free((void *)buf2);
10475 return out;
10476 }
10477
10478 static PyObject *
rsplit(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10479 rsplit(PyObject *self,
10480 PyObject *substring,
10481 Py_ssize_t maxcount)
10482 {
10483 int kind1, kind2;
10484 const void *buf1, *buf2;
10485 Py_ssize_t len1, len2;
10486 PyObject* out;
10487
10488 if (maxcount < 0)
10489 maxcount = PY_SSIZE_T_MAX;
10490
10491 if (PyUnicode_READY(self) == -1)
10492 return NULL;
10493
10494 if (substring == NULL)
10495 switch (PyUnicode_KIND(self)) {
10496 case PyUnicode_1BYTE_KIND:
10497 if (PyUnicode_IS_ASCII(self))
10498 return asciilib_rsplit_whitespace(
10499 self, PyUnicode_1BYTE_DATA(self),
10500 PyUnicode_GET_LENGTH(self), maxcount
10501 );
10502 else
10503 return ucs1lib_rsplit_whitespace(
10504 self, PyUnicode_1BYTE_DATA(self),
10505 PyUnicode_GET_LENGTH(self), maxcount
10506 );
10507 case PyUnicode_2BYTE_KIND:
10508 return ucs2lib_rsplit_whitespace(
10509 self, PyUnicode_2BYTE_DATA(self),
10510 PyUnicode_GET_LENGTH(self), maxcount
10511 );
10512 case PyUnicode_4BYTE_KIND:
10513 return ucs4lib_rsplit_whitespace(
10514 self, PyUnicode_4BYTE_DATA(self),
10515 PyUnicode_GET_LENGTH(self), maxcount
10516 );
10517 default:
10518 Py_UNREACHABLE();
10519 }
10520
10521 if (PyUnicode_READY(substring) == -1)
10522 return NULL;
10523
10524 kind1 = PyUnicode_KIND(self);
10525 kind2 = PyUnicode_KIND(substring);
10526 len1 = PyUnicode_GET_LENGTH(self);
10527 len2 = PyUnicode_GET_LENGTH(substring);
10528 if (kind1 < kind2 || len1 < len2) {
10529 out = PyList_New(1);
10530 if (out == NULL)
10531 return NULL;
10532 Py_INCREF(self);
10533 PyList_SET_ITEM(out, 0, self);
10534 return out;
10535 }
10536 buf1 = PyUnicode_DATA(self);
10537 buf2 = PyUnicode_DATA(substring);
10538 if (kind2 != kind1) {
10539 buf2 = unicode_askind(kind2, buf2, len2, kind1);
10540 if (!buf2)
10541 return NULL;
10542 }
10543
10544 switch (kind1) {
10545 case PyUnicode_1BYTE_KIND:
10546 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10547 out = asciilib_rsplit(
10548 self, buf1, len1, buf2, len2, maxcount);
10549 else
10550 out = ucs1lib_rsplit(
10551 self, buf1, len1, buf2, len2, maxcount);
10552 break;
10553 case PyUnicode_2BYTE_KIND:
10554 out = ucs2lib_rsplit(
10555 self, buf1, len1, buf2, len2, maxcount);
10556 break;
10557 case PyUnicode_4BYTE_KIND:
10558 out = ucs4lib_rsplit(
10559 self, buf1, len1, buf2, len2, maxcount);
10560 break;
10561 default:
10562 out = NULL;
10563 }
10564 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10565 if (kind2 != kind1)
10566 PyMem_Free((void *)buf2);
10567 return out;
10568 }
10569
10570 static Py_ssize_t
anylib_find(int kind,PyObject * str1,const void * buf1,Py_ssize_t len1,PyObject * str2,const void * buf2,Py_ssize_t len2,Py_ssize_t offset)10571 anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10572 PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10573 {
10574 switch (kind) {
10575 case PyUnicode_1BYTE_KIND:
10576 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10577 return asciilib_find(buf1, len1, buf2, len2, offset);
10578 else
10579 return ucs1lib_find(buf1, len1, buf2, len2, offset);
10580 case PyUnicode_2BYTE_KIND:
10581 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10582 case PyUnicode_4BYTE_KIND:
10583 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10584 }
10585 Py_UNREACHABLE();
10586 }
10587
10588 static Py_ssize_t
anylib_count(int kind,PyObject * sstr,const void * sbuf,Py_ssize_t slen,PyObject * str1,const void * buf1,Py_ssize_t len1,Py_ssize_t maxcount)10589 anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10590 PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10591 {
10592 switch (kind) {
10593 case PyUnicode_1BYTE_KIND:
10594 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10595 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10596 else
10597 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10598 case PyUnicode_2BYTE_KIND:
10599 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10600 case PyUnicode_4BYTE_KIND:
10601 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10602 }
10603 Py_UNREACHABLE();
10604 }
10605
10606 static void
replace_1char_inplace(PyObject * u,Py_ssize_t pos,Py_UCS4 u1,Py_UCS4 u2,Py_ssize_t maxcount)10607 replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10608 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10609 {
10610 int kind = PyUnicode_KIND(u);
10611 void *data = PyUnicode_DATA(u);
10612 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10613 if (kind == PyUnicode_1BYTE_KIND) {
10614 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10615 (Py_UCS1 *)data + len,
10616 u1, u2, maxcount);
10617 }
10618 else if (kind == PyUnicode_2BYTE_KIND) {
10619 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10620 (Py_UCS2 *)data + len,
10621 u1, u2, maxcount);
10622 }
10623 else {
10624 assert(kind == PyUnicode_4BYTE_KIND);
10625 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10626 (Py_UCS4 *)data + len,
10627 u1, u2, maxcount);
10628 }
10629 }
10630
/* Return a copy of `self` in which occurrences of `str1` are replaced by
   `str2`.

   maxcount: maximum number of replacements; a negative value means no
             limit, 0 means "replace nothing".

   The data, kind and length of all three strings are read immediately
   without calling PyUnicode_READY() -- presumably the callers ready them
   first (TODO confirm against the call sites).

   Three strategies are used:
     - same length, one character: copy self and patch characters in place;
     - same length, several characters: copy self and overwrite each match;
     - different lengths: count the matches, allocate the exact result size
       and assemble it piece by piece.

   Whenever the buffers have different kinds, temporary widened copies are
   made with unicode_askind(); srelease / release1 / release2 record which
   of sbuf / buf1 / buf2 are such copies and therefore must be freed on
   every exit path.

   Returns a new reference; when there is nothing to replace the result
   comes from unicode_result_unchanged(self).  Returns NULL with an
   exception set on error (allocation failure, or OverflowError when the
   result would be too long). */
static PyObject *
replace(PyObject *self, PyObject *str1,
        PyObject *str2, Py_ssize_t maxcount)
{
    PyObject *u;
    const char *sbuf = PyUnicode_DATA(self);
    const void *buf1 = PyUnicode_DATA(str1);
    const void *buf2 = PyUnicode_DATA(str2);
    int srelease = 0, release1 = 0, release2 = 0;
    int skind = PyUnicode_KIND(self);
    int kind1 = PyUnicode_KIND(str1);
    int kind2 = PyUnicode_KIND(str2);
    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
    int mayshrink;
    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;

    /* A pattern longer than the text cannot match. */
    if (slen < len1)
        goto nothing;

    if (maxcount < 0)
        maxcount = PY_SSIZE_T_MAX;
    else if (maxcount == 0)
        goto nothing;

    /* Replacing a string by the very same object changes nothing. */
    if (str1 == str2)
        goto nothing;

    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
    if (maxchar < maxchar_str1)
        /* substring too wide to be present */
        goto nothing;
    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
    /* Replacing str1 with str2 may cause a maxchar reduction in the
       result string. */
    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
    /* The result must be able to hold both the text and the replacement. */
    maxchar = Py_MAX(maxchar, maxchar_str2);

    if (len1 == len2) {
        /* same length */
        if (len1 == 0)
            goto nothing;
        if (len1 == 1) {
            /* replace characters */
            Py_UCS4 u1, u2;
            Py_ssize_t pos;

            u1 = PyUnicode_READ(kind1, buf1, 0);
            /* Locate the first occurrence; bail out early if there is none. */
            pos = findchar(sbuf, skind, slen, u1, 1);
            if (pos < 0)
                goto nothing;
            u2 = PyUnicode_READ(kind2, buf2, 0);
            u = PyUnicode_New(slen, maxchar);
            if (!u)
                goto error;

            /* Copy everything, then patch from the first match onwards. */
            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
            replace_1char_inplace(u, pos, u1, u2, maxcount);
        }
        else {
            /* rkind is the kind in which the search and the copies are
               carried out; it starts as the kind of self. */
            int rkind = skind;
            char *res;
            Py_ssize_t i;

            if (kind1 < rkind) {
                /* widen substring */
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
                if (!buf1) goto error;
                release1 = 1;
            }
            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
            if (i < 0)
                goto nothing;
            if (rkind > kind2) {
                /* widen replacement */
                buf2 = unicode_askind(kind2, buf2, len2, rkind);
                if (!buf2) goto error;
                release2 = 1;
            }
            else if (rkind < kind2) {
                /* widen self and buf1 */
                rkind = kind2;
                /* Drop the earlier widened copy of buf1 (if any) before
                   widening the original data to the new rkind. */
                if (release1) {
                    assert(buf1 != PyUnicode_DATA(str1));
                    PyMem_Free((void *)buf1);
                    buf1 = PyUnicode_DATA(str1);
                    release1 = 0;
                }
                sbuf = unicode_askind(skind, sbuf, slen, rkind);
                if (!sbuf) goto error;
                srelease = 1;
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
                if (!buf1) goto error;
                release1 = 1;
            }
            u = PyUnicode_New(slen, maxchar);
            if (!u)
                goto error;
            assert(PyUnicode_KIND(u) == rkind);
            res = PyUnicode_DATA(u);

            /* rkind doubles as the number of bytes per character. */
            memcpy(res, sbuf, rkind * slen);
            /* change everything in-place, starting with this one */
            memcpy(res + rkind * i,
                   buf2,
                   rkind * len2);
            i += len1;

            /* One replacement has been made already, hence the
               pre-decrement. */
            while ( --maxcount > 0) {
                i = anylib_find(rkind, self,
                                sbuf+rkind*i, slen-i,
                                str1, buf1, len1, i);
                if (i == -1)
                    break;
                memcpy(res + rkind * i,
                       buf2,
                       rkind * len2);
                i += len1;
            }
        }
    }
    else {
        Py_ssize_t n, i, j, ires;
        Py_ssize_t new_size;
        int rkind = skind;
        char *res;

        if (kind1 < rkind) {
            /* widen substring */
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
            if (!buf1) goto error;
            release1 = 1;
        }
        /* n is the number of replacements to perform. */
        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
        if (n == 0)
            goto nothing;
        if (kind2 < rkind) {
            /* widen replacement */
            buf2 = unicode_askind(kind2, buf2, len2, rkind);
            if (!buf2) goto error;
            release2 = 1;
        }
        else if (kind2 > rkind) {
            /* widen self and buf1 */
            rkind = kind2;
            sbuf = unicode_askind(skind, sbuf, slen, rkind);
            if (!sbuf) goto error;
            srelease = 1;
            /* Drop the earlier widened copy of buf1 (if any) before
               widening the original data to the new rkind. */
            if (release1) {
                assert(buf1 != PyUnicode_DATA(str1));
                PyMem_Free((void *)buf1);
                buf1 = PyUnicode_DATA(str1);
                release1 = 0;
            }
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
            if (!buf1) goto error;
            release1 = 1;
        }
        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
           PyUnicode_GET_LENGTH(str1)); */
        /* Check that the growth n * (len2 - len1) fits before computing
           it. */
        if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
                PyErr_SetString(PyExc_OverflowError,
                                "replace string is too long");
                goto error;
        }
        new_size = slen + n * (len2 - len1);
        if (new_size == 0) {
            u = unicode_new_empty();
            goto done;
        }
        /* The byte size (new_size * rkind) must fit as well. */
        if (new_size > (PY_SSIZE_T_MAX / rkind)) {
            PyErr_SetString(PyExc_OverflowError,
                            "replace string is too long");
            goto error;
        }
        u = PyUnicode_New(new_size, maxchar);
        if (!u)
            goto error;
        assert(PyUnicode_KIND(u) == rkind);
        res = PyUnicode_DATA(u);
        /* i indexes the source text, ires the result; both count
           characters. */
        ires = i = 0;
        if (len1 > 0) {
            while (n-- > 0) {
                /* look for next match */
                j = anylib_find(rkind, self,
                                sbuf + rkind * i, slen-i,
                                str1, buf1, len1, i);
                if (j == -1)
                    break;
                else if (j > i) {
                    /* copy unchanged part [i:j] */
                    memcpy(res + rkind * ires,
                           sbuf + rkind * i,
                           rkind * (j-i));
                    ires += j - i;
                }
                /* copy substitution string */
                if (len2 > 0) {
                    memcpy(res + rkind * ires,
                           buf2,
                           rkind * len2);
                    ires += len2;
                }
                i = j + len1;
            }
            if (i < slen)
                /* copy tail [i:] */
                memcpy(res + rkind * ires,
                       sbuf + rkind * i,
                       rkind * (slen-i));
        }
        else {
            /* interleave */
            /* Empty pattern: emit the replacement, then one source
               character, and repeat until n replacements are written. */
            while (n > 0) {
                memcpy(res + rkind * ires,
                       buf2,
                       rkind * len2);
                ires += len2;
                if (--n <= 0)
                    break;
                memcpy(res + rkind * ires,
                       sbuf + rkind * i,
                       rkind);
                ires++;
                i++;
            }
            /* Whatever is left of the source follows the last
               replacement. */
            memcpy(res + rkind * ires,
                   sbuf + rkind * i,
                   rkind * (slen-i));
        }
    }

    if (mayshrink) {
        /* unicode_adjust_maxchar() may replace u; a NULL u afterwards is
           treated as an error. */
        unicode_adjust_maxchar(&u);
        if (u == NULL)
            goto error;
    }

  done:
    /* Success: free the temporary widened buffers and return the result. */
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
    if (srelease)
        PyMem_Free((void *)sbuf);
    if (release1)
        PyMem_Free((void *)buf1);
    if (release2)
        PyMem_Free((void *)buf2);
    assert(_PyUnicode_CheckConsistency(u, 1));
    return u;

  nothing:
    /* nothing to replace; return original string (when possible) */
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
    if (srelease)
        PyMem_Free((void *)sbuf);
    if (release1)
        PyMem_Free((void *)buf1);
    if (release2)
        PyMem_Free((void *)buf2);
    return unicode_result_unchanged(self);

  error:
    /* Failure: free the temporary widened buffers; the exception has been
       set by whoever detected the problem. */
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
    if (srelease)
        PyMem_Free((void *)sbuf);
    if (release1)
        PyMem_Free((void *)buf1);
    if (release2)
        PyMem_Free((void *)buf2);
    return NULL;
}
10909
10910 /* --- Unicode Object Methods --------------------------------------------- */
10911
10912 /*[clinic input]
10913 str.title as unicode_title
10914
10915 Return a version of the string where each word is titlecased.
10916
10917 More specifically, words start with uppercased characters and all remaining
10918 cased characters have lower case.
10919 [clinic start generated code]*/
10920
10921 static PyObject *
unicode_title_impl(PyObject * self)10922 unicode_title_impl(PyObject *self)
10923 /*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
10924 {
10925 if (PyUnicode_READY(self) == -1)
10926 return NULL;
10927 return case_operation(self, do_title);
10928 }
10929
10930 /*[clinic input]
10931 str.capitalize as unicode_capitalize
10932
10933 Return a capitalized version of the string.
10934
10935 More specifically, make the first character have upper case and the rest lower
10936 case.
10937 [clinic start generated code]*/
10938
10939 static PyObject *
unicode_capitalize_impl(PyObject * self)10940 unicode_capitalize_impl(PyObject *self)
10941 /*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
10942 {
10943 if (PyUnicode_READY(self) == -1)
10944 return NULL;
10945 if (PyUnicode_GET_LENGTH(self) == 0)
10946 return unicode_result_unchanged(self);
10947 return case_operation(self, do_capitalize);
10948 }
10949
10950 /*[clinic input]
10951 str.casefold as unicode_casefold
10952
10953 Return a version of the string suitable for caseless comparisons.
10954 [clinic start generated code]*/
10955
10956 static PyObject *
unicode_casefold_impl(PyObject * self)10957 unicode_casefold_impl(PyObject *self)
10958 /*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
10959 {
10960 if (PyUnicode_READY(self) == -1)
10961 return NULL;
10962 if (PyUnicode_IS_ASCII(self))
10963 return ascii_upper_or_lower(self, 1);
10964 return case_operation(self, do_casefold);
10965 }
10966
10967
10968 /* Argument converter. Accepts a single Unicode character. */
10969
10970 static int
convert_uc(PyObject * obj,void * addr)10971 convert_uc(PyObject *obj, void *addr)
10972 {
10973 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10974
10975 if (!PyUnicode_Check(obj)) {
10976 PyErr_Format(PyExc_TypeError,
10977 "The fill character must be a unicode character, "
10978 "not %.100s", Py_TYPE(obj)->tp_name);
10979 return 0;
10980 }
10981 if (PyUnicode_READY(obj) < 0)
10982 return 0;
10983 if (PyUnicode_GET_LENGTH(obj) != 1) {
10984 PyErr_SetString(PyExc_TypeError,
10985 "The fill character must be exactly one character long");
10986 return 0;
10987 }
10988 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10989 return 1;
10990 }
10991
10992 /*[clinic input]
10993 str.center as unicode_center
10994
10995 width: Py_ssize_t
10996 fillchar: Py_UCS4 = ' '
10997 /
10998
10999 Return a centered string of length width.
11000
11001 Padding is done using the specified fill character (default is a space).
11002 [clinic start generated code]*/
11003
11004 static PyObject *
unicode_center_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)11005 unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11006 /*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
11007 {
11008 Py_ssize_t marg, left;
11009
11010 if (PyUnicode_READY(self) == -1)
11011 return NULL;
11012
11013 if (PyUnicode_GET_LENGTH(self) >= width)
11014 return unicode_result_unchanged(self);
11015
11016 marg = width - PyUnicode_GET_LENGTH(self);
11017 left = marg / 2 + (marg & width & 1);
11018
11019 return pad(self, left, marg - left, fillchar);
11020 }
11021
/* Three-way comparison of two str objects, code point by code point.

   Returns -1, 0 or 1.  The first Py_MIN(len1, len2) code points are compared
   pairwise and the first difference decides the result; when that common
   prefix is equal, the lengths decide (the shorter string compares lower).

   This function assumes that str1 and str2 are readied by the caller. */

static int
unicode_compare(PyObject *str1, PyObject *str2)
{
/* Walk the first `len` items of data1 (as TYPE1) and data2 (as TYPE2),
   widening each item to Py_UCS4.  On the first mismatch this *returns from
   unicode_compare()* with -1 or 1; otherwise control falls out of the macro. */
#define COMPARE(TYPE1, TYPE2) \
    do { \
        TYPE1* p1 = (TYPE1 *)data1; \
        TYPE2* p2 = (TYPE2 *)data2; \
        TYPE1* end = p1 + len; \
        Py_UCS4 c1, c2; \
        for (; p1 != end; p1++, p2++) { \
            c1 = *p1; \
            c2 = *p2; \
            if (c1 != c2) \
                return (c1 < c2) ? -1 : 1; \
        } \
    } \
    while (0)

    int kind1, kind2;
    const void *data1, *data2;
    Py_ssize_t len1, len2, len;

    kind1 = PyUnicode_KIND(str1);
    kind2 = PyUnicode_KIND(str2);
    data1 = PyUnicode_DATA(str1);
    data2 = PyUnicode_DATA(str2);
    len1 = PyUnicode_GET_LENGTH(str1);
    len2 = PyUnicode_GET_LENGTH(str2);
    /* Only the common prefix is compared item by item. */
    len = Py_MIN(len1, len2);

    /* Dispatch on the (kind1, kind2) pair so each buffer is read with its
       own item width. */
    switch(kind1) {
    case PyUnicode_1BYTE_KIND:
    {
        switch(kind2) {
        case PyUnicode_1BYTE_KIND:
        {
            /* Both buffers use one byte per character, so `len` is also
               the byte count handed to memcmp(). */
            int cmp = memcmp(data1, data2, len);
            /* normalize result of memcmp() into the range [-1; 1] */
            if (cmp < 0)
                return -1;
            if (cmp > 0)
                return 1;
            break;
        }
        case PyUnicode_2BYTE_KIND:
            COMPARE(Py_UCS1, Py_UCS2);
            break;
        case PyUnicode_4BYTE_KIND:
            COMPARE(Py_UCS1, Py_UCS4);
            break;
        default:
            Py_UNREACHABLE();
        }
        break;
    }
    case PyUnicode_2BYTE_KIND:
    {
        switch(kind2) {
        case PyUnicode_1BYTE_KIND:
            COMPARE(Py_UCS2, Py_UCS1);
            break;
        case PyUnicode_2BYTE_KIND:
        {
            COMPARE(Py_UCS2, Py_UCS2);
            break;
        }
        case PyUnicode_4BYTE_KIND:
            COMPARE(Py_UCS2, Py_UCS4);
            break;
        default:
            Py_UNREACHABLE();
        }
        break;
    }
    case PyUnicode_4BYTE_KIND:
    {
        switch(kind2) {
        case PyUnicode_1BYTE_KIND:
            COMPARE(Py_UCS4, Py_UCS1);
            break;
        case PyUnicode_2BYTE_KIND:
            COMPARE(Py_UCS4, Py_UCS2);
            break;
        case PyUnicode_4BYTE_KIND:
        {
            /* wmemcmp() is used only when it is available and wchar_t is
               4 bytes wide; otherwise the generic item loop is used. */
#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
            int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
            /* normalize result of wmemcmp() into the range [-1; 1] */
            if (cmp < 0)
                return -1;
            if (cmp > 0)
                return 1;
#else
            COMPARE(Py_UCS4, Py_UCS4);
#endif
            break;
        }
        default:
            Py_UNREACHABLE();
        }
        break;
    }
    default:
        Py_UNREACHABLE();
    }

    /* The common prefix is identical: the lengths decide the ordering. */
    if (len1 == len2)
        return 0;
    if (len1 < len2)
        return -1;
    else
        return 1;

#undef COMPARE
}
11139
11140 static int
unicode_compare_eq(PyObject * str1,PyObject * str2)11141 unicode_compare_eq(PyObject *str1, PyObject *str2)
11142 {
11143 int kind;
11144 const void *data1, *data2;
11145 Py_ssize_t len;
11146 int cmp;
11147
11148 len = PyUnicode_GET_LENGTH(str1);
11149 if (PyUnicode_GET_LENGTH(str2) != len)
11150 return 0;
11151 kind = PyUnicode_KIND(str1);
11152 if (PyUnicode_KIND(str2) != kind)
11153 return 0;
11154 data1 = PyUnicode_DATA(str1);
11155 data2 = PyUnicode_DATA(str2);
11156
11157 cmp = memcmp(data1, data2, len * kind);
11158 return (cmp == 0);
11159 }
11160
11161 int
_PyUnicode_Equal(PyObject * str1,PyObject * str2)11162 _PyUnicode_Equal(PyObject *str1, PyObject *str2)
11163 {
11164 assert(PyUnicode_Check(str1));
11165 assert(PyUnicode_Check(str2));
11166 if (str1 == str2) {
11167 return 1;
11168 }
11169 if (PyUnicode_READY(str1) || PyUnicode_READY(str2)) {
11170 return -1;
11171 }
11172 return unicode_compare_eq(str1, str2);
11173 }
11174
11175
11176 int
PyUnicode_Compare(PyObject * left,PyObject * right)11177 PyUnicode_Compare(PyObject *left, PyObject *right)
11178 {
11179 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11180 if (PyUnicode_READY(left) == -1 ||
11181 PyUnicode_READY(right) == -1)
11182 return -1;
11183
11184 /* a string is equal to itself */
11185 if (left == right)
11186 return 0;
11187
11188 return unicode_compare(left, right);
11189 }
11190 PyErr_Format(PyExc_TypeError,
11191 "Can't compare %.100s and %.100s",
11192 Py_TYPE(left)->tp_name,
11193 Py_TYPE(right)->tp_name);
11194 return -1;
11195 }
11196
11197 int
PyUnicode_CompareWithASCIIString(PyObject * uni,const char * str)11198 PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11199 {
11200 Py_ssize_t i;
11201 int kind;
11202 Py_UCS4 chr;
11203 const unsigned char *ustr = (const unsigned char *)str;
11204
11205 assert(_PyUnicode_CHECK(uni));
11206 if (!PyUnicode_IS_READY(uni)) {
11207 const wchar_t *ws = _PyUnicode_WSTR(uni);
11208 /* Compare Unicode string and source character set string */
11209 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11210 if (chr != ustr[i])
11211 return (chr < ustr[i]) ? -1 : 1;
11212 }
11213 /* This check keeps Python strings that end in '\0' from comparing equal
11214 to C strings identical up to that point. */
11215 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11216 return 1; /* uni is longer */
11217 if (ustr[i])
11218 return -1; /* str is longer */
11219 return 0;
11220 }
11221 kind = PyUnicode_KIND(uni);
11222 if (kind == PyUnicode_1BYTE_KIND) {
11223 const void *data = PyUnicode_1BYTE_DATA(uni);
11224 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11225 size_t len, len2 = strlen(str);
11226 int cmp;
11227
11228 len = Py_MIN(len1, len2);
11229 cmp = memcmp(data, str, len);
11230 if (cmp != 0) {
11231 if (cmp < 0)
11232 return -1;
11233 else
11234 return 1;
11235 }
11236 if (len1 > len2)
11237 return 1; /* uni is longer */
11238 if (len1 < len2)
11239 return -1; /* str is longer */
11240 return 0;
11241 }
11242 else {
11243 const void *data = PyUnicode_DATA(uni);
11244 /* Compare Unicode string and source character set string */
11245 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11246 if (chr != (unsigned char)str[i])
11247 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11248 /* This check keeps Python strings that end in '\0' from comparing equal
11249 to C strings identical up to that point. */
11250 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11251 return 1; /* uni is longer */
11252 if (str[i])
11253 return -1; /* str is longer */
11254 return 0;
11255 }
11256 }
11257
11258 static int
non_ready_unicode_equal_to_ascii_string(PyObject * unicode,const char * str)11259 non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11260 {
11261 size_t i, len;
11262 const wchar_t *p;
11263 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11264 if (strlen(str) != len)
11265 return 0;
11266 p = _PyUnicode_WSTR(unicode);
11267 assert(p);
11268 for (i = 0; i < len; i++) {
11269 unsigned char c = (unsigned char)str[i];
11270 if (c >= 128 || p[i] != (wchar_t)c)
11271 return 0;
11272 }
11273 return 1;
11274 }
11275
11276 int
_PyUnicode_EqualToASCIIString(PyObject * unicode,const char * str)11277 _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11278 {
11279 size_t len;
11280 assert(_PyUnicode_CHECK(unicode));
11281 assert(str);
11282 #ifndef NDEBUG
11283 for (const char *p = str; *p; p++) {
11284 assert((unsigned char)*p < 128);
11285 }
11286 #endif
11287 if (PyUnicode_READY(unicode) == -1) {
11288 /* Memory error or bad data */
11289 PyErr_Clear();
11290 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11291 }
11292 if (!PyUnicode_IS_ASCII(unicode))
11293 return 0;
11294 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11295 return strlen(str) == len &&
11296 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11297 }
11298
11299 int
_PyUnicode_EqualToASCIIId(PyObject * left,_Py_Identifier * right)11300 _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11301 {
11302 PyObject *right_uni;
11303
11304 assert(_PyUnicode_CHECK(left));
11305 assert(right->string);
11306 #ifndef NDEBUG
11307 for (const char *p = right->string; *p; p++) {
11308 assert((unsigned char)*p < 128);
11309 }
11310 #endif
11311
11312 if (PyUnicode_READY(left) == -1) {
11313 /* memory error or bad data */
11314 PyErr_Clear();
11315 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11316 }
11317
11318 if (!PyUnicode_IS_ASCII(left))
11319 return 0;
11320
11321 right_uni = _PyUnicode_FromId(right); /* borrowed */
11322 if (right_uni == NULL) {
11323 /* memory error or bad data */
11324 PyErr_Clear();
11325 return _PyUnicode_EqualToASCIIString(left, right->string);
11326 }
11327
11328 if (left == right_uni)
11329 return 1;
11330
11331 if (PyUnicode_CHECK_INTERNED(left))
11332 return 0;
11333
11334 assert(_PyUnicode_HASH(right_uni) != -1);
11335 Py_hash_t hash = _PyUnicode_HASH(left);
11336 if (hash != -1 && hash != _PyUnicode_HASH(right_uni)) {
11337 return 0;
11338 }
11339
11340 return unicode_compare_eq(left, right_uni);
11341 }
11342
11343 PyObject *
PyUnicode_RichCompare(PyObject * left,PyObject * right,int op)11344 PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
11345 {
11346 int result;
11347
11348 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11349 Py_RETURN_NOTIMPLEMENTED;
11350
11351 if (PyUnicode_READY(left) == -1 ||
11352 PyUnicode_READY(right) == -1)
11353 return NULL;
11354
11355 if (left == right) {
11356 switch (op) {
11357 case Py_EQ:
11358 case Py_LE:
11359 case Py_GE:
11360 /* a string is equal to itself */
11361 Py_RETURN_TRUE;
11362 case Py_NE:
11363 case Py_LT:
11364 case Py_GT:
11365 Py_RETURN_FALSE;
11366 default:
11367 PyErr_BadArgument();
11368 return NULL;
11369 }
11370 }
11371 else if (op == Py_EQ || op == Py_NE) {
11372 result = unicode_compare_eq(left, right);
11373 result ^= (op == Py_NE);
11374 return PyBool_FromLong(result);
11375 }
11376 else {
11377 result = unicode_compare(left, right);
11378 Py_RETURN_RICHCOMPARE(result, 0, op);
11379 }
11380 }
11381
11382 int
_PyUnicode_EQ(PyObject * aa,PyObject * bb)11383 _PyUnicode_EQ(PyObject *aa, PyObject *bb)
11384 {
11385 return unicode_eq(aa, bb);
11386 }
11387
11388 int
PyUnicode_Contains(PyObject * str,PyObject * substr)11389 PyUnicode_Contains(PyObject *str, PyObject *substr)
11390 {
11391 int kind1, kind2;
11392 const void *buf1, *buf2;
11393 Py_ssize_t len1, len2;
11394 int result;
11395
11396 if (!PyUnicode_Check(substr)) {
11397 PyErr_Format(PyExc_TypeError,
11398 "'in <string>' requires string as left operand, not %.100s",
11399 Py_TYPE(substr)->tp_name);
11400 return -1;
11401 }
11402 if (PyUnicode_READY(substr) == -1)
11403 return -1;
11404 if (ensure_unicode(str) < 0)
11405 return -1;
11406
11407 kind1 = PyUnicode_KIND(str);
11408 kind2 = PyUnicode_KIND(substr);
11409 if (kind1 < kind2)
11410 return 0;
11411 len1 = PyUnicode_GET_LENGTH(str);
11412 len2 = PyUnicode_GET_LENGTH(substr);
11413 if (len1 < len2)
11414 return 0;
11415 buf1 = PyUnicode_DATA(str);
11416 buf2 = PyUnicode_DATA(substr);
11417 if (len2 == 1) {
11418 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11419 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11420 return result;
11421 }
11422 if (kind2 != kind1) {
11423 buf2 = unicode_askind(kind2, buf2, len2, kind1);
11424 if (!buf2)
11425 return -1;
11426 }
11427
11428 switch (kind1) {
11429 case PyUnicode_1BYTE_KIND:
11430 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11431 break;
11432 case PyUnicode_2BYTE_KIND:
11433 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11434 break;
11435 case PyUnicode_4BYTE_KIND:
11436 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11437 break;
11438 default:
11439 Py_UNREACHABLE();
11440 }
11441
11442 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
11443 if (kind2 != kind1)
11444 PyMem_Free((void *)buf2);
11445
11446 return result;
11447 }
11448
11449 /* Concat to string or Unicode object giving a new Unicode object. */
11450
11451 PyObject *
PyUnicode_Concat(PyObject * left,PyObject * right)11452 PyUnicode_Concat(PyObject *left, PyObject *right)
11453 {
11454 PyObject *result;
11455 Py_UCS4 maxchar, maxchar2;
11456 Py_ssize_t left_len, right_len, new_len;
11457
11458 if (ensure_unicode(left) < 0)
11459 return NULL;
11460
11461 if (!PyUnicode_Check(right)) {
11462 PyErr_Format(PyExc_TypeError,
11463 "can only concatenate str (not \"%.200s\") to str",
11464 Py_TYPE(right)->tp_name);
11465 return NULL;
11466 }
11467 if (PyUnicode_READY(right) < 0)
11468 return NULL;
11469
11470 /* Shortcuts */
11471 PyObject *empty = unicode_get_empty(); // Borrowed reference
11472 if (left == empty) {
11473 return PyUnicode_FromObject(right);
11474 }
11475 if (right == empty) {
11476 return PyUnicode_FromObject(left);
11477 }
11478
11479 left_len = PyUnicode_GET_LENGTH(left);
11480 right_len = PyUnicode_GET_LENGTH(right);
11481 if (left_len > PY_SSIZE_T_MAX - right_len) {
11482 PyErr_SetString(PyExc_OverflowError,
11483 "strings are too large to concat");
11484 return NULL;
11485 }
11486 new_len = left_len + right_len;
11487
11488 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11489 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11490 maxchar = Py_MAX(maxchar, maxchar2);
11491
11492 /* Concat the two Unicode strings */
11493 result = PyUnicode_New(new_len, maxchar);
11494 if (result == NULL)
11495 return NULL;
11496 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11497 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11498 assert(_PyUnicode_CheckConsistency(result, 1));
11499 return result;
11500 }
11501
11502 void
PyUnicode_Append(PyObject ** p_left,PyObject * right)11503 PyUnicode_Append(PyObject **p_left, PyObject *right)
11504 {
11505 PyObject *left, *res;
11506 Py_UCS4 maxchar, maxchar2;
11507 Py_ssize_t left_len, right_len, new_len;
11508
11509 if (p_left == NULL) {
11510 if (!PyErr_Occurred())
11511 PyErr_BadInternalCall();
11512 return;
11513 }
11514 left = *p_left;
11515 if (right == NULL || left == NULL
11516 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11517 if (!PyErr_Occurred())
11518 PyErr_BadInternalCall();
11519 goto error;
11520 }
11521
11522 if (PyUnicode_READY(left) == -1)
11523 goto error;
11524 if (PyUnicode_READY(right) == -1)
11525 goto error;
11526
11527 /* Shortcuts */
11528 PyObject *empty = unicode_get_empty(); // Borrowed reference
11529 if (left == empty) {
11530 Py_DECREF(left);
11531 Py_INCREF(right);
11532 *p_left = right;
11533 return;
11534 }
11535 if (right == empty) {
11536 return;
11537 }
11538
11539 left_len = PyUnicode_GET_LENGTH(left);
11540 right_len = PyUnicode_GET_LENGTH(right);
11541 if (left_len > PY_SSIZE_T_MAX - right_len) {
11542 PyErr_SetString(PyExc_OverflowError,
11543 "strings are too large to concat");
11544 goto error;
11545 }
11546 new_len = left_len + right_len;
11547
11548 if (unicode_modifiable(left)
11549 && PyUnicode_CheckExact(right)
11550 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11551 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11552 to change the structure size, but characters are stored just after
11553 the structure, and so it requires to move all characters which is
11554 not so different than duplicating the string. */
11555 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11556 {
11557 /* append inplace */
11558 if (unicode_resize(p_left, new_len) != 0)
11559 goto error;
11560
11561 /* copy 'right' into the newly allocated area of 'left' */
11562 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11563 }
11564 else {
11565 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11566 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11567 maxchar = Py_MAX(maxchar, maxchar2);
11568
11569 /* Concat the two Unicode strings */
11570 res = PyUnicode_New(new_len, maxchar);
11571 if (res == NULL)
11572 goto error;
11573 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11574 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11575 Py_DECREF(left);
11576 *p_left = res;
11577 }
11578 assert(_PyUnicode_CheckConsistency(*p_left, 1));
11579 return;
11580
11581 error:
11582 Py_CLEAR(*p_left);
11583 }
11584
11585 void
PyUnicode_AppendAndDel(PyObject ** pleft,PyObject * right)11586 PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11587 {
11588 PyUnicode_Append(pleft, right);
11589 Py_XDECREF(right);
11590 }
11591
11592 /*
11593 Wraps stringlib_parse_args_finds() and additionally ensures that the
11594 first argument is a unicode object.
11595 */
11596
11597 static inline int
parse_args_finds_unicode(const char * function_name,PyObject * args,PyObject ** substring,Py_ssize_t * start,Py_ssize_t * end)11598 parse_args_finds_unicode(const char * function_name, PyObject *args,
11599 PyObject **substring,
11600 Py_ssize_t *start, Py_ssize_t *end)
11601 {
11602 if(stringlib_parse_args_finds(function_name, args, substring,
11603 start, end)) {
11604 if (ensure_unicode(*substring) < 0)
11605 return 0;
11606 return 1;
11607 }
11608 return 0;
11609 }
11610
11611 PyDoc_STRVAR(count__doc__,
11612 "S.count(sub[, start[, end]]) -> int\n\
11613 \n\
11614 Return the number of non-overlapping occurrences of substring sub in\n\
11615 string S[start:end]. Optional arguments start and end are\n\
11616 interpreted as in slice notation.");
11617
11618 static PyObject *
unicode_count(PyObject * self,PyObject * args)11619 unicode_count(PyObject *self, PyObject *args)
11620 {
11621 PyObject *substring = NULL; /* initialize to fix a compiler warning */
11622 Py_ssize_t start = 0;
11623 Py_ssize_t end = PY_SSIZE_T_MAX;
11624 PyObject *result;
11625 int kind1, kind2;
11626 const void *buf1, *buf2;
11627 Py_ssize_t len1, len2, iresult;
11628
11629 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
11630 return NULL;
11631
11632 kind1 = PyUnicode_KIND(self);
11633 kind2 = PyUnicode_KIND(substring);
11634 if (kind1 < kind2)
11635 return PyLong_FromLong(0);
11636
11637 len1 = PyUnicode_GET_LENGTH(self);
11638 len2 = PyUnicode_GET_LENGTH(substring);
11639 ADJUST_INDICES(start, end, len1);
11640 if (end - start < len2)
11641 return PyLong_FromLong(0);
11642
11643 buf1 = PyUnicode_DATA(self);
11644 buf2 = PyUnicode_DATA(substring);
11645 if (kind2 != kind1) {
11646 buf2 = unicode_askind(kind2, buf2, len2, kind1);
11647 if (!buf2)
11648 return NULL;
11649 }
11650 switch (kind1) {
11651 case PyUnicode_1BYTE_KIND:
11652 iresult = ucs1lib_count(
11653 ((const Py_UCS1*)buf1) + start, end - start,
11654 buf2, len2, PY_SSIZE_T_MAX
11655 );
11656 break;
11657 case PyUnicode_2BYTE_KIND:
11658 iresult = ucs2lib_count(
11659 ((const Py_UCS2*)buf1) + start, end - start,
11660 buf2, len2, PY_SSIZE_T_MAX
11661 );
11662 break;
11663 case PyUnicode_4BYTE_KIND:
11664 iresult = ucs4lib_count(
11665 ((const Py_UCS4*)buf1) + start, end - start,
11666 buf2, len2, PY_SSIZE_T_MAX
11667 );
11668 break;
11669 default:
11670 Py_UNREACHABLE();
11671 }
11672
11673 result = PyLong_FromSsize_t(iresult);
11674
11675 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
11676 if (kind2 != kind1)
11677 PyMem_Free((void *)buf2);
11678
11679 return result;
11680 }
11681
11682 /*[clinic input]
11683 str.encode as unicode_encode
11684
11685 encoding: str(c_default="NULL") = 'utf-8'
11686 The encoding in which to encode the string.
11687 errors: str(c_default="NULL") = 'strict'
11688 The error handling scheme to use for encoding errors.
11689 The default is 'strict' meaning that encoding errors raise a
11690 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11691 'xmlcharrefreplace' as well as any other name registered with
11692 codecs.register_error that can handle UnicodeEncodeErrors.
11693
11694 Encode the string using the codec registered for encoding.
11695 [clinic start generated code]*/
11696
11697 static PyObject *
unicode_encode_impl(PyObject * self,const char * encoding,const char * errors)11698 unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
11699 /*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
11700 {
11701 return PyUnicode_AsEncodedString(self, encoding, errors);
11702 }
11703
11704 /*[clinic input]
11705 str.expandtabs as unicode_expandtabs
11706
11707 tabsize: int = 8
11708
11709 Return a copy where all tab characters are expanded using spaces.
11710
11711 If tabsize is not given, a tab size of 8 characters is assumed.
11712 [clinic start generated code]*/
11713
11714 static PyObject *
unicode_expandtabs_impl(PyObject * self,int tabsize)11715 unicode_expandtabs_impl(PyObject *self, int tabsize)
11716 /*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
11717 {
11718 Py_ssize_t i, j, line_pos, src_len, incr;
11719 Py_UCS4 ch;
11720 PyObject *u;
11721 const void *src_data;
11722 void *dest_data;
11723 int kind;
11724 int found;
11725
11726 if (PyUnicode_READY(self) == -1)
11727 return NULL;
11728
11729 /* First pass: determine size of output string */
11730 src_len = PyUnicode_GET_LENGTH(self);
11731 i = j = line_pos = 0;
11732 kind = PyUnicode_KIND(self);
11733 src_data = PyUnicode_DATA(self);
11734 found = 0;
11735 for (; i < src_len; i++) {
11736 ch = PyUnicode_READ(kind, src_data, i);
11737 if (ch == '\t') {
11738 found = 1;
11739 if (tabsize > 0) {
11740 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
11741 if (j > PY_SSIZE_T_MAX - incr)
11742 goto overflow;
11743 line_pos += incr;
11744 j += incr;
11745 }
11746 }
11747 else {
11748 if (j > PY_SSIZE_T_MAX - 1)
11749 goto overflow;
11750 line_pos++;
11751 j++;
11752 if (ch == '\n' || ch == '\r')
11753 line_pos = 0;
11754 }
11755 }
11756 if (!found)
11757 return unicode_result_unchanged(self);
11758
11759 /* Second pass: create output string and fill it */
11760 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
11761 if (!u)
11762 return NULL;
11763 dest_data = PyUnicode_DATA(u);
11764
11765 i = j = line_pos = 0;
11766
11767 for (; i < src_len; i++) {
11768 ch = PyUnicode_READ(kind, src_data, i);
11769 if (ch == '\t') {
11770 if (tabsize > 0) {
11771 incr = tabsize - (line_pos % tabsize);
11772 line_pos += incr;
11773 unicode_fill(kind, dest_data, ' ', j, incr);
11774 j += incr;
11775 }
11776 }
11777 else {
11778 line_pos++;
11779 PyUnicode_WRITE(kind, dest_data, j, ch);
11780 j++;
11781 if (ch == '\n' || ch == '\r')
11782 line_pos = 0;
11783 }
11784 }
11785 assert (j == PyUnicode_GET_LENGTH(u));
11786 return unicode_result(u);
11787
11788 overflow:
11789 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11790 return NULL;
11791 }
11792
11793 PyDoc_STRVAR(find__doc__,
11794 "S.find(sub[, start[, end]]) -> int\n\
11795 \n\
11796 Return the lowest index in S where substring sub is found,\n\
11797 such that sub is contained within S[start:end]. Optional\n\
11798 arguments start and end are interpreted as in slice notation.\n\
11799 \n\
11800 Return -1 on failure.");
11801
11802 static PyObject *
unicode_find(PyObject * self,PyObject * args)11803 unicode_find(PyObject *self, PyObject *args)
11804 {
11805 /* initialize variables to prevent gcc warning */
11806 PyObject *substring = NULL;
11807 Py_ssize_t start = 0;
11808 Py_ssize_t end = 0;
11809 Py_ssize_t result;
11810
11811 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
11812 return NULL;
11813
11814 if (PyUnicode_READY(self) == -1)
11815 return NULL;
11816
11817 result = any_find_slice(self, substring, start, end, 1);
11818
11819 if (result == -2)
11820 return NULL;
11821
11822 return PyLong_FromSsize_t(result);
11823 }
11824
11825 static PyObject *
unicode_getitem(PyObject * self,Py_ssize_t index)11826 unicode_getitem(PyObject *self, Py_ssize_t index)
11827 {
11828 const void *data;
11829 enum PyUnicode_Kind kind;
11830 Py_UCS4 ch;
11831
11832 if (!PyUnicode_Check(self)) {
11833 PyErr_BadArgument();
11834 return NULL;
11835 }
11836 if (PyUnicode_READY(self) == -1) {
11837 return NULL;
11838 }
11839 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11840 PyErr_SetString(PyExc_IndexError, "string index out of range");
11841 return NULL;
11842 }
11843 kind = PyUnicode_KIND(self);
11844 data = PyUnicode_DATA(self);
11845 ch = PyUnicode_READ(kind, data, index);
11846 return unicode_char(ch);
11847 }
11848
11849 /* Believe it or not, this produces the same value for ASCII strings
11850 as bytes_hash(). */
11851 static Py_hash_t
unicode_hash(PyObject * self)11852 unicode_hash(PyObject *self)
11853 {
11854 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
11855
11856 #ifdef Py_DEBUG
11857 assert(_Py_HashSecret_Initialized);
11858 #endif
11859 if (_PyUnicode_HASH(self) != -1)
11860 return _PyUnicode_HASH(self);
11861 if (PyUnicode_READY(self) == -1)
11862 return -1;
11863
11864 x = _Py_HashBytes(PyUnicode_DATA(self),
11865 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11866 _PyUnicode_HASH(self) = x;
11867 return x;
11868 }
11869
11870 PyDoc_STRVAR(index__doc__,
11871 "S.index(sub[, start[, end]]) -> int\n\
11872 \n\
11873 Return the lowest index in S where substring sub is found,\n\
11874 such that sub is contained within S[start:end]. Optional\n\
11875 arguments start and end are interpreted as in slice notation.\n\
11876 \n\
11877 Raises ValueError when the substring is not found.");
11878
11879 static PyObject *
unicode_index(PyObject * self,PyObject * args)11880 unicode_index(PyObject *self, PyObject *args)
11881 {
11882 /* initialize variables to prevent gcc warning */
11883 Py_ssize_t result;
11884 PyObject *substring = NULL;
11885 Py_ssize_t start = 0;
11886 Py_ssize_t end = 0;
11887
11888 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
11889 return NULL;
11890
11891 if (PyUnicode_READY(self) == -1)
11892 return NULL;
11893
11894 result = any_find_slice(self, substring, start, end, 1);
11895
11896 if (result == -2)
11897 return NULL;
11898
11899 if (result < 0) {
11900 PyErr_SetString(PyExc_ValueError, "substring not found");
11901 return NULL;
11902 }
11903
11904 return PyLong_FromSsize_t(result);
11905 }
11906
11907 /*[clinic input]
11908 str.isascii as unicode_isascii
11909
11910 Return True if all characters in the string are ASCII, False otherwise.
11911
11912 ASCII characters have code points in the range U+0000-U+007F.
11913 Empty string is ASCII too.
11914 [clinic start generated code]*/
11915
11916 static PyObject *
unicode_isascii_impl(PyObject * self)11917 unicode_isascii_impl(PyObject *self)
11918 /*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11919 {
11920 if (PyUnicode_READY(self) == -1) {
11921 return NULL;
11922 }
11923 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11924 }
11925
11926 /*[clinic input]
11927 str.islower as unicode_islower
11928
11929 Return True if the string is a lowercase string, False otherwise.
11930
11931 A string is lowercase if all cased characters in the string are lowercase and
11932 there is at least one cased character in the string.
11933 [clinic start generated code]*/
11934
11935 static PyObject *
unicode_islower_impl(PyObject * self)11936 unicode_islower_impl(PyObject *self)
11937 /*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
11938 {
11939 Py_ssize_t i, length;
11940 int kind;
11941 const void *data;
11942 int cased;
11943
11944 if (PyUnicode_READY(self) == -1)
11945 return NULL;
11946 length = PyUnicode_GET_LENGTH(self);
11947 kind = PyUnicode_KIND(self);
11948 data = PyUnicode_DATA(self);
11949
11950 /* Shortcut for single character strings */
11951 if (length == 1)
11952 return PyBool_FromLong(
11953 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11954
11955 /* Special case for empty strings */
11956 if (length == 0)
11957 Py_RETURN_FALSE;
11958
11959 cased = 0;
11960 for (i = 0; i < length; i++) {
11961 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11962
11963 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11964 Py_RETURN_FALSE;
11965 else if (!cased && Py_UNICODE_ISLOWER(ch))
11966 cased = 1;
11967 }
11968 return PyBool_FromLong(cased);
11969 }
11970
11971 /*[clinic input]
11972 str.isupper as unicode_isupper
11973
11974 Return True if the string is an uppercase string, False otherwise.
11975
11976 A string is uppercase if all cased characters in the string are uppercase and
11977 there is at least one cased character in the string.
11978 [clinic start generated code]*/
11979
11980 static PyObject *
unicode_isupper_impl(PyObject * self)11981 unicode_isupper_impl(PyObject *self)
11982 /*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
11983 {
11984 Py_ssize_t i, length;
11985 int kind;
11986 const void *data;
11987 int cased;
11988
11989 if (PyUnicode_READY(self) == -1)
11990 return NULL;
11991 length = PyUnicode_GET_LENGTH(self);
11992 kind = PyUnicode_KIND(self);
11993 data = PyUnicode_DATA(self);
11994
11995 /* Shortcut for single character strings */
11996 if (length == 1)
11997 return PyBool_FromLong(
11998 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11999
12000 /* Special case for empty strings */
12001 if (length == 0)
12002 Py_RETURN_FALSE;
12003
12004 cased = 0;
12005 for (i = 0; i < length; i++) {
12006 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12007
12008 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
12009 Py_RETURN_FALSE;
12010 else if (!cased && Py_UNICODE_ISUPPER(ch))
12011 cased = 1;
12012 }
12013 return PyBool_FromLong(cased);
12014 }
12015
12016 /*[clinic input]
12017 str.istitle as unicode_istitle
12018
12019 Return True if the string is a title-cased string, False otherwise.
12020
12021 In a title-cased string, upper- and title-case characters may only
12022 follow uncased characters and lowercase characters only cased ones.
12023 [clinic start generated code]*/
12024
12025 static PyObject *
unicode_istitle_impl(PyObject * self)12026 unicode_istitle_impl(PyObject *self)
12027 /*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
12028 {
12029 Py_ssize_t i, length;
12030 int kind;
12031 const void *data;
12032 int cased, previous_is_cased;
12033
12034 if (PyUnicode_READY(self) == -1)
12035 return NULL;
12036 length = PyUnicode_GET_LENGTH(self);
12037 kind = PyUnicode_KIND(self);
12038 data = PyUnicode_DATA(self);
12039
12040 /* Shortcut for single character strings */
12041 if (length == 1) {
12042 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12043 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12044 (Py_UNICODE_ISUPPER(ch) != 0));
12045 }
12046
12047 /* Special case for empty strings */
12048 if (length == 0)
12049 Py_RETURN_FALSE;
12050
12051 cased = 0;
12052 previous_is_cased = 0;
12053 for (i = 0; i < length; i++) {
12054 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12055
12056 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12057 if (previous_is_cased)
12058 Py_RETURN_FALSE;
12059 previous_is_cased = 1;
12060 cased = 1;
12061 }
12062 else if (Py_UNICODE_ISLOWER(ch)) {
12063 if (!previous_is_cased)
12064 Py_RETURN_FALSE;
12065 previous_is_cased = 1;
12066 cased = 1;
12067 }
12068 else
12069 previous_is_cased = 0;
12070 }
12071 return PyBool_FromLong(cased);
12072 }
12073
12074 /*[clinic input]
12075 str.isspace as unicode_isspace
12076
12077 Return True if the string is a whitespace string, False otherwise.
12078
12079 A string is whitespace if all characters in the string are whitespace and there
12080 is at least one character in the string.
12081 [clinic start generated code]*/
12082
12083 static PyObject *
unicode_isspace_impl(PyObject * self)12084 unicode_isspace_impl(PyObject *self)
12085 /*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
12086 {
12087 Py_ssize_t i, length;
12088 int kind;
12089 const void *data;
12090
12091 if (PyUnicode_READY(self) == -1)
12092 return NULL;
12093 length = PyUnicode_GET_LENGTH(self);
12094 kind = PyUnicode_KIND(self);
12095 data = PyUnicode_DATA(self);
12096
12097 /* Shortcut for single character strings */
12098 if (length == 1)
12099 return PyBool_FromLong(
12100 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
12101
12102 /* Special case for empty strings */
12103 if (length == 0)
12104 Py_RETURN_FALSE;
12105
12106 for (i = 0; i < length; i++) {
12107 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12108 if (!Py_UNICODE_ISSPACE(ch))
12109 Py_RETURN_FALSE;
12110 }
12111 Py_RETURN_TRUE;
12112 }
12113
12114 /*[clinic input]
12115 str.isalpha as unicode_isalpha
12116
12117 Return True if the string is an alphabetic string, False otherwise.
12118
12119 A string is alphabetic if all characters in the string are alphabetic and there
12120 is at least one character in the string.
12121 [clinic start generated code]*/
12122
12123 static PyObject *
unicode_isalpha_impl(PyObject * self)12124 unicode_isalpha_impl(PyObject *self)
12125 /*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
12126 {
12127 Py_ssize_t i, length;
12128 int kind;
12129 const void *data;
12130
12131 if (PyUnicode_READY(self) == -1)
12132 return NULL;
12133 length = PyUnicode_GET_LENGTH(self);
12134 kind = PyUnicode_KIND(self);
12135 data = PyUnicode_DATA(self);
12136
12137 /* Shortcut for single character strings */
12138 if (length == 1)
12139 return PyBool_FromLong(
12140 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
12141
12142 /* Special case for empty strings */
12143 if (length == 0)
12144 Py_RETURN_FALSE;
12145
12146 for (i = 0; i < length; i++) {
12147 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
12148 Py_RETURN_FALSE;
12149 }
12150 Py_RETURN_TRUE;
12151 }
12152
12153 /*[clinic input]
12154 str.isalnum as unicode_isalnum
12155
12156 Return True if the string is an alpha-numeric string, False otherwise.
12157
12158 A string is alpha-numeric if all characters in the string are alpha-numeric and
12159 there is at least one character in the string.
12160 [clinic start generated code]*/
12161
12162 static PyObject *
unicode_isalnum_impl(PyObject * self)12163 unicode_isalnum_impl(PyObject *self)
12164 /*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
12165 {
12166 int kind;
12167 const void *data;
12168 Py_ssize_t len, i;
12169
12170 if (PyUnicode_READY(self) == -1)
12171 return NULL;
12172
12173 kind = PyUnicode_KIND(self);
12174 data = PyUnicode_DATA(self);
12175 len = PyUnicode_GET_LENGTH(self);
12176
12177 /* Shortcut for single character strings */
12178 if (len == 1) {
12179 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12180 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12181 }
12182
12183 /* Special case for empty strings */
12184 if (len == 0)
12185 Py_RETURN_FALSE;
12186
12187 for (i = 0; i < len; i++) {
12188 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12189 if (!Py_UNICODE_ISALNUM(ch))
12190 Py_RETURN_FALSE;
12191 }
12192 Py_RETURN_TRUE;
12193 }
12194
12195 /*[clinic input]
12196 str.isdecimal as unicode_isdecimal
12197
12198 Return True if the string is a decimal string, False otherwise.
12199
12200 A string is a decimal string if all characters in the string are decimal and
12201 there is at least one character in the string.
12202 [clinic start generated code]*/
12203
12204 static PyObject *
unicode_isdecimal_impl(PyObject * self)12205 unicode_isdecimal_impl(PyObject *self)
12206 /*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
12207 {
12208 Py_ssize_t i, length;
12209 int kind;
12210 const void *data;
12211
12212 if (PyUnicode_READY(self) == -1)
12213 return NULL;
12214 length = PyUnicode_GET_LENGTH(self);
12215 kind = PyUnicode_KIND(self);
12216 data = PyUnicode_DATA(self);
12217
12218 /* Shortcut for single character strings */
12219 if (length == 1)
12220 return PyBool_FromLong(
12221 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
12222
12223 /* Special case for empty strings */
12224 if (length == 0)
12225 Py_RETURN_FALSE;
12226
12227 for (i = 0; i < length; i++) {
12228 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
12229 Py_RETURN_FALSE;
12230 }
12231 Py_RETURN_TRUE;
12232 }
12233
12234 /*[clinic input]
12235 str.isdigit as unicode_isdigit
12236
12237 Return True if the string is a digit string, False otherwise.
12238
12239 A string is a digit string if all characters in the string are digits and there
12240 is at least one character in the string.
12241 [clinic start generated code]*/
12242
12243 static PyObject *
unicode_isdigit_impl(PyObject * self)12244 unicode_isdigit_impl(PyObject *self)
12245 /*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
12246 {
12247 Py_ssize_t i, length;
12248 int kind;
12249 const void *data;
12250
12251 if (PyUnicode_READY(self) == -1)
12252 return NULL;
12253 length = PyUnicode_GET_LENGTH(self);
12254 kind = PyUnicode_KIND(self);
12255 data = PyUnicode_DATA(self);
12256
12257 /* Shortcut for single character strings */
12258 if (length == 1) {
12259 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12260 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12261 }
12262
12263 /* Special case for empty strings */
12264 if (length == 0)
12265 Py_RETURN_FALSE;
12266
12267 for (i = 0; i < length; i++) {
12268 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
12269 Py_RETURN_FALSE;
12270 }
12271 Py_RETURN_TRUE;
12272 }
12273
12274 /*[clinic input]
12275 str.isnumeric as unicode_isnumeric
12276
12277 Return True if the string is a numeric string, False otherwise.
12278
12279 A string is numeric if all characters in the string are numeric and there is at
12280 least one character in the string.
12281 [clinic start generated code]*/
12282
12283 static PyObject *
unicode_isnumeric_impl(PyObject * self)12284 unicode_isnumeric_impl(PyObject *self)
12285 /*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
12286 {
12287 Py_ssize_t i, length;
12288 int kind;
12289 const void *data;
12290
12291 if (PyUnicode_READY(self) == -1)
12292 return NULL;
12293 length = PyUnicode_GET_LENGTH(self);
12294 kind = PyUnicode_KIND(self);
12295 data = PyUnicode_DATA(self);
12296
12297 /* Shortcut for single character strings */
12298 if (length == 1)
12299 return PyBool_FromLong(
12300 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
12301
12302 /* Special case for empty strings */
12303 if (length == 0)
12304 Py_RETURN_FALSE;
12305
12306 for (i = 0; i < length; i++) {
12307 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
12308 Py_RETURN_FALSE;
12309 }
12310 Py_RETURN_TRUE;
12311 }
12312
12313 Py_ssize_t
_PyUnicode_ScanIdentifier(PyObject * self)12314 _PyUnicode_ScanIdentifier(PyObject *self)
12315 {
12316 Py_ssize_t i;
12317 if (PyUnicode_READY(self) == -1)
12318 return -1;
12319
12320 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12321 if (len == 0) {
12322 /* an empty string is not a valid identifier */
12323 return 0;
12324 }
12325
12326 int kind = PyUnicode_KIND(self);
12327 const void *data = PyUnicode_DATA(self);
12328 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12329 /* PEP 3131 says that the first character must be in
12330 XID_Start and subsequent characters in XID_Continue,
12331 and for the ASCII range, the 2.x rules apply (i.e
12332 start with letters and underscore, continue with
12333 letters, digits, underscore). However, given the current
12334 definition of XID_Start and XID_Continue, it is sufficient
12335 to check just for these, except that _ must be allowed
12336 as starting an identifier. */
12337 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12338 return 0;
12339 }
12340
12341 for (i = 1; i < len; i++) {
12342 ch = PyUnicode_READ(kind, data, i);
12343 if (!_PyUnicode_IsXidContinue(ch)) {
12344 return i;
12345 }
12346 }
12347 return i;
12348 }
12349
12350 int
PyUnicode_IsIdentifier(PyObject * self)12351 PyUnicode_IsIdentifier(PyObject *self)
12352 {
12353 if (PyUnicode_IS_READY(self)) {
12354 Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12355 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12356 /* an empty string is not a valid identifier */
12357 return len && i == len;
12358 }
12359 else {
12360 _Py_COMP_DIAG_PUSH
12361 _Py_COMP_DIAG_IGNORE_DEPR_DECLS
12362 Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
12363 if (len == 0) {
12364 /* an empty string is not a valid identifier */
12365 return 0;
12366 }
12367
12368 const wchar_t *wstr = _PyUnicode_WSTR(self);
12369 Py_UCS4 ch = wstr[i++];
12370 #if SIZEOF_WCHAR_T == 2
12371 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12372 && i < len
12373 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12374 {
12375 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12376 i++;
12377 }
12378 #endif
12379 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12380 return 0;
12381 }
12382
12383 while (i < len) {
12384 ch = wstr[i++];
12385 #if SIZEOF_WCHAR_T == 2
12386 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12387 && i < len
12388 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12389 {
12390 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12391 i++;
12392 }
12393 #endif
12394 if (!_PyUnicode_IsXidContinue(ch)) {
12395 return 0;
12396 }
12397 }
12398 return 1;
12399 _Py_COMP_DIAG_POP
12400 }
12401 }
12402
12403 /*[clinic input]
12404 str.isidentifier as unicode_isidentifier
12405
12406 Return True if the string is a valid Python identifier, False otherwise.
12407
12408 Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
12409 such as "def" or "class".
12410 [clinic start generated code]*/
12411
12412 static PyObject *
unicode_isidentifier_impl(PyObject * self)12413 unicode_isidentifier_impl(PyObject *self)
12414 /*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
12415 {
12416 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12417 }
12418
12419 /*[clinic input]
12420 str.isprintable as unicode_isprintable
12421
12422 Return True if the string is printable, False otherwise.
12423
12424 A string is printable if all of its characters are considered printable in
12425 repr() or if it is empty.
12426 [clinic start generated code]*/
12427
12428 static PyObject *
unicode_isprintable_impl(PyObject * self)12429 unicode_isprintable_impl(PyObject *self)
12430 /*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
12431 {
12432 Py_ssize_t i, length;
12433 int kind;
12434 const void *data;
12435
12436 if (PyUnicode_READY(self) == -1)
12437 return NULL;
12438 length = PyUnicode_GET_LENGTH(self);
12439 kind = PyUnicode_KIND(self);
12440 data = PyUnicode_DATA(self);
12441
12442 /* Shortcut for single character strings */
12443 if (length == 1)
12444 return PyBool_FromLong(
12445 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
12446
12447 for (i = 0; i < length; i++) {
12448 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
12449 Py_RETURN_FALSE;
12450 }
12451 }
12452 Py_RETURN_TRUE;
12453 }
12454
12455 /*[clinic input]
12456 str.join as unicode_join
12457
12458 iterable: object
12459 /
12460
12461 Concatenate any number of strings.
12462
12463 The string whose method is called is inserted in between each given string.
12464 The result is returned as a new string.
12465
12466 Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12467 [clinic start generated code]*/
12468
12469 static PyObject *
unicode_join(PyObject * self,PyObject * iterable)12470 unicode_join(PyObject *self, PyObject *iterable)
12471 /*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
12472 {
12473 return PyUnicode_Join(self, iterable);
12474 }
12475
12476 static Py_ssize_t
unicode_length(PyObject * self)12477 unicode_length(PyObject *self)
12478 {
12479 if (PyUnicode_READY(self) == -1)
12480 return -1;
12481 return PyUnicode_GET_LENGTH(self);
12482 }
12483
12484 /*[clinic input]
12485 str.ljust as unicode_ljust
12486
12487 width: Py_ssize_t
12488 fillchar: Py_UCS4 = ' '
12489 /
12490
12491 Return a left-justified string of length width.
12492
12493 Padding is done using the specified fill character (default is a space).
12494 [clinic start generated code]*/
12495
12496 static PyObject *
unicode_ljust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)12497 unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12498 /*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12499 {
12500 if (PyUnicode_READY(self) == -1)
12501 return NULL;
12502
12503 if (PyUnicode_GET_LENGTH(self) >= width)
12504 return unicode_result_unchanged(self);
12505
12506 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12507 }
12508
12509 /*[clinic input]
12510 str.lower as unicode_lower
12511
12512 Return a copy of the string converted to lowercase.
12513 [clinic start generated code]*/
12514
12515 static PyObject *
unicode_lower_impl(PyObject * self)12516 unicode_lower_impl(PyObject *self)
12517 /*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
12518 {
12519 if (PyUnicode_READY(self) == -1)
12520 return NULL;
12521 if (PyUnicode_IS_ASCII(self))
12522 return ascii_upper_or_lower(self, 1);
12523 return case_operation(self, do_lower);
12524 }
12525
/* Selectors telling the strip helpers which end(s) of the string to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the selectors above.  Both the strings and the
   pointer array are const: the table is read-only data. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method that corresponds to strip selector i. */
#define STRIPNAME(i) (stripfuncnames[(i)])
12534
12535 /* externally visible for str.strip(unicode) */
12536 PyObject *
_PyUnicode_XStrip(PyObject * self,int striptype,PyObject * sepobj)12537 _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
12538 {
12539 const void *data;
12540 int kind;
12541 Py_ssize_t i, j, len;
12542 BLOOM_MASK sepmask;
12543 Py_ssize_t seplen;
12544
12545 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12546 return NULL;
12547
12548 kind = PyUnicode_KIND(self);
12549 data = PyUnicode_DATA(self);
12550 len = PyUnicode_GET_LENGTH(self);
12551 seplen = PyUnicode_GET_LENGTH(sepobj);
12552 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12553 PyUnicode_DATA(sepobj),
12554 seplen);
12555
12556 i = 0;
12557 if (striptype != RIGHTSTRIP) {
12558 while (i < len) {
12559 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12560 if (!BLOOM(sepmask, ch))
12561 break;
12562 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12563 break;
12564 i++;
12565 }
12566 }
12567
12568 j = len;
12569 if (striptype != LEFTSTRIP) {
12570 j--;
12571 while (j >= i) {
12572 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12573 if (!BLOOM(sepmask, ch))
12574 break;
12575 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12576 break;
12577 j--;
12578 }
12579
12580 j++;
12581 }
12582
12583 return PyUnicode_Substring(self, i, j);
12584 }
12585
12586 PyObject*
PyUnicode_Substring(PyObject * self,Py_ssize_t start,Py_ssize_t end)12587 PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12588 {
12589 const unsigned char *data;
12590 int kind;
12591 Py_ssize_t length;
12592
12593 if (PyUnicode_READY(self) == -1)
12594 return NULL;
12595
12596 length = PyUnicode_GET_LENGTH(self);
12597 end = Py_MIN(end, length);
12598
12599 if (start == 0 && end == length)
12600 return unicode_result_unchanged(self);
12601
12602 if (start < 0 || end < 0) {
12603 PyErr_SetString(PyExc_IndexError, "string index out of range");
12604 return NULL;
12605 }
12606 if (start >= length || end < start)
12607 _Py_RETURN_UNICODE_EMPTY();
12608
12609 length = end - start;
12610 if (PyUnicode_IS_ASCII(self)) {
12611 data = PyUnicode_1BYTE_DATA(self);
12612 return _PyUnicode_FromASCII((const char*)(data + start), length);
12613 }
12614 else {
12615 kind = PyUnicode_KIND(self);
12616 data = PyUnicode_1BYTE_DATA(self);
12617 return PyUnicode_FromKindAndData(kind,
12618 data + kind * start,
12619 length);
12620 }
12621 }
12622
12623 static PyObject *
do_strip(PyObject * self,int striptype)12624 do_strip(PyObject *self, int striptype)
12625 {
12626 Py_ssize_t len, i, j;
12627
12628 if (PyUnicode_READY(self) == -1)
12629 return NULL;
12630
12631 len = PyUnicode_GET_LENGTH(self);
12632
12633 if (PyUnicode_IS_ASCII(self)) {
12634 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12635
12636 i = 0;
12637 if (striptype != RIGHTSTRIP) {
12638 while (i < len) {
12639 Py_UCS1 ch = data[i];
12640 if (!_Py_ascii_whitespace[ch])
12641 break;
12642 i++;
12643 }
12644 }
12645
12646 j = len;
12647 if (striptype != LEFTSTRIP) {
12648 j--;
12649 while (j >= i) {
12650 Py_UCS1 ch = data[j];
12651 if (!_Py_ascii_whitespace[ch])
12652 break;
12653 j--;
12654 }
12655 j++;
12656 }
12657 }
12658 else {
12659 int kind = PyUnicode_KIND(self);
12660 const void *data = PyUnicode_DATA(self);
12661
12662 i = 0;
12663 if (striptype != RIGHTSTRIP) {
12664 while (i < len) {
12665 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12666 if (!Py_UNICODE_ISSPACE(ch))
12667 break;
12668 i++;
12669 }
12670 }
12671
12672 j = len;
12673 if (striptype != LEFTSTRIP) {
12674 j--;
12675 while (j >= i) {
12676 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12677 if (!Py_UNICODE_ISSPACE(ch))
12678 break;
12679 j--;
12680 }
12681 j++;
12682 }
12683 }
12684
12685 return PyUnicode_Substring(self, i, j);
12686 }
12687
12688
12689 static PyObject *
do_argstrip(PyObject * self,int striptype,PyObject * sep)12690 do_argstrip(PyObject *self, int striptype, PyObject *sep)
12691 {
12692 if (sep != Py_None) {
12693 if (PyUnicode_Check(sep))
12694 return _PyUnicode_XStrip(self, striptype, sep);
12695 else {
12696 PyErr_Format(PyExc_TypeError,
12697 "%s arg must be None or str",
12698 STRIPNAME(striptype));
12699 return NULL;
12700 }
12701 }
12702
12703 return do_strip(self, striptype);
12704 }
12705
12706
12707 /*[clinic input]
12708 str.strip as unicode_strip
12709
12710 chars: object = None
12711 /
12712
12713 Return a copy of the string with leading and trailing whitespace removed.
12714
12715 If chars is given and not None, remove characters in chars instead.
12716 [clinic start generated code]*/
12717
12718 static PyObject *
unicode_strip_impl(PyObject * self,PyObject * chars)12719 unicode_strip_impl(PyObject *self, PyObject *chars)
12720 /*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
12721 {
12722 return do_argstrip(self, BOTHSTRIP, chars);
12723 }
12724
12725
12726 /*[clinic input]
12727 str.lstrip as unicode_lstrip
12728
12729 chars: object = None
12730 /
12731
12732 Return a copy of the string with leading whitespace removed.
12733
12734 If chars is given and not None, remove characters in chars instead.
12735 [clinic start generated code]*/
12736
12737 static PyObject *
unicode_lstrip_impl(PyObject * self,PyObject * chars)12738 unicode_lstrip_impl(PyObject *self, PyObject *chars)
12739 /*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
12740 {
12741 return do_argstrip(self, LEFTSTRIP, chars);
12742 }
12743
12744
12745 /*[clinic input]
12746 str.rstrip as unicode_rstrip
12747
12748 chars: object = None
12749 /
12750
12751 Return a copy of the string with trailing whitespace removed.
12752
12753 If chars is given and not None, remove characters in chars instead.
12754 [clinic start generated code]*/
12755
12756 static PyObject *
unicode_rstrip_impl(PyObject * self,PyObject * chars)12757 unicode_rstrip_impl(PyObject *self, PyObject *chars)
12758 /*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
12759 {
12760 return do_argstrip(self, RIGHTSTRIP, chars);
12761 }
12762
12763
12764 static PyObject*
unicode_repeat(PyObject * str,Py_ssize_t len)12765 unicode_repeat(PyObject *str, Py_ssize_t len)
12766 {
12767 PyObject *u;
12768 Py_ssize_t nchars, n;
12769
12770 if (len < 1)
12771 _Py_RETURN_UNICODE_EMPTY();
12772
12773 /* no repeat, return original string */
12774 if (len == 1)
12775 return unicode_result_unchanged(str);
12776
12777 if (PyUnicode_READY(str) == -1)
12778 return NULL;
12779
12780 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12781 PyErr_SetString(PyExc_OverflowError,
12782 "repeated string is too long");
12783 return NULL;
12784 }
12785 nchars = len * PyUnicode_GET_LENGTH(str);
12786
12787 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12788 if (!u)
12789 return NULL;
12790 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12791
12792 if (PyUnicode_GET_LENGTH(str) == 1) {
12793 int kind = PyUnicode_KIND(str);
12794 Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12795 if (kind == PyUnicode_1BYTE_KIND) {
12796 void *to = PyUnicode_DATA(u);
12797 memset(to, (unsigned char)fill_char, len);
12798 }
12799 else if (kind == PyUnicode_2BYTE_KIND) {
12800 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12801 for (n = 0; n < len; ++n)
12802 ucs2[n] = fill_char;
12803 } else {
12804 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12805 assert(kind == PyUnicode_4BYTE_KIND);
12806 for (n = 0; n < len; ++n)
12807 ucs4[n] = fill_char;
12808 }
12809 }
12810 else {
12811 Py_ssize_t char_size = PyUnicode_KIND(str);
12812 char *to = (char *) PyUnicode_DATA(u);
12813 _PyBytes_Repeat(to, nchars * char_size, PyUnicode_DATA(str),
12814 PyUnicode_GET_LENGTH(str) * char_size);
12815 }
12816
12817 assert(_PyUnicode_CheckConsistency(u, 1));
12818 return u;
12819 }
12820
12821 PyObject *
PyUnicode_Replace(PyObject * str,PyObject * substr,PyObject * replstr,Py_ssize_t maxcount)12822 PyUnicode_Replace(PyObject *str,
12823 PyObject *substr,
12824 PyObject *replstr,
12825 Py_ssize_t maxcount)
12826 {
12827 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12828 ensure_unicode(replstr) < 0)
12829 return NULL;
12830 return replace(str, substr, replstr, maxcount);
12831 }
12832
12833 /*[clinic input]
12834 str.replace as unicode_replace
12835
12836 old: unicode
12837 new: unicode
12838 count: Py_ssize_t = -1
12839 Maximum number of occurrences to replace.
12840 -1 (the default value) means replace all occurrences.
12841 /
12842
12843 Return a copy with all occurrences of substring old replaced by new.
12844
12845 If the optional argument count is given, only the first count occurrences are
12846 replaced.
12847 [clinic start generated code]*/
12848
12849 static PyObject *
unicode_replace_impl(PyObject * self,PyObject * old,PyObject * new,Py_ssize_t count)12850 unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12851 Py_ssize_t count)
12852 /*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
12853 {
12854 if (PyUnicode_READY(self) == -1)
12855 return NULL;
12856 return replace(self, old, new, count);
12857 }
12858
12859 /*[clinic input]
12860 str.removeprefix as unicode_removeprefix
12861
12862 prefix: unicode
12863 /
12864
12865 Return a str with the given prefix string removed if present.
12866
12867 If the string starts with the prefix string, return string[len(prefix):].
12868 Otherwise, return a copy of the original string.
12869 [clinic start generated code]*/
12870
12871 static PyObject *
unicode_removeprefix_impl(PyObject * self,PyObject * prefix)12872 unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
12873 /*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
12874 {
12875 int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
12876 if (match == -1) {
12877 return NULL;
12878 }
12879 if (match) {
12880 return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
12881 PyUnicode_GET_LENGTH(self));
12882 }
12883 return unicode_result_unchanged(self);
12884 }
12885
12886 /*[clinic input]
12887 str.removesuffix as unicode_removesuffix
12888
12889 suffix: unicode
12890 /
12891
12892 Return a str with the given suffix string removed if present.
12893
12894 If the string ends with the suffix string and that suffix is not empty,
12895 return string[:-len(suffix)]. Otherwise, return a copy of the original
12896 string.
12897 [clinic start generated code]*/
12898
12899 static PyObject *
unicode_removesuffix_impl(PyObject * self,PyObject * suffix)12900 unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
12901 /*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
12902 {
12903 int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
12904 if (match == -1) {
12905 return NULL;
12906 }
12907 if (match) {
12908 return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
12909 - PyUnicode_GET_LENGTH(suffix));
12910 }
12911 return unicode_result_unchanged(self);
12912 }
12913
12914 static PyObject *
unicode_repr(PyObject * unicode)12915 unicode_repr(PyObject *unicode)
12916 {
12917 PyObject *repr;
12918 Py_ssize_t isize;
12919 Py_ssize_t osize, squote, dquote, i, o;
12920 Py_UCS4 max, quote;
12921 int ikind, okind, unchanged;
12922 const void *idata;
12923 void *odata;
12924
12925 if (PyUnicode_READY(unicode) == -1)
12926 return NULL;
12927
12928 isize = PyUnicode_GET_LENGTH(unicode);
12929 idata = PyUnicode_DATA(unicode);
12930
12931 /* Compute length of output, quote characters, and
12932 maximum character */
12933 osize = 0;
12934 max = 127;
12935 squote = dquote = 0;
12936 ikind = PyUnicode_KIND(unicode);
12937 for (i = 0; i < isize; i++) {
12938 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12939 Py_ssize_t incr = 1;
12940 switch (ch) {
12941 case '\'': squote++; break;
12942 case '"': dquote++; break;
12943 case '\\': case '\t': case '\r': case '\n':
12944 incr = 2;
12945 break;
12946 default:
12947 /* Fast-path ASCII */
12948 if (ch < ' ' || ch == 0x7f)
12949 incr = 4; /* \xHH */
12950 else if (ch < 0x7f)
12951 ;
12952 else if (Py_UNICODE_ISPRINTABLE(ch))
12953 max = ch > max ? ch : max;
12954 else if (ch < 0x100)
12955 incr = 4; /* \xHH */
12956 else if (ch < 0x10000)
12957 incr = 6; /* \uHHHH */
12958 else
12959 incr = 10; /* \uHHHHHHHH */
12960 }
12961 if (osize > PY_SSIZE_T_MAX - incr) {
12962 PyErr_SetString(PyExc_OverflowError,
12963 "string is too long to generate repr");
12964 return NULL;
12965 }
12966 osize += incr;
12967 }
12968
12969 quote = '\'';
12970 unchanged = (osize == isize);
12971 if (squote) {
12972 unchanged = 0;
12973 if (dquote)
12974 /* Both squote and dquote present. Use squote,
12975 and escape them */
12976 osize += squote;
12977 else
12978 quote = '"';
12979 }
12980 osize += 2; /* quotes */
12981
12982 repr = PyUnicode_New(osize, max);
12983 if (repr == NULL)
12984 return NULL;
12985 okind = PyUnicode_KIND(repr);
12986 odata = PyUnicode_DATA(repr);
12987
12988 PyUnicode_WRITE(okind, odata, 0, quote);
12989 PyUnicode_WRITE(okind, odata, osize-1, quote);
12990 if (unchanged) {
12991 _PyUnicode_FastCopyCharacters(repr, 1,
12992 unicode, 0,
12993 isize);
12994 }
12995 else {
12996 for (i = 0, o = 1; i < isize; i++) {
12997 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12998
12999 /* Escape quotes and backslashes */
13000 if ((ch == quote) || (ch == '\\')) {
13001 PyUnicode_WRITE(okind, odata, o++, '\\');
13002 PyUnicode_WRITE(okind, odata, o++, ch);
13003 continue;
13004 }
13005
13006 /* Map special whitespace to '\t', \n', '\r' */
13007 if (ch == '\t') {
13008 PyUnicode_WRITE(okind, odata, o++, '\\');
13009 PyUnicode_WRITE(okind, odata, o++, 't');
13010 }
13011 else if (ch == '\n') {
13012 PyUnicode_WRITE(okind, odata, o++, '\\');
13013 PyUnicode_WRITE(okind, odata, o++, 'n');
13014 }
13015 else if (ch == '\r') {
13016 PyUnicode_WRITE(okind, odata, o++, '\\');
13017 PyUnicode_WRITE(okind, odata, o++, 'r');
13018 }
13019
13020 /* Map non-printable US ASCII to '\xhh' */
13021 else if (ch < ' ' || ch == 0x7F) {
13022 PyUnicode_WRITE(okind, odata, o++, '\\');
13023 PyUnicode_WRITE(okind, odata, o++, 'x');
13024 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13025 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13026 }
13027
13028 /* Copy ASCII characters as-is */
13029 else if (ch < 0x7F) {
13030 PyUnicode_WRITE(okind, odata, o++, ch);
13031 }
13032
13033 /* Non-ASCII characters */
13034 else {
13035 /* Map Unicode whitespace and control characters
13036 (categories Z* and C* except ASCII space)
13037 */
13038 if (!Py_UNICODE_ISPRINTABLE(ch)) {
13039 PyUnicode_WRITE(okind, odata, o++, '\\');
13040 /* Map 8-bit characters to '\xhh' */
13041 if (ch <= 0xff) {
13042 PyUnicode_WRITE(okind, odata, o++, 'x');
13043 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13044 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13045 }
13046 /* Map 16-bit characters to '\uxxxx' */
13047 else if (ch <= 0xffff) {
13048 PyUnicode_WRITE(okind, odata, o++, 'u');
13049 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13050 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13051 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13052 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13053 }
13054 /* Map 21-bit characters to '\U00xxxxxx' */
13055 else {
13056 PyUnicode_WRITE(okind, odata, o++, 'U');
13057 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
13058 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
13059 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
13060 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
13061 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13062 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13063 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13064 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13065 }
13066 }
13067 /* Copy characters as-is */
13068 else {
13069 PyUnicode_WRITE(okind, odata, o++, ch);
13070 }
13071 }
13072 }
13073 }
13074 /* Closing quote already added at the beginning */
13075 assert(_PyUnicode_CheckConsistency(repr, 1));
13076 return repr;
13077 }
13078
/* Docstring text for str.rfind(), defined through PyDoc_STRVAR as
   rfind__doc__. */
PyDoc_STRVAR(rfind__doc__,
"S.rfind(sub[, start[, end]]) -> int\n\
\n\
Return the highest index in S where substring sub is found,\n\
such that sub is contained within S[start:end]. Optional\n\
arguments start and end are interpreted as in slice notation.\n\
\n\
Return -1 on failure.");
13087
13088 static PyObject *
unicode_rfind(PyObject * self,PyObject * args)13089 unicode_rfind(PyObject *self, PyObject *args)
13090 {
13091 /* initialize variables to prevent gcc warning */
13092 PyObject *substring = NULL;
13093 Py_ssize_t start = 0;
13094 Py_ssize_t end = 0;
13095 Py_ssize_t result;
13096
13097 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
13098 return NULL;
13099
13100 if (PyUnicode_READY(self) == -1)
13101 return NULL;
13102
13103 result = any_find_slice(self, substring, start, end, -1);
13104
13105 if (result == -2)
13106 return NULL;
13107
13108 return PyLong_FromSsize_t(result);
13109 }
13110
13111 PyDoc_STRVAR(rindex__doc__,
13112 "S.rindex(sub[, start[, end]]) -> int\n\
13113 \n\
13114 Return the highest index in S where substring sub is found,\n\
13115 such that sub is contained within S[start:end]. Optional\n\
13116 arguments start and end are interpreted as in slice notation.\n\
13117 \n\
13118 Raises ValueError when the substring is not found.");
13119
13120 static PyObject *
unicode_rindex(PyObject * self,PyObject * args)13121 unicode_rindex(PyObject *self, PyObject *args)
13122 {
13123 /* initialize variables to prevent gcc warning */
13124 PyObject *substring = NULL;
13125 Py_ssize_t start = 0;
13126 Py_ssize_t end = 0;
13127 Py_ssize_t result;
13128
13129 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
13130 return NULL;
13131
13132 if (PyUnicode_READY(self) == -1)
13133 return NULL;
13134
13135 result = any_find_slice(self, substring, start, end, -1);
13136
13137 if (result == -2)
13138 return NULL;
13139
13140 if (result < 0) {
13141 PyErr_SetString(PyExc_ValueError, "substring not found");
13142 return NULL;
13143 }
13144
13145 return PyLong_FromSsize_t(result);
13146 }
13147
13148 /*[clinic input]
13149 str.rjust as unicode_rjust
13150
13151 width: Py_ssize_t
13152 fillchar: Py_UCS4 = ' '
13153 /
13154
13155 Return a right-justified string of length width.
13156
13157 Padding is done using the specified fill character (default is a space).
13158 [clinic start generated code]*/
13159
13160 static PyObject *
unicode_rjust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)13161 unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13162 /*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
13163 {
13164 if (PyUnicode_READY(self) == -1)
13165 return NULL;
13166
13167 if (PyUnicode_GET_LENGTH(self) >= width)
13168 return unicode_result_unchanged(self);
13169
13170 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
13171 }
13172
13173 PyObject *
PyUnicode_Split(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)13174 PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13175 {
13176 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13177 return NULL;
13178
13179 return split(s, sep, maxsplit);
13180 }
13181
13182 /*[clinic input]
13183 str.split as unicode_split
13184
13185 sep: object = None
13186 The separator used to split the string.
13187
13188 When set to None (the default value), will split on any whitespace
13189 character (including \\n \\r \\t \\f and spaces) and will discard
13190 empty strings from the result.
13191 maxsplit: Py_ssize_t = -1
13192 Maximum number of splits (starting from the left).
13193 -1 (the default value) means no limit.
13194
13195 Return a list of the substrings in the string, using sep as the separator string.
13196
13197 Note, str.split() is mainly useful for data that has been intentionally
13198 delimited. With natural text that includes punctuation, consider using
13199 the regular expression module.
13200
13201 [clinic start generated code]*/
13202
13203 static PyObject *
unicode_split_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)13204 unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13205 /*[clinic end generated code: output=3a65b1db356948dc input=906d953b44efc43b]*/
13206 {
13207 if (sep == Py_None)
13208 return split(self, NULL, maxsplit);
13209 if (PyUnicode_Check(sep))
13210 return split(self, sep, maxsplit);
13211
13212 PyErr_Format(PyExc_TypeError,
13213 "must be str or None, not %.100s",
13214 Py_TYPE(sep)->tp_name);
13215 return NULL;
13216 }
13217
13218 PyObject *
PyUnicode_Partition(PyObject * str_obj,PyObject * sep_obj)13219 PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
13220 {
13221 PyObject* out;
13222 int kind1, kind2;
13223 const void *buf1, *buf2;
13224 Py_ssize_t len1, len2;
13225
13226 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
13227 return NULL;
13228
13229 kind1 = PyUnicode_KIND(str_obj);
13230 kind2 = PyUnicode_KIND(sep_obj);
13231 len1 = PyUnicode_GET_LENGTH(str_obj);
13232 len2 = PyUnicode_GET_LENGTH(sep_obj);
13233 if (kind1 < kind2 || len1 < len2) {
13234 PyObject *empty = unicode_get_empty(); // Borrowed reference
13235 return PyTuple_Pack(3, str_obj, empty, empty);
13236 }
13237 buf1 = PyUnicode_DATA(str_obj);
13238 buf2 = PyUnicode_DATA(sep_obj);
13239 if (kind2 != kind1) {
13240 buf2 = unicode_askind(kind2, buf2, len2, kind1);
13241 if (!buf2)
13242 return NULL;
13243 }
13244
13245 switch (kind1) {
13246 case PyUnicode_1BYTE_KIND:
13247 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13248 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13249 else
13250 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13251 break;
13252 case PyUnicode_2BYTE_KIND:
13253 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13254 break;
13255 case PyUnicode_4BYTE_KIND:
13256 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13257 break;
13258 default:
13259 Py_UNREACHABLE();
13260 }
13261
13262 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
13263 if (kind2 != kind1)
13264 PyMem_Free((void *)buf2);
13265
13266 return out;
13267 }
13268
13269
13270 PyObject *
PyUnicode_RPartition(PyObject * str_obj,PyObject * sep_obj)13271 PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
13272 {
13273 PyObject* out;
13274 int kind1, kind2;
13275 const void *buf1, *buf2;
13276 Py_ssize_t len1, len2;
13277
13278 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
13279 return NULL;
13280
13281 kind1 = PyUnicode_KIND(str_obj);
13282 kind2 = PyUnicode_KIND(sep_obj);
13283 len1 = PyUnicode_GET_LENGTH(str_obj);
13284 len2 = PyUnicode_GET_LENGTH(sep_obj);
13285 if (kind1 < kind2 || len1 < len2) {
13286 PyObject *empty = unicode_get_empty(); // Borrowed reference
13287 return PyTuple_Pack(3, empty, empty, str_obj);
13288 }
13289 buf1 = PyUnicode_DATA(str_obj);
13290 buf2 = PyUnicode_DATA(sep_obj);
13291 if (kind2 != kind1) {
13292 buf2 = unicode_askind(kind2, buf2, len2, kind1);
13293 if (!buf2)
13294 return NULL;
13295 }
13296
13297 switch (kind1) {
13298 case PyUnicode_1BYTE_KIND:
13299 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13300 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13301 else
13302 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13303 break;
13304 case PyUnicode_2BYTE_KIND:
13305 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13306 break;
13307 case PyUnicode_4BYTE_KIND:
13308 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13309 break;
13310 default:
13311 Py_UNREACHABLE();
13312 }
13313
13314 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
13315 if (kind2 != kind1)
13316 PyMem_Free((void *)buf2);
13317
13318 return out;
13319 }
13320
13321 /*[clinic input]
13322 str.partition as unicode_partition
13323
13324 sep: object
13325 /
13326
13327 Partition the string into three parts using the given separator.
13328
13329 This will search for the separator in the string. If the separator is found,
13330 returns a 3-tuple containing the part before the separator, the separator
13331 itself, and the part after it.
13332
13333 If the separator is not found, returns a 3-tuple containing the original string
13334 and two empty strings.
13335 [clinic start generated code]*/
13336
13337 static PyObject *
unicode_partition(PyObject * self,PyObject * sep)13338 unicode_partition(PyObject *self, PyObject *sep)
13339 /*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
13340 {
13341 return PyUnicode_Partition(self, sep);
13342 }
13343
13344 /*[clinic input]
13345 str.rpartition as unicode_rpartition = str.partition
13346
13347 Partition the string into three parts using the given separator.
13348
13349 This will search for the separator in the string, starting at the end. If
13350 the separator is found, returns a 3-tuple containing the part before the
13351 separator, the separator itself, and the part after it.
13352
13353 If the separator is not found, returns a 3-tuple containing two empty strings
13354 and the original string.
13355 [clinic start generated code]*/
13356
13357 static PyObject *
unicode_rpartition(PyObject * self,PyObject * sep)13358 unicode_rpartition(PyObject *self, PyObject *sep)
13359 /*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
13360 {
13361 return PyUnicode_RPartition(self, sep);
13362 }
13363
13364 PyObject *
PyUnicode_RSplit(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)13365 PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13366 {
13367 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13368 return NULL;
13369
13370 return rsplit(s, sep, maxsplit);
13371 }
13372
13373 /*[clinic input]
13374 str.rsplit as unicode_rsplit = str.split
13375
13376 Return a list of the substrings in the string, using sep as the separator string.
13377
13378 Splitting starts at the end of the string and works to the front.
13379 [clinic start generated code]*/
13380
13381 static PyObject *
unicode_rsplit_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)13382 unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13383 /*[clinic end generated code: output=c2b815c63bcabffc input=ea78406060fce33c]*/
13384 {
13385 if (sep == Py_None)
13386 return rsplit(self, NULL, maxsplit);
13387 if (PyUnicode_Check(sep))
13388 return rsplit(self, sep, maxsplit);
13389
13390 PyErr_Format(PyExc_TypeError,
13391 "must be str or None, not %.100s",
13392 Py_TYPE(sep)->tp_name);
13393 return NULL;
13394 }
13395
13396 /*[clinic input]
13397 str.splitlines as unicode_splitlines
13398
13399 keepends: bool(accept={int}) = False
13400
13401 Return a list of the lines in the string, breaking at line boundaries.
13402
13403 Line breaks are not included in the resulting list unless keepends is given and
13404 true.
13405 [clinic start generated code]*/
13406
13407 static PyObject *
unicode_splitlines_impl(PyObject * self,int keepends)13408 unicode_splitlines_impl(PyObject *self, int keepends)
13409 /*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
13410 {
13411 return PyUnicode_Splitlines(self, keepends);
13412 }
13413
13414 static
unicode_str(PyObject * self)13415 PyObject *unicode_str(PyObject *self)
13416 {
13417 return unicode_result_unchanged(self);
13418 }
13419
13420 /*[clinic input]
13421 str.swapcase as unicode_swapcase
13422
13423 Convert uppercase characters to lowercase and lowercase characters to uppercase.
13424 [clinic start generated code]*/
13425
13426 static PyObject *
unicode_swapcase_impl(PyObject * self)13427 unicode_swapcase_impl(PyObject *self)
13428 /*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
13429 {
13430 if (PyUnicode_READY(self) == -1)
13431 return NULL;
13432 return case_operation(self, do_swapcase);
13433 }
13434
13435 /*[clinic input]
13436
13437 @staticmethod
13438 str.maketrans as unicode_maketrans
13439
13440 x: object
13441
13442 y: unicode=NULL
13443
13444 z: unicode=NULL
13445
13446 /
13447
13448 Return a translation table usable for str.translate().
13449
13450 If there is only one argument, it must be a dictionary mapping Unicode
13451 ordinals (integers) or characters to Unicode ordinals, strings or None.
13452 Character keys will be then converted to ordinals.
13453 If there are two arguments, they must be strings of equal length, and
13454 in the resulting dictionary, each character in x will be mapped to the
13455 character at the same position in y. If there is a third argument, it
13456 must be a string, whose characters will be mapped to None in the result.
13457 [clinic start generated code]*/
13458
13459 static PyObject *
unicode_maketrans_impl(PyObject * x,PyObject * y,PyObject * z)13460 unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
13461 /*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
13462 {
13463 PyObject *new = NULL, *key, *value;
13464 Py_ssize_t i = 0;
13465 int res;
13466
13467 new = PyDict_New();
13468 if (!new)
13469 return NULL;
13470 if (y != NULL) {
13471 int x_kind, y_kind, z_kind;
13472 const void *x_data, *y_data, *z_data;
13473
13474 /* x must be a string too, of equal length */
13475 if (!PyUnicode_Check(x)) {
13476 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13477 "be a string if there is a second argument");
13478 goto err;
13479 }
13480 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
13481 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13482 "arguments must have equal length");
13483 goto err;
13484 }
13485 /* create entries for translating chars in x to those in y */
13486 x_kind = PyUnicode_KIND(x);
13487 y_kind = PyUnicode_KIND(y);
13488 x_data = PyUnicode_DATA(x);
13489 y_data = PyUnicode_DATA(y);
13490 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13491 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
13492 if (!key)
13493 goto err;
13494 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
13495 if (!value) {
13496 Py_DECREF(key);
13497 goto err;
13498 }
13499 res = PyDict_SetItem(new, key, value);
13500 Py_DECREF(key);
13501 Py_DECREF(value);
13502 if (res < 0)
13503 goto err;
13504 }
13505 /* create entries for deleting chars in z */
13506 if (z != NULL) {
13507 z_kind = PyUnicode_KIND(z);
13508 z_data = PyUnicode_DATA(z);
13509 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
13510 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
13511 if (!key)
13512 goto err;
13513 res = PyDict_SetItem(new, key, Py_None);
13514 Py_DECREF(key);
13515 if (res < 0)
13516 goto err;
13517 }
13518 }
13519 } else {
13520 int kind;
13521 const void *data;
13522
13523 /* x must be a dict */
13524 if (!PyDict_CheckExact(x)) {
13525 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13526 "to maketrans it must be a dict");
13527 goto err;
13528 }
13529 /* copy entries into the new dict, converting string keys to int keys */
13530 while (PyDict_Next(x, &i, &key, &value)) {
13531 if (PyUnicode_Check(key)) {
13532 /* convert string keys to integer keys */
13533 PyObject *newkey;
13534 if (PyUnicode_GET_LENGTH(key) != 1) {
13535 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13536 "table must be of length 1");
13537 goto err;
13538 }
13539 kind = PyUnicode_KIND(key);
13540 data = PyUnicode_DATA(key);
13541 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13542 if (!newkey)
13543 goto err;
13544 res = PyDict_SetItem(new, newkey, value);
13545 Py_DECREF(newkey);
13546 if (res < 0)
13547 goto err;
13548 } else if (PyLong_Check(key)) {
13549 /* just keep integer keys */
13550 if (PyDict_SetItem(new, key, value) < 0)
13551 goto err;
13552 } else {
13553 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13554 "be strings or integers");
13555 goto err;
13556 }
13557 }
13558 }
13559 return new;
13560 err:
13561 Py_DECREF(new);
13562 return NULL;
13563 }
13564
13565 /*[clinic input]
13566 str.translate as unicode_translate
13567
13568 table: object
13569 Translation table, which must be a mapping of Unicode ordinals to
13570 Unicode ordinals, strings, or None.
13571 /
13572
13573 Replace each character in the string using the given translation table.
13574
13575 The table must implement lookup/indexing via __getitem__, for instance a
13576 dictionary or list. If this operation raises LookupError, the character is
13577 left untouched. Characters mapped to None are deleted.
13578 [clinic start generated code]*/
13579
13580 static PyObject *
unicode_translate(PyObject * self,PyObject * table)13581 unicode_translate(PyObject *self, PyObject *table)
13582 /*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
13583 {
13584 return _PyUnicode_TranslateCharmap(self, table, "ignore");
13585 }
13586
13587 /*[clinic input]
13588 str.upper as unicode_upper
13589
13590 Return a copy of the string converted to uppercase.
13591 [clinic start generated code]*/
13592
13593 static PyObject *
unicode_upper_impl(PyObject * self)13594 unicode_upper_impl(PyObject *self)
13595 /*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
13596 {
13597 if (PyUnicode_READY(self) == -1)
13598 return NULL;
13599 if (PyUnicode_IS_ASCII(self))
13600 return ascii_upper_or_lower(self, 0);
13601 return case_operation(self, do_upper);
13602 }
13603
13604 /*[clinic input]
13605 str.zfill as unicode_zfill
13606
13607 width: Py_ssize_t
13608 /
13609
13610 Pad a numeric string with zeros on the left, to fill a field of the given width.
13611
13612 The string is never truncated.
13613 [clinic start generated code]*/
13614
13615 static PyObject *
unicode_zfill_impl(PyObject * self,Py_ssize_t width)13616 unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13617 /*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
13618 {
13619 Py_ssize_t fill;
13620 PyObject *u;
13621 int kind;
13622 const void *data;
13623 Py_UCS4 chr;
13624
13625 if (PyUnicode_READY(self) == -1)
13626 return NULL;
13627
13628 if (PyUnicode_GET_LENGTH(self) >= width)
13629 return unicode_result_unchanged(self);
13630
13631 fill = width - PyUnicode_GET_LENGTH(self);
13632
13633 u = pad(self, fill, 0, '0');
13634
13635 if (u == NULL)
13636 return NULL;
13637
13638 kind = PyUnicode_KIND(u);
13639 data = PyUnicode_DATA(u);
13640 chr = PyUnicode_READ(kind, data, fill);
13641
13642 if (chr == '+' || chr == '-') {
13643 /* move sign to beginning of string */
13644 PyUnicode_WRITE(kind, data, 0, chr);
13645 PyUnicode_WRITE(kind, data, fill, '0');
13646 }
13647
13648 assert(_PyUnicode_CheckConsistency(u, 1));
13649 return u;
13650 }
13651
13652 PyDoc_STRVAR(startswith__doc__,
13653 "S.startswith(prefix[, start[, end]]) -> bool\n\
13654 \n\
13655 Return True if S starts with the specified prefix, False otherwise.\n\
13656 With optional start, test S beginning at that position.\n\
13657 With optional end, stop comparing S at that position.\n\
13658 prefix can also be a tuple of strings to try.");
13659
13660 static PyObject *
unicode_startswith(PyObject * self,PyObject * args)13661 unicode_startswith(PyObject *self,
13662 PyObject *args)
13663 {
13664 PyObject *subobj;
13665 PyObject *substring;
13666 Py_ssize_t start = 0;
13667 Py_ssize_t end = PY_SSIZE_T_MAX;
13668 int result;
13669
13670 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
13671 return NULL;
13672 if (PyTuple_Check(subobj)) {
13673 Py_ssize_t i;
13674 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13675 substring = PyTuple_GET_ITEM(subobj, i);
13676 if (!PyUnicode_Check(substring)) {
13677 PyErr_Format(PyExc_TypeError,
13678 "tuple for startswith must only contain str, "
13679 "not %.100s",
13680 Py_TYPE(substring)->tp_name);
13681 return NULL;
13682 }
13683 result = tailmatch(self, substring, start, end, -1);
13684 if (result == -1)
13685 return NULL;
13686 if (result) {
13687 Py_RETURN_TRUE;
13688 }
13689 }
13690 /* nothing matched */
13691 Py_RETURN_FALSE;
13692 }
13693 if (!PyUnicode_Check(subobj)) {
13694 PyErr_Format(PyExc_TypeError,
13695 "startswith first arg must be str or "
13696 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13697 return NULL;
13698 }
13699 result = tailmatch(self, subobj, start, end, -1);
13700 if (result == -1)
13701 return NULL;
13702 return PyBool_FromLong(result);
13703 }
13704
13705
13706 PyDoc_STRVAR(endswith__doc__,
13707 "S.endswith(suffix[, start[, end]]) -> bool\n\
13708 \n\
13709 Return True if S ends with the specified suffix, False otherwise.\n\
13710 With optional start, test S beginning at that position.\n\
13711 With optional end, stop comparing S at that position.\n\
13712 suffix can also be a tuple of strings to try.");
13713
13714 static PyObject *
unicode_endswith(PyObject * self,PyObject * args)13715 unicode_endswith(PyObject *self,
13716 PyObject *args)
13717 {
13718 PyObject *subobj;
13719 PyObject *substring;
13720 Py_ssize_t start = 0;
13721 Py_ssize_t end = PY_SSIZE_T_MAX;
13722 int result;
13723
13724 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
13725 return NULL;
13726 if (PyTuple_Check(subobj)) {
13727 Py_ssize_t i;
13728 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13729 substring = PyTuple_GET_ITEM(subobj, i);
13730 if (!PyUnicode_Check(substring)) {
13731 PyErr_Format(PyExc_TypeError,
13732 "tuple for endswith must only contain str, "
13733 "not %.100s",
13734 Py_TYPE(substring)->tp_name);
13735 return NULL;
13736 }
13737 result = tailmatch(self, substring, start, end, +1);
13738 if (result == -1)
13739 return NULL;
13740 if (result) {
13741 Py_RETURN_TRUE;
13742 }
13743 }
13744 Py_RETURN_FALSE;
13745 }
13746 if (!PyUnicode_Check(subobj)) {
13747 PyErr_Format(PyExc_TypeError,
13748 "endswith first arg must be str or "
13749 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13750 return NULL;
13751 }
13752 result = tailmatch(self, subobj, start, end, +1);
13753 if (result == -1)
13754 return NULL;
13755 return PyBool_FromLong(result);
13756 }
13757
13758 static inline void
_PyUnicodeWriter_Update(_PyUnicodeWriter * writer)13759 _PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
13760 {
13761 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13762 writer->data = PyUnicode_DATA(writer->buffer);
13763
13764 if (!writer->readonly) {
13765 writer->kind = PyUnicode_KIND(writer->buffer);
13766 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13767 }
13768 else {
13769 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13770 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13771 writer->kind = PyUnicode_WCHAR_KIND;
13772 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13773
13774 /* Copy-on-write mode: set buffer size to 0 so
13775 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13776 * next write. */
13777 writer->size = 0;
13778 }
13779 }
13780
13781 void
_PyUnicodeWriter_Init(_PyUnicodeWriter * writer)13782 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
13783 {
13784 memset(writer, 0, sizeof(*writer));
13785
13786 /* ASCII is the bare minimum */
13787 writer->min_char = 127;
13788
13789 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13790 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13791 writer->kind = PyUnicode_WCHAR_KIND;
13792 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13793 }
13794
13795 // Initialize _PyUnicodeWriter with initial buffer
13796 static inline void
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter * writer,PyObject * buffer)13797 _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13798 {
13799 memset(writer, 0, sizeof(*writer));
13800 writer->buffer = buffer;
13801 _PyUnicodeWriter_Update(writer);
13802 writer->min_length = writer->size;
13803 }
13804
13805 int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter * writer,Py_ssize_t length,Py_UCS4 maxchar)13806 _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13807 Py_ssize_t length, Py_UCS4 maxchar)
13808 {
13809 Py_ssize_t newlen;
13810 PyObject *newbuffer;
13811
13812 assert(maxchar <= MAX_UNICODE);
13813
13814 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
13815 assert((maxchar > writer->maxchar && length >= 0)
13816 || length > 0);
13817
13818 if (length > PY_SSIZE_T_MAX - writer->pos) {
13819 PyErr_NoMemory();
13820 return -1;
13821 }
13822 newlen = writer->pos + length;
13823
13824 maxchar = Py_MAX(maxchar, writer->min_char);
13825
13826 if (writer->buffer == NULL) {
13827 assert(!writer->readonly);
13828 if (writer->overallocate
13829 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13830 /* overallocate to limit the number of realloc() */
13831 newlen += newlen / OVERALLOCATE_FACTOR;
13832 }
13833 if (newlen < writer->min_length)
13834 newlen = writer->min_length;
13835
13836 writer->buffer = PyUnicode_New(newlen, maxchar);
13837 if (writer->buffer == NULL)
13838 return -1;
13839 }
13840 else if (newlen > writer->size) {
13841 if (writer->overallocate
13842 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13843 /* overallocate to limit the number of realloc() */
13844 newlen += newlen / OVERALLOCATE_FACTOR;
13845 }
13846 if (newlen < writer->min_length)
13847 newlen = writer->min_length;
13848
13849 if (maxchar > writer->maxchar || writer->readonly) {
13850 /* resize + widen */
13851 maxchar = Py_MAX(maxchar, writer->maxchar);
13852 newbuffer = PyUnicode_New(newlen, maxchar);
13853 if (newbuffer == NULL)
13854 return -1;
13855 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13856 writer->buffer, 0, writer->pos);
13857 Py_DECREF(writer->buffer);
13858 writer->readonly = 0;
13859 }
13860 else {
13861 newbuffer = resize_compact(writer->buffer, newlen);
13862 if (newbuffer == NULL)
13863 return -1;
13864 }
13865 writer->buffer = newbuffer;
13866 }
13867 else if (maxchar > writer->maxchar) {
13868 assert(!writer->readonly);
13869 newbuffer = PyUnicode_New(writer->size, maxchar);
13870 if (newbuffer == NULL)
13871 return -1;
13872 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13873 writer->buffer, 0, writer->pos);
13874 Py_SETREF(writer->buffer, newbuffer);
13875 }
13876 _PyUnicodeWriter_Update(writer);
13877 return 0;
13878
13879 #undef OVERALLOCATE_FACTOR
13880 }
13881
13882 int
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter * writer,enum PyUnicode_Kind kind)13883 _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13884 enum PyUnicode_Kind kind)
13885 {
13886 Py_UCS4 maxchar;
13887
13888 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13889 assert(writer->kind < kind);
13890
13891 switch (kind)
13892 {
13893 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13894 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13895 case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
13896 default:
13897 Py_UNREACHABLE();
13898 }
13899
13900 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13901 }
13902
13903 static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter * writer,Py_UCS4 ch)13904 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
13905 {
13906 assert(ch <= MAX_UNICODE);
13907 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13908 return -1;
13909 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13910 writer->pos++;
13911 return 0;
13912 }
13913
13914 int
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter * writer,Py_UCS4 ch)13915 _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13916 {
13917 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13918 }
13919
13920 int
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter * writer,PyObject * str)13921 _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13922 {
13923 Py_UCS4 maxchar;
13924 Py_ssize_t len;
13925
13926 if (PyUnicode_READY(str) == -1)
13927 return -1;
13928 len = PyUnicode_GET_LENGTH(str);
13929 if (len == 0)
13930 return 0;
13931 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13932 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13933 if (writer->buffer == NULL && !writer->overallocate) {
13934 assert(_PyUnicode_CheckConsistency(str, 1));
13935 writer->readonly = 1;
13936 Py_INCREF(str);
13937 writer->buffer = str;
13938 _PyUnicodeWriter_Update(writer);
13939 writer->pos += len;
13940 return 0;
13941 }
13942 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13943 return -1;
13944 }
13945 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13946 str, 0, len);
13947 writer->pos += len;
13948 return 0;
13949 }
13950
13951 int
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t start,Py_ssize_t end)13952 _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13953 Py_ssize_t start, Py_ssize_t end)
13954 {
13955 Py_UCS4 maxchar;
13956 Py_ssize_t len;
13957
13958 if (PyUnicode_READY(str) == -1)
13959 return -1;
13960
13961 assert(0 <= start);
13962 assert(end <= PyUnicode_GET_LENGTH(str));
13963 assert(start <= end);
13964
13965 if (end == 0)
13966 return 0;
13967
13968 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13969 return _PyUnicodeWriter_WriteStr(writer, str);
13970
13971 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13972 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13973 else
13974 maxchar = writer->maxchar;
13975 len = end - start;
13976
13977 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13978 return -1;
13979
13980 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13981 str, start, len);
13982 writer->pos += len;
13983 return 0;
13984 }
13985
13986 int
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter * writer,const char * ascii,Py_ssize_t len)13987 _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13988 const char *ascii, Py_ssize_t len)
13989 {
13990 if (len == -1)
13991 len = strlen(ascii);
13992
13993 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
13994
13995 if (writer->buffer == NULL && !writer->overallocate) {
13996 PyObject *str;
13997
13998 str = _PyUnicode_FromASCII(ascii, len);
13999 if (str == NULL)
14000 return -1;
14001
14002 writer->readonly = 1;
14003 writer->buffer = str;
14004 _PyUnicodeWriter_Update(writer);
14005 writer->pos += len;
14006 return 0;
14007 }
14008
14009 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
14010 return -1;
14011
14012 switch (writer->kind)
14013 {
14014 case PyUnicode_1BYTE_KIND:
14015 {
14016 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
14017 Py_UCS1 *data = writer->data;
14018
14019 memcpy(data + writer->pos, str, len);
14020 break;
14021 }
14022 case PyUnicode_2BYTE_KIND:
14023 {
14024 _PyUnicode_CONVERT_BYTES(
14025 Py_UCS1, Py_UCS2,
14026 ascii, ascii + len,
14027 (Py_UCS2 *)writer->data + writer->pos);
14028 break;
14029 }
14030 case PyUnicode_4BYTE_KIND:
14031 {
14032 _PyUnicode_CONVERT_BYTES(
14033 Py_UCS1, Py_UCS4,
14034 ascii, ascii + len,
14035 (Py_UCS4 *)writer->data + writer->pos);
14036 break;
14037 }
14038 default:
14039 Py_UNREACHABLE();
14040 }
14041
14042 writer->pos += len;
14043 return 0;
14044 }
14045
14046 int
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter * writer,const char * str,Py_ssize_t len)14047 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14048 const char *str, Py_ssize_t len)
14049 {
14050 Py_UCS4 maxchar;
14051
14052 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
14053 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14054 return -1;
14055 unicode_write_cstr(writer->buffer, writer->pos, str, len);
14056 writer->pos += len;
14057 return 0;
14058 }
14059
14060 PyObject *
_PyUnicodeWriter_Finish(_PyUnicodeWriter * writer)14061 _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
14062 {
14063 PyObject *str;
14064
14065 if (writer->pos == 0) {
14066 Py_CLEAR(writer->buffer);
14067 _Py_RETURN_UNICODE_EMPTY();
14068 }
14069
14070 str = writer->buffer;
14071 writer->buffer = NULL;
14072
14073 if (writer->readonly) {
14074 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14075 return str;
14076 }
14077
14078 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14079 PyObject *str2;
14080 str2 = resize_compact(str, writer->pos);
14081 if (str2 == NULL) {
14082 Py_DECREF(str);
14083 return NULL;
14084 }
14085 str = str2;
14086 }
14087
14088 assert(_PyUnicode_CheckConsistency(str, 1));
14089 return unicode_result_ready(str);
14090 }
14091
14092 void
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter * writer)14093 _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
14094 {
14095 Py_CLEAR(writer->buffer);
14096 }
14097
14098 #include "stringlib/unicode_format.h"
14099
14100 PyDoc_STRVAR(format__doc__,
14101 "S.format(*args, **kwargs) -> str\n\
14102 \n\
14103 Return a formatted version of S, using substitutions from args and kwargs.\n\
14104 The substitutions are identified by braces ('{' and '}').");
14105
14106 PyDoc_STRVAR(format_map__doc__,
14107 "S.format_map(mapping) -> str\n\
14108 \n\
14109 Return a formatted version of S, using substitutions from mapping.\n\
14110 The substitutions are identified by braces ('{' and '}').");
14111
14112 /*[clinic input]
14113 str.__format__ as unicode___format__
14114
14115 format_spec: unicode
14116 /
14117
14118 Return a formatted version of the string as described by format_spec.
14119 [clinic start generated code]*/
14120
14121 static PyObject *
unicode___format___impl(PyObject * self,PyObject * format_spec)14122 unicode___format___impl(PyObject *self, PyObject *format_spec)
14123 /*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
14124 {
14125 _PyUnicodeWriter writer;
14126 int ret;
14127
14128 if (PyUnicode_READY(self) == -1)
14129 return NULL;
14130 _PyUnicodeWriter_Init(&writer);
14131 ret = _PyUnicode_FormatAdvancedWriter(&writer,
14132 self, format_spec, 0,
14133 PyUnicode_GET_LENGTH(format_spec));
14134 if (ret == -1) {
14135 _PyUnicodeWriter_Dealloc(&writer);
14136 return NULL;
14137 }
14138 return _PyUnicodeWriter_Finish(&writer);
14139 }
14140
14141 /*[clinic input]
14142 str.__sizeof__ as unicode_sizeof
14143
14144 Return the size of the string in memory, in bytes.
14145 [clinic start generated code]*/
14146
14147 static PyObject *
unicode_sizeof_impl(PyObject * self)14148 unicode_sizeof_impl(PyObject *self)
14149 /*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
14150 {
14151 Py_ssize_t size;
14152
14153 /* If it's a compact object, account for base structure +
14154 character data. */
14155 if (PyUnicode_IS_COMPACT_ASCII(self))
14156 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14157 else if (PyUnicode_IS_COMPACT(self))
14158 size = sizeof(PyCompactUnicodeObject) +
14159 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
14160 else {
14161 /* If it is a two-block object, account for base object, and
14162 for character block if present. */
14163 size = sizeof(PyUnicodeObject);
14164 if (_PyUnicode_DATA_ANY(self))
14165 size += (PyUnicode_GET_LENGTH(self) + 1) *
14166 PyUnicode_KIND(self);
14167 }
14168 /* If the wstr pointer is present, account for it unless it is shared
14169 with the data pointer. Check if the data is not shared. */
14170 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14171 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14172 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14173 size += PyUnicode_UTF8_LENGTH(self) + 1;
14174
14175 return PyLong_FromSsize_t(size);
14176 }
14177
14178 static PyObject *
unicode_getnewargs(PyObject * v,PyObject * Py_UNUSED (ignored))14179 unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
14180 {
14181 PyObject *copy = _PyUnicode_Copy(v);
14182 if (!copy)
14183 return NULL;
14184 return Py_BuildValue("(N)", copy);
14185 }
14186
14187 static PyMethodDef unicode_methods[] = {
14188 UNICODE_ENCODE_METHODDEF
14189 UNICODE_REPLACE_METHODDEF
14190 UNICODE_SPLIT_METHODDEF
14191 UNICODE_RSPLIT_METHODDEF
14192 UNICODE_JOIN_METHODDEF
14193 UNICODE_CAPITALIZE_METHODDEF
14194 UNICODE_CASEFOLD_METHODDEF
14195 UNICODE_TITLE_METHODDEF
14196 UNICODE_CENTER_METHODDEF
14197 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
14198 UNICODE_EXPANDTABS_METHODDEF
14199 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
14200 UNICODE_PARTITION_METHODDEF
14201 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
14202 UNICODE_LJUST_METHODDEF
14203 UNICODE_LOWER_METHODDEF
14204 UNICODE_LSTRIP_METHODDEF
14205 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14206 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
14207 UNICODE_RJUST_METHODDEF
14208 UNICODE_RSTRIP_METHODDEF
14209 UNICODE_RPARTITION_METHODDEF
14210 UNICODE_SPLITLINES_METHODDEF
14211 UNICODE_STRIP_METHODDEF
14212 UNICODE_SWAPCASE_METHODDEF
14213 UNICODE_TRANSLATE_METHODDEF
14214 UNICODE_UPPER_METHODDEF
14215 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14216 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
14217 UNICODE_REMOVEPREFIX_METHODDEF
14218 UNICODE_REMOVESUFFIX_METHODDEF
14219 UNICODE_ISASCII_METHODDEF
14220 UNICODE_ISLOWER_METHODDEF
14221 UNICODE_ISUPPER_METHODDEF
14222 UNICODE_ISTITLE_METHODDEF
14223 UNICODE_ISSPACE_METHODDEF
14224 UNICODE_ISDECIMAL_METHODDEF
14225 UNICODE_ISDIGIT_METHODDEF
14226 UNICODE_ISNUMERIC_METHODDEF
14227 UNICODE_ISALPHA_METHODDEF
14228 UNICODE_ISALNUM_METHODDEF
14229 UNICODE_ISIDENTIFIER_METHODDEF
14230 UNICODE_ISPRINTABLE_METHODDEF
14231 UNICODE_ZFILL_METHODDEF
14232 {"format", _PyCFunction_CAST(do_string_format), METH_VARARGS | METH_KEYWORDS, format__doc__},
14233 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
14234 UNICODE___FORMAT___METHODDEF
14235 UNICODE_MAKETRANS_METHODDEF
14236 UNICODE_SIZEOF_METHODDEF
14237 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
14238 {NULL, NULL}
14239 };
14240
14241 static PyObject *
unicode_mod(PyObject * v,PyObject * w)14242 unicode_mod(PyObject *v, PyObject *w)
14243 {
14244 if (!PyUnicode_Check(v))
14245 Py_RETURN_NOTIMPLEMENTED;
14246 return PyUnicode_Format(v, w);
14247 }
14248
14249 static PyNumberMethods unicode_as_number = {
14250 0, /*nb_add*/
14251 0, /*nb_subtract*/
14252 0, /*nb_multiply*/
14253 unicode_mod, /*nb_remainder*/
14254 };
14255
14256 static PySequenceMethods unicode_as_sequence = {
14257 (lenfunc) unicode_length, /* sq_length */
14258 PyUnicode_Concat, /* sq_concat */
14259 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14260 (ssizeargfunc) unicode_getitem, /* sq_item */
14261 0, /* sq_slice */
14262 0, /* sq_ass_item */
14263 0, /* sq_ass_slice */
14264 PyUnicode_Contains, /* sq_contains */
14265 };
14266
14267 static PyObject*
unicode_subscript(PyObject * self,PyObject * item)14268 unicode_subscript(PyObject* self, PyObject* item)
14269 {
14270 if (PyUnicode_READY(self) == -1)
14271 return NULL;
14272
14273 if (_PyIndex_Check(item)) {
14274 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
14275 if (i == -1 && PyErr_Occurred())
14276 return NULL;
14277 if (i < 0)
14278 i += PyUnicode_GET_LENGTH(self);
14279 return unicode_getitem(self, i);
14280 } else if (PySlice_Check(item)) {
14281 Py_ssize_t start, stop, step, slicelength, i;
14282 size_t cur;
14283 PyObject *result;
14284 const void *src_data;
14285 void *dest_data;
14286 int src_kind, dest_kind;
14287 Py_UCS4 ch, max_char, kind_limit;
14288
14289 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
14290 return NULL;
14291 }
14292 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14293 &start, &stop, step);
14294
14295 if (slicelength <= 0) {
14296 _Py_RETURN_UNICODE_EMPTY();
14297 } else if (start == 0 && step == 1 &&
14298 slicelength == PyUnicode_GET_LENGTH(self)) {
14299 return unicode_result_unchanged(self);
14300 } else if (step == 1) {
14301 return PyUnicode_Substring(self,
14302 start, start + slicelength);
14303 }
14304 /* General case */
14305 src_kind = PyUnicode_KIND(self);
14306 src_data = PyUnicode_DATA(self);
14307 if (!PyUnicode_IS_ASCII(self)) {
14308 kind_limit = kind_maxchar_limit(src_kind);
14309 max_char = 0;
14310 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14311 ch = PyUnicode_READ(src_kind, src_data, cur);
14312 if (ch > max_char) {
14313 max_char = ch;
14314 if (max_char >= kind_limit)
14315 break;
14316 }
14317 }
14318 }
14319 else
14320 max_char = 127;
14321 result = PyUnicode_New(slicelength, max_char);
14322 if (result == NULL)
14323 return NULL;
14324 dest_kind = PyUnicode_KIND(result);
14325 dest_data = PyUnicode_DATA(result);
14326
14327 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14328 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14329 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
14330 }
14331 assert(_PyUnicode_CheckConsistency(result, 1));
14332 return result;
14333 } else {
14334 PyErr_Format(PyExc_TypeError, "string indices must be integers, not '%.200s'",
14335 Py_TYPE(item)->tp_name);
14336 return NULL;
14337 }
14338 }
14339
14340 static PyMappingMethods unicode_as_mapping = {
14341 (lenfunc)unicode_length, /* mp_length */
14342 (binaryfunc)unicode_subscript, /* mp_subscript */
14343 (objobjargproc)0, /* mp_ass_subscript */
14344 };
14345
14346
14347 /* Helpers for PyUnicode_Format() */
14348
14349 struct unicode_formatter_t {
14350 PyObject *args;
14351 int args_owned;
14352 Py_ssize_t arglen, argidx;
14353 PyObject *dict;
14354
14355 enum PyUnicode_Kind fmtkind;
14356 Py_ssize_t fmtcnt, fmtpos;
14357 const void *fmtdata;
14358 PyObject *fmtstr;
14359
14360 _PyUnicodeWriter writer;
14361 };
14362
14363 struct unicode_format_arg_t {
14364 Py_UCS4 ch;
14365 int flags;
14366 Py_ssize_t width;
14367 int prec;
14368 int sign;
14369 };
14370
14371 static PyObject *
unicode_format_getnextarg(struct unicode_formatter_t * ctx)14372 unicode_format_getnextarg(struct unicode_formatter_t *ctx)
14373 {
14374 Py_ssize_t argidx = ctx->argidx;
14375
14376 if (argidx < ctx->arglen) {
14377 ctx->argidx++;
14378 if (ctx->arglen < 0)
14379 return ctx->args;
14380 else
14381 return PyTuple_GetItem(ctx->args, argidx);
14382 }
14383 PyErr_SetString(PyExc_TypeError,
14384 "not enough arguments for format string");
14385 return NULL;
14386 }
14387
14388 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
14389
14390 /* Format a float into the writer if the writer is not NULL, or into *p_output
14391 otherwise.
14392
14393 Return 0 on success, raise an exception and return -1 on error. */
14394 static int
formatfloat(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14395 formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14396 PyObject **p_output,
14397 _PyUnicodeWriter *writer)
14398 {
14399 char *p;
14400 double x;
14401 Py_ssize_t len;
14402 int prec;
14403 int dtoa_flags = 0;
14404
14405 x = PyFloat_AsDouble(v);
14406 if (x == -1.0 && PyErr_Occurred())
14407 return -1;
14408
14409 prec = arg->prec;
14410 if (prec < 0)
14411 prec = 6;
14412
14413 if (arg->flags & F_ALT)
14414 dtoa_flags |= Py_DTSF_ALT;
14415 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
14416 if (p == NULL)
14417 return -1;
14418 len = strlen(p);
14419 if (writer) {
14420 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
14421 PyMem_Free(p);
14422 return -1;
14423 }
14424 }
14425 else
14426 *p_output = _PyUnicode_FromASCII(p, len);
14427 PyMem_Free(p);
14428 return 0;
14429 }
14430
14431 /* formatlong() emulates the format codes d, u, o, x and X, and
14432 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14433 * Python's regular ints.
14434 * Return value: a new PyUnicodeObject*, or NULL if error.
14435 * The output string is of the form
14436 * "-"? ("0x" | "0X")? digit+
14437 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14438 * set in flags. The case of hex digits will be correct,
14439 * There will be at least prec digits, zero-filled on the left if
14440 * necessary to get that many.
14441 * val object to be converted
14442 * flags bitmask of format flags; only F_ALT is looked at
14443 * prec minimum number of digits; 0-fill on left if needed
14444 * type a character in [duoxX]; u acts the same as d
14445 *
14446 * CAUTION: o, x and X conversions on regular ints can never
14447 * produce a '-' sign, but can for Python's unbounded ints.
14448 */
14449 PyObject *
_PyUnicode_FormatLong(PyObject * val,int alt,int prec,int type)14450 _PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
14451 {
14452 PyObject *result = NULL;
14453 char *buf;
14454 Py_ssize_t i;
14455 int sign; /* 1 if '-', else 0 */
14456 int len; /* number of characters */
14457 Py_ssize_t llen;
14458 int numdigits; /* len == numnondigits + numdigits */
14459 int numnondigits = 0;
14460
14461 /* Avoid exceeding SSIZE_T_MAX */
14462 if (prec > INT_MAX-3) {
14463 PyErr_SetString(PyExc_OverflowError,
14464 "precision too large");
14465 return NULL;
14466 }
14467
14468 assert(PyLong_Check(val));
14469
14470 switch (type) {
14471 default:
14472 Py_UNREACHABLE();
14473 case 'd':
14474 case 'i':
14475 case 'u':
14476 /* int and int subclasses should print numerically when a numeric */
14477 /* format code is used (see issue18780) */
14478 result = PyNumber_ToBase(val, 10);
14479 break;
14480 case 'o':
14481 numnondigits = 2;
14482 result = PyNumber_ToBase(val, 8);
14483 break;
14484 case 'x':
14485 case 'X':
14486 numnondigits = 2;
14487 result = PyNumber_ToBase(val, 16);
14488 break;
14489 }
14490 if (!result)
14491 return NULL;
14492
14493 assert(unicode_modifiable(result));
14494 assert(PyUnicode_IS_READY(result));
14495 assert(PyUnicode_IS_ASCII(result));
14496
14497 /* To modify the string in-place, there can only be one reference. */
14498 if (Py_REFCNT(result) != 1) {
14499 Py_DECREF(result);
14500 PyErr_BadInternalCall();
14501 return NULL;
14502 }
14503 buf = PyUnicode_DATA(result);
14504 llen = PyUnicode_GET_LENGTH(result);
14505 if (llen > INT_MAX) {
14506 Py_DECREF(result);
14507 PyErr_SetString(PyExc_ValueError,
14508 "string too large in _PyUnicode_FormatLong");
14509 return NULL;
14510 }
14511 len = (int)llen;
14512 sign = buf[0] == '-';
14513 numnondigits += sign;
14514 numdigits = len - numnondigits;
14515 assert(numdigits > 0);
14516
14517 /* Get rid of base marker unless F_ALT */
14518 if (((alt) == 0 &&
14519 (type == 'o' || type == 'x' || type == 'X'))) {
14520 assert(buf[sign] == '0');
14521 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14522 buf[sign+1] == 'o');
14523 numnondigits -= 2;
14524 buf += 2;
14525 len -= 2;
14526 if (sign)
14527 buf[0] = '-';
14528 assert(len == numnondigits + numdigits);
14529 assert(numdigits > 0);
14530 }
14531
14532 /* Fill with leading zeroes to meet minimum width. */
14533 if (prec > numdigits) {
14534 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14535 numnondigits + prec);
14536 char *b1;
14537 if (!r1) {
14538 Py_DECREF(result);
14539 return NULL;
14540 }
14541 b1 = PyBytes_AS_STRING(r1);
14542 for (i = 0; i < numnondigits; ++i)
14543 *b1++ = *buf++;
14544 for (i = 0; i < prec - numdigits; i++)
14545 *b1++ = '0';
14546 for (i = 0; i < numdigits; i++)
14547 *b1++ = *buf++;
14548 *b1 = '\0';
14549 Py_DECREF(result);
14550 result = r1;
14551 buf = PyBytes_AS_STRING(result);
14552 len = numnondigits + prec;
14553 }
14554
14555 /* Fix up case for hex conversions. */
14556 if (type == 'X') {
14557 /* Need to convert all lower case letters to upper case.
14558 and need to convert 0x to 0X (and -0x to -0X). */
14559 for (i = 0; i < len; i++)
14560 if (buf[i] >= 'a' && buf[i] <= 'x')
14561 buf[i] -= 'a'-'A';
14562 }
14563 if (!PyUnicode_Check(result)
14564 || buf != PyUnicode_DATA(result)) {
14565 PyObject *unicode;
14566 unicode = _PyUnicode_FromASCII(buf, len);
14567 Py_DECREF(result);
14568 result = unicode;
14569 }
14570 else if (len != PyUnicode_GET_LENGTH(result)) {
14571 if (PyUnicode_Resize(&result, len) < 0)
14572 Py_CLEAR(result);
14573 }
14574 return result;
14575 }
14576
14577 /* Format an integer or a float as an integer.
14578 * Return 1 if the number has been formatted into the writer,
14579 * 0 if the number has been formatted into *p_output
14580 * -1 and raise an exception on error */
14581 static int
mainformatlong(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14582 mainformatlong(PyObject *v,
14583 struct unicode_format_arg_t *arg,
14584 PyObject **p_output,
14585 _PyUnicodeWriter *writer)
14586 {
14587 PyObject *iobj, *res;
14588 char type = (char)arg->ch;
14589
14590 if (!PyNumber_Check(v))
14591 goto wrongtype;
14592
14593 /* make sure number is a type of integer for o, x, and X */
14594 if (!PyLong_Check(v)) {
14595 if (type == 'o' || type == 'x' || type == 'X') {
14596 iobj = _PyNumber_Index(v);
14597 }
14598 else {
14599 iobj = PyNumber_Long(v);
14600 }
14601 if (iobj == NULL ) {
14602 if (PyErr_ExceptionMatches(PyExc_TypeError))
14603 goto wrongtype;
14604 return -1;
14605 }
14606 assert(PyLong_Check(iobj));
14607 }
14608 else {
14609 iobj = v;
14610 Py_INCREF(iobj);
14611 }
14612
14613 if (PyLong_CheckExact(v)
14614 && arg->width == -1 && arg->prec == -1
14615 && !(arg->flags & (F_SIGN | F_BLANK))
14616 && type != 'X')
14617 {
14618 /* Fast path */
14619 int alternate = arg->flags & F_ALT;
14620 int base;
14621
14622 switch(type)
14623 {
14624 default:
14625 Py_UNREACHABLE();
14626 case 'd':
14627 case 'i':
14628 case 'u':
14629 base = 10;
14630 break;
14631 case 'o':
14632 base = 8;
14633 break;
14634 case 'x':
14635 case 'X':
14636 base = 16;
14637 break;
14638 }
14639
14640 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14641 Py_DECREF(iobj);
14642 return -1;
14643 }
14644 Py_DECREF(iobj);
14645 return 1;
14646 }
14647
14648 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
14649 Py_DECREF(iobj);
14650 if (res == NULL)
14651 return -1;
14652 *p_output = res;
14653 return 0;
14654
14655 wrongtype:
14656 switch(type)
14657 {
14658 case 'o':
14659 case 'x':
14660 case 'X':
14661 PyErr_Format(PyExc_TypeError,
14662 "%%%c format: an integer is required, "
14663 "not %.200s",
14664 type, Py_TYPE(v)->tp_name);
14665 break;
14666 default:
14667 PyErr_Format(PyExc_TypeError,
14668 "%%%c format: a real number is required, "
14669 "not %.200s",
14670 type, Py_TYPE(v)->tp_name);
14671 break;
14672 }
14673 return -1;
14674 }
14675
14676 static Py_UCS4
formatchar(PyObject * v)14677 formatchar(PyObject *v)
14678 {
14679 /* presume that the buffer is at least 3 characters long */
14680 if (PyUnicode_Check(v)) {
14681 if (PyUnicode_GET_LENGTH(v) == 1) {
14682 return PyUnicode_READ_CHAR(v, 0);
14683 }
14684 goto onError;
14685 }
14686 else {
14687 int overflow;
14688 long x = PyLong_AsLongAndOverflow(v, &overflow);
14689 if (x == -1 && PyErr_Occurred()) {
14690 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
14691 goto onError;
14692 }
14693 return (Py_UCS4) -1;
14694 }
14695
14696 if (x < 0 || x > MAX_UNICODE) {
14697 /* this includes an overflow in converting to C long */
14698 PyErr_SetString(PyExc_OverflowError,
14699 "%c arg not in range(0x110000)");
14700 return (Py_UCS4) -1;
14701 }
14702
14703 return (Py_UCS4) x;
14704 }
14705
14706 onError:
14707 PyErr_SetString(PyExc_TypeError,
14708 "%c requires int or char");
14709 return (Py_UCS4) -1;
14710 }
14711
14712 /* Parse options of an argument: flags, width, precision.
14713 Handle also "%(name)" syntax.
14714
14715 Return 0 if the argument has been formatted into arg->str.
14716 Return 1 if the argument has been written into ctx->writer,
14717 Raise an exception and return -1 on error. */
14718 static int
unicode_format_arg_parse(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg)14719 unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14720 struct unicode_format_arg_t *arg)
14721 {
14722 #define FORMAT_READ(ctx) \
14723 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14724
14725 PyObject *v;
14726
14727 if (arg->ch == '(') {
14728 /* Get argument value from a dictionary. Example: "%(name)s". */
14729 Py_ssize_t keystart;
14730 Py_ssize_t keylen;
14731 PyObject *key;
14732 int pcount = 1;
14733
14734 if (ctx->dict == NULL) {
14735 PyErr_SetString(PyExc_TypeError,
14736 "format requires a mapping");
14737 return -1;
14738 }
14739 ++ctx->fmtpos;
14740 --ctx->fmtcnt;
14741 keystart = ctx->fmtpos;
14742 /* Skip over balanced parentheses */
14743 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14744 arg->ch = FORMAT_READ(ctx);
14745 if (arg->ch == ')')
14746 --pcount;
14747 else if (arg->ch == '(')
14748 ++pcount;
14749 ctx->fmtpos++;
14750 }
14751 keylen = ctx->fmtpos - keystart - 1;
14752 if (ctx->fmtcnt < 0 || pcount > 0) {
14753 PyErr_SetString(PyExc_ValueError,
14754 "incomplete format key");
14755 return -1;
14756 }
14757 key = PyUnicode_Substring(ctx->fmtstr,
14758 keystart, keystart + keylen);
14759 if (key == NULL)
14760 return -1;
14761 if (ctx->args_owned) {
14762 ctx->args_owned = 0;
14763 Py_DECREF(ctx->args);
14764 }
14765 ctx->args = PyObject_GetItem(ctx->dict, key);
14766 Py_DECREF(key);
14767 if (ctx->args == NULL)
14768 return -1;
14769 ctx->args_owned = 1;
14770 ctx->arglen = -1;
14771 ctx->argidx = -2;
14772 }
14773
14774 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
14775 while (--ctx->fmtcnt >= 0) {
14776 arg->ch = FORMAT_READ(ctx);
14777 ctx->fmtpos++;
14778 switch (arg->ch) {
14779 case '-': arg->flags |= F_LJUST; continue;
14780 case '+': arg->flags |= F_SIGN; continue;
14781 case ' ': arg->flags |= F_BLANK; continue;
14782 case '#': arg->flags |= F_ALT; continue;
14783 case '0': arg->flags |= F_ZERO; continue;
14784 }
14785 break;
14786 }
14787
14788 /* Parse width. Example: "%10s" => width=10 */
14789 if (arg->ch == '*') {
14790 v = unicode_format_getnextarg(ctx);
14791 if (v == NULL)
14792 return -1;
14793 if (!PyLong_Check(v)) {
14794 PyErr_SetString(PyExc_TypeError,
14795 "* wants int");
14796 return -1;
14797 }
14798 arg->width = PyLong_AsSsize_t(v);
14799 if (arg->width == -1 && PyErr_Occurred())
14800 return -1;
14801 if (arg->width < 0) {
14802 arg->flags |= F_LJUST;
14803 arg->width = -arg->width;
14804 }
14805 if (--ctx->fmtcnt >= 0) {
14806 arg->ch = FORMAT_READ(ctx);
14807 ctx->fmtpos++;
14808 }
14809 }
14810 else if (arg->ch >= '0' && arg->ch <= '9') {
14811 arg->width = arg->ch - '0';
14812 while (--ctx->fmtcnt >= 0) {
14813 arg->ch = FORMAT_READ(ctx);
14814 ctx->fmtpos++;
14815 if (arg->ch < '0' || arg->ch > '9')
14816 break;
14817 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14818 mixing signed and unsigned comparison. Since arg->ch is between
14819 '0' and '9', casting to int is safe. */
14820 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14821 PyErr_SetString(PyExc_ValueError,
14822 "width too big");
14823 return -1;
14824 }
14825 arg->width = arg->width*10 + (arg->ch - '0');
14826 }
14827 }
14828
14829 /* Parse precision. Example: "%.3f" => prec=3 */
14830 if (arg->ch == '.') {
14831 arg->prec = 0;
14832 if (--ctx->fmtcnt >= 0) {
14833 arg->ch = FORMAT_READ(ctx);
14834 ctx->fmtpos++;
14835 }
14836 if (arg->ch == '*') {
14837 v = unicode_format_getnextarg(ctx);
14838 if (v == NULL)
14839 return -1;
14840 if (!PyLong_Check(v)) {
14841 PyErr_SetString(PyExc_TypeError,
14842 "* wants int");
14843 return -1;
14844 }
14845 arg->prec = _PyLong_AsInt(v);
14846 if (arg->prec == -1 && PyErr_Occurred())
14847 return -1;
14848 if (arg->prec < 0)
14849 arg->prec = 0;
14850 if (--ctx->fmtcnt >= 0) {
14851 arg->ch = FORMAT_READ(ctx);
14852 ctx->fmtpos++;
14853 }
14854 }
14855 else if (arg->ch >= '0' && arg->ch <= '9') {
14856 arg->prec = arg->ch - '0';
14857 while (--ctx->fmtcnt >= 0) {
14858 arg->ch = FORMAT_READ(ctx);
14859 ctx->fmtpos++;
14860 if (arg->ch < '0' || arg->ch > '9')
14861 break;
14862 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14863 PyErr_SetString(PyExc_ValueError,
14864 "precision too big");
14865 return -1;
14866 }
14867 arg->prec = arg->prec*10 + (arg->ch - '0');
14868 }
14869 }
14870 }
14871
14872 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14873 if (ctx->fmtcnt >= 0) {
14874 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14875 if (--ctx->fmtcnt >= 0) {
14876 arg->ch = FORMAT_READ(ctx);
14877 ctx->fmtpos++;
14878 }
14879 }
14880 }
14881 if (ctx->fmtcnt < 0) {
14882 PyErr_SetString(PyExc_ValueError,
14883 "incomplete format");
14884 return -1;
14885 }
14886 return 0;
14887
14888 #undef FORMAT_READ
14889 }
14890
14891 /* Format one argument. Supported conversion specifiers:
14892
14893 - "s", "r", "a": any type
14894 - "i", "d", "u": int or float
14895 - "o", "x", "X": int
14896 - "e", "E", "f", "F", "g", "G": float
14897 - "c": int or str (1 character)
14898
14899 When possible, the output is written directly into the Unicode writer
14900 (ctx->writer). A string is created when padding is required.
14901
14902 Return 0 if the argument has been formatted into *p_str,
14903 1 if the argument has been written into ctx->writer,
14904 -1 on error. */
14905 static int
unicode_format_arg_format(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg,PyObject ** p_str)14906 unicode_format_arg_format(struct unicode_formatter_t *ctx,
14907 struct unicode_format_arg_t *arg,
14908 PyObject **p_str)
14909 {
14910 PyObject *v;
14911 _PyUnicodeWriter *writer = &ctx->writer;
14912
14913 if (ctx->fmtcnt == 0)
14914 ctx->writer.overallocate = 0;
14915
14916 v = unicode_format_getnextarg(ctx);
14917 if (v == NULL)
14918 return -1;
14919
14920
14921 switch (arg->ch) {
14922 case 's':
14923 case 'r':
14924 case 'a':
14925 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14926 /* Fast path */
14927 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14928 return -1;
14929 return 1;
14930 }
14931
14932 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14933 *p_str = v;
14934 Py_INCREF(*p_str);
14935 }
14936 else {
14937 if (arg->ch == 's')
14938 *p_str = PyObject_Str(v);
14939 else if (arg->ch == 'r')
14940 *p_str = PyObject_Repr(v);
14941 else
14942 *p_str = PyObject_ASCII(v);
14943 }
14944 break;
14945
14946 case 'i':
14947 case 'd':
14948 case 'u':
14949 case 'o':
14950 case 'x':
14951 case 'X':
14952 {
14953 int ret = mainformatlong(v, arg, p_str, writer);
14954 if (ret != 0)
14955 return ret;
14956 arg->sign = 1;
14957 break;
14958 }
14959
14960 case 'e':
14961 case 'E':
14962 case 'f':
14963 case 'F':
14964 case 'g':
14965 case 'G':
14966 if (arg->width == -1 && arg->prec == -1
14967 && !(arg->flags & (F_SIGN | F_BLANK)))
14968 {
14969 /* Fast path */
14970 if (formatfloat(v, arg, NULL, writer) == -1)
14971 return -1;
14972 return 1;
14973 }
14974
14975 arg->sign = 1;
14976 if (formatfloat(v, arg, p_str, NULL) == -1)
14977 return -1;
14978 break;
14979
14980 case 'c':
14981 {
14982 Py_UCS4 ch = formatchar(v);
14983 if (ch == (Py_UCS4) -1)
14984 return -1;
14985 if (arg->width == -1 && arg->prec == -1) {
14986 /* Fast path */
14987 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
14988 return -1;
14989 return 1;
14990 }
14991 *p_str = PyUnicode_FromOrdinal(ch);
14992 break;
14993 }
14994
14995 default:
14996 PyErr_Format(PyExc_ValueError,
14997 "unsupported format character '%c' (0x%x) "
14998 "at index %zd",
14999 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15000 (int)arg->ch,
15001 ctx->fmtpos - 1);
15002 return -1;
15003 }
15004 if (*p_str == NULL)
15005 return -1;
15006 assert (PyUnicode_Check(*p_str));
15007 return 0;
15008 }
15009
15010 static int
unicode_format_arg_output(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg,PyObject * str)15011 unicode_format_arg_output(struct unicode_formatter_t *ctx,
15012 struct unicode_format_arg_t *arg,
15013 PyObject *str)
15014 {
15015 Py_ssize_t len;
15016 enum PyUnicode_Kind kind;
15017 const void *pbuf;
15018 Py_ssize_t pindex;
15019 Py_UCS4 signchar;
15020 Py_ssize_t buflen;
15021 Py_UCS4 maxchar;
15022 Py_ssize_t sublen;
15023 _PyUnicodeWriter *writer = &ctx->writer;
15024 Py_UCS4 fill;
15025
15026 fill = ' ';
15027 if (arg->sign && arg->flags & F_ZERO)
15028 fill = '0';
15029
15030 if (PyUnicode_READY(str) == -1)
15031 return -1;
15032
15033 len = PyUnicode_GET_LENGTH(str);
15034 if ((arg->width == -1 || arg->width <= len)
15035 && (arg->prec == -1 || arg->prec >= len)
15036 && !(arg->flags & (F_SIGN | F_BLANK)))
15037 {
15038 /* Fast path */
15039 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
15040 return -1;
15041 return 0;
15042 }
15043
15044 /* Truncate the string for "s", "r" and "a" formats
15045 if the precision is set */
15046 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
15047 if (arg->prec >= 0 && len > arg->prec)
15048 len = arg->prec;
15049 }
15050
15051 /* Adjust sign and width */
15052 kind = PyUnicode_KIND(str);
15053 pbuf = PyUnicode_DATA(str);
15054 pindex = 0;
15055 signchar = '\0';
15056 if (arg->sign) {
15057 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
15058 if (ch == '-' || ch == '+') {
15059 signchar = ch;
15060 len--;
15061 pindex++;
15062 }
15063 else if (arg->flags & F_SIGN)
15064 signchar = '+';
15065 else if (arg->flags & F_BLANK)
15066 signchar = ' ';
15067 else
15068 arg->sign = 0;
15069 }
15070 if (arg->width < len)
15071 arg->width = len;
15072
15073 /* Prepare the writer */
15074 maxchar = writer->maxchar;
15075 if (!(arg->flags & F_LJUST)) {
15076 if (arg->sign) {
15077 if ((arg->width-1) > len)
15078 maxchar = Py_MAX(maxchar, fill);
15079 }
15080 else {
15081 if (arg->width > len)
15082 maxchar = Py_MAX(maxchar, fill);
15083 }
15084 }
15085 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
15086 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
15087 maxchar = Py_MAX(maxchar, strmaxchar);
15088 }
15089
15090 buflen = arg->width;
15091 if (arg->sign && len == arg->width)
15092 buflen++;
15093 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
15094 return -1;
15095
15096 /* Write the sign if needed */
15097 if (arg->sign) {
15098 if (fill != ' ') {
15099 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15100 writer->pos += 1;
15101 }
15102 if (arg->width > len)
15103 arg->width--;
15104 }
15105
15106 /* Write the numeric prefix for "x", "X" and "o" formats
15107 if the alternate form is used.
15108 For example, write "0x" for the "%#x" format. */
15109 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15110 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15111 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
15112 if (fill != ' ') {
15113 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15114 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15115 writer->pos += 2;
15116 pindex += 2;
15117 }
15118 arg->width -= 2;
15119 if (arg->width < 0)
15120 arg->width = 0;
15121 len -= 2;
15122 }
15123
15124 /* Pad left with the fill character if needed */
15125 if (arg->width > len && !(arg->flags & F_LJUST)) {
15126 sublen = arg->width - len;
15127 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
15128 writer->pos += sublen;
15129 arg->width = len;
15130 }
15131
15132 /* If padding with spaces: write sign if needed and/or numeric prefix if
15133 the alternate form is used */
15134 if (fill == ' ') {
15135 if (arg->sign) {
15136 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15137 writer->pos += 1;
15138 }
15139 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15140 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15141 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15142 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15143 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15144 writer->pos += 2;
15145 pindex += 2;
15146 }
15147 }
15148
15149 /* Write characters */
15150 if (len) {
15151 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15152 str, pindex, len);
15153 writer->pos += len;
15154 }
15155
15156 /* Pad right with the fill character if needed */
15157 if (arg->width > len) {
15158 sublen = arg->width - len;
15159 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
15160 writer->pos += sublen;
15161 }
15162 return 0;
15163 }
15164
15165 /* Helper of PyUnicode_Format(): format one arg.
15166 Return 0 on success, raise an exception and return -1 on error. */
15167 static int
unicode_format_arg(struct unicode_formatter_t * ctx)15168 unicode_format_arg(struct unicode_formatter_t *ctx)
15169 {
15170 struct unicode_format_arg_t arg;
15171 PyObject *str;
15172 int ret;
15173
15174 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
15175 if (arg.ch == '%') {
15176 ctx->fmtpos++;
15177 ctx->fmtcnt--;
15178 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15179 return -1;
15180 return 0;
15181 }
15182 arg.flags = 0;
15183 arg.width = -1;
15184 arg.prec = -1;
15185 arg.sign = 0;
15186 str = NULL;
15187
15188 ret = unicode_format_arg_parse(ctx, &arg);
15189 if (ret == -1)
15190 return -1;
15191
15192 ret = unicode_format_arg_format(ctx, &arg, &str);
15193 if (ret == -1)
15194 return -1;
15195
15196 if (ret != 1) {
15197 ret = unicode_format_arg_output(ctx, &arg, str);
15198 Py_DECREF(str);
15199 if (ret == -1)
15200 return -1;
15201 }
15202
15203 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
15204 PyErr_SetString(PyExc_TypeError,
15205 "not all arguments converted during string formatting");
15206 return -1;
15207 }
15208 return 0;
15209 }
15210
15211 PyObject *
PyUnicode_Format(PyObject * format,PyObject * args)15212 PyUnicode_Format(PyObject *format, PyObject *args)
15213 {
15214 struct unicode_formatter_t ctx;
15215
15216 if (format == NULL || args == NULL) {
15217 PyErr_BadInternalCall();
15218 return NULL;
15219 }
15220
15221 if (ensure_unicode(format) < 0)
15222 return NULL;
15223
15224 ctx.fmtstr = format;
15225 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15226 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15227 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15228 ctx.fmtpos = 0;
15229
15230 _PyUnicodeWriter_Init(&ctx.writer);
15231 ctx.writer.min_length = ctx.fmtcnt + 100;
15232 ctx.writer.overallocate = 1;
15233
15234 if (PyTuple_Check(args)) {
15235 ctx.arglen = PyTuple_Size(args);
15236 ctx.argidx = 0;
15237 }
15238 else {
15239 ctx.arglen = -1;
15240 ctx.argidx = -2;
15241 }
15242 ctx.args_owned = 0;
15243 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
15244 ctx.dict = args;
15245 else
15246 ctx.dict = NULL;
15247 ctx.args = args;
15248
15249 while (--ctx.fmtcnt >= 0) {
15250 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15251 Py_ssize_t nonfmtpos;
15252
15253 nonfmtpos = ctx.fmtpos++;
15254 while (ctx.fmtcnt >= 0 &&
15255 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15256 ctx.fmtpos++;
15257 ctx.fmtcnt--;
15258 }
15259 if (ctx.fmtcnt < 0) {
15260 ctx.fmtpos--;
15261 ctx.writer.overallocate = 0;
15262 }
15263
15264 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15265 nonfmtpos, ctx.fmtpos) < 0)
15266 goto onError;
15267 }
15268 else {
15269 ctx.fmtpos++;
15270 if (unicode_format_arg(&ctx) == -1)
15271 goto onError;
15272 }
15273 }
15274
15275 if (ctx.argidx < ctx.arglen && !ctx.dict) {
15276 PyErr_SetString(PyExc_TypeError,
15277 "not all arguments converted during string formatting");
15278 goto onError;
15279 }
15280
15281 if (ctx.args_owned) {
15282 Py_DECREF(ctx.args);
15283 }
15284 return _PyUnicodeWriter_Finish(&ctx.writer);
15285
15286 onError:
15287 _PyUnicodeWriter_Dealloc(&ctx.writer);
15288 if (ctx.args_owned) {
15289 Py_DECREF(ctx.args);
15290 }
15291 return NULL;
15292 }
15293
15294 static PyObject *
15295 unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
15296
15297 /*[clinic input]
15298 @classmethod
15299 str.__new__ as unicode_new
15300
15301 object as x: object = NULL
15302 encoding: str = NULL
15303 errors: str = NULL
15304
15305 [clinic start generated code]*/
15306
15307 static PyObject *
unicode_new_impl(PyTypeObject * type,PyObject * x,const char * encoding,const char * errors)15308 unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
15309 const char *errors)
15310 /*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
15311 {
15312 PyObject *unicode;
15313 if (x == NULL) {
15314 unicode = unicode_new_empty();
15315 }
15316 else if (encoding == NULL && errors == NULL) {
15317 unicode = PyObject_Str(x);
15318 }
15319 else {
15320 unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
15321 }
15322
15323 if (unicode != NULL && type != &PyUnicode_Type) {
15324 Py_SETREF(unicode, unicode_subtype_new(type, unicode));
15325 }
15326 return unicode;
15327 }
15328
15329 static PyObject *
unicode_subtype_new(PyTypeObject * type,PyObject * unicode)15330 unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
15331 {
15332 PyObject *self;
15333 Py_ssize_t length, char_size;
15334 int share_wstr, share_utf8;
15335 unsigned int kind;
15336 void *data;
15337
15338 assert(PyType_IsSubtype(type, &PyUnicode_Type));
15339 assert(_PyUnicode_CHECK(unicode));
15340 if (PyUnicode_READY(unicode) == -1) {
15341 return NULL;
15342 }
15343
15344 self = type->tp_alloc(type, 0);
15345 if (self == NULL) {
15346 return NULL;
15347 }
15348 kind = PyUnicode_KIND(unicode);
15349 length = PyUnicode_GET_LENGTH(unicode);
15350
15351 _PyUnicode_LENGTH(self) = length;
15352 #ifdef Py_DEBUG
15353 _PyUnicode_HASH(self) = -1;
15354 #else
15355 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15356 #endif
15357 _PyUnicode_STATE(self).interned = 0;
15358 _PyUnicode_STATE(self).kind = kind;
15359 _PyUnicode_STATE(self).compact = 0;
15360 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
15361 _PyUnicode_STATE(self).ready = 1;
15362 _PyUnicode_WSTR(self) = NULL;
15363 _PyUnicode_UTF8_LENGTH(self) = 0;
15364 _PyUnicode_UTF8(self) = NULL;
15365 _PyUnicode_WSTR_LENGTH(self) = 0;
15366 _PyUnicode_DATA_ANY(self) = NULL;
15367
15368 share_utf8 = 0;
15369 share_wstr = 0;
15370 if (kind == PyUnicode_1BYTE_KIND) {
15371 char_size = 1;
15372 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15373 share_utf8 = 1;
15374 }
15375 else if (kind == PyUnicode_2BYTE_KIND) {
15376 char_size = 2;
15377 if (sizeof(wchar_t) == 2)
15378 share_wstr = 1;
15379 }
15380 else {
15381 assert(kind == PyUnicode_4BYTE_KIND);
15382 char_size = 4;
15383 if (sizeof(wchar_t) == 4)
15384 share_wstr = 1;
15385 }
15386
15387 /* Ensure we won't overflow the length. */
15388 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15389 PyErr_NoMemory();
15390 goto onError;
15391 }
15392 data = PyObject_Malloc((length + 1) * char_size);
15393 if (data == NULL) {
15394 PyErr_NoMemory();
15395 goto onError;
15396 }
15397
15398 _PyUnicode_DATA_ANY(self) = data;
15399 if (share_utf8) {
15400 _PyUnicode_UTF8_LENGTH(self) = length;
15401 _PyUnicode_UTF8(self) = data;
15402 }
15403 if (share_wstr) {
15404 _PyUnicode_WSTR_LENGTH(self) = length;
15405 _PyUnicode_WSTR(self) = (wchar_t *)data;
15406 }
15407
15408 memcpy(data, PyUnicode_DATA(unicode),
15409 kind * (length + 1));
15410 assert(_PyUnicode_CheckConsistency(self, 1));
15411 #ifdef Py_DEBUG
15412 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15413 #endif
15414 return self;
15415
15416 onError:
15417 Py_DECREF(self);
15418 return NULL;
15419 }
15420
15421 void
_PyUnicode_ExactDealloc(PyObject * op)15422 _PyUnicode_ExactDealloc(PyObject *op)
15423 {
15424 assert(PyUnicode_CheckExact(op));
15425 unicode_dealloc(op);
15426 }
15427
15428 PyDoc_STRVAR(unicode_doc,
15429 "str(object='') -> str\n\
15430 str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
15431 \n\
15432 Create a new string object from the given object. If encoding or\n\
15433 errors is specified, then the object must expose a data buffer\n\
15434 that will be decoded using the given encoding and error handler.\n\
15435 Otherwise, returns the result of object.__str__() (if defined)\n\
15436 or repr(object).\n\
15437 encoding defaults to sys.getdefaultencoding().\n\
15438 errors defaults to 'strict'.");
15439
15440 static PyObject *unicode_iter(PyObject *seq);
15441
15442 PyTypeObject PyUnicode_Type = {
15443 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15444 "str", /* tp_name */
15445 sizeof(PyUnicodeObject), /* tp_basicsize */
15446 0, /* tp_itemsize */
15447 /* Slots */
15448 (destructor)unicode_dealloc, /* tp_dealloc */
15449 0, /* tp_vectorcall_offset */
15450 0, /* tp_getattr */
15451 0, /* tp_setattr */
15452 0, /* tp_as_async */
15453 unicode_repr, /* tp_repr */
15454 &unicode_as_number, /* tp_as_number */
15455 &unicode_as_sequence, /* tp_as_sequence */
15456 &unicode_as_mapping, /* tp_as_mapping */
15457 (hashfunc) unicode_hash, /* tp_hash*/
15458 0, /* tp_call*/
15459 (reprfunc) unicode_str, /* tp_str */
15460 PyObject_GenericGetAttr, /* tp_getattro */
15461 0, /* tp_setattro */
15462 0, /* tp_as_buffer */
15463 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
15464 Py_TPFLAGS_UNICODE_SUBCLASS |
15465 _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
15466 unicode_doc, /* tp_doc */
15467 0, /* tp_traverse */
15468 0, /* tp_clear */
15469 PyUnicode_RichCompare, /* tp_richcompare */
15470 0, /* tp_weaklistoffset */
15471 unicode_iter, /* tp_iter */
15472 0, /* tp_iternext */
15473 unicode_methods, /* tp_methods */
15474 0, /* tp_members */
15475 0, /* tp_getset */
15476 0, /* tp_base */
15477 0, /* tp_dict */
15478 0, /* tp_descr_get */
15479 0, /* tp_descr_set */
15480 0, /* tp_dictoffset */
15481 0, /* tp_init */
15482 0, /* tp_alloc */
15483 unicode_new, /* tp_new */
15484 PyObject_Del, /* tp_free */
15485 };
15486
15487 /* Initialize the Unicode implementation */
15488
15489 void
_PyUnicode_InitState(PyInterpreterState * interp)15490 _PyUnicode_InitState(PyInterpreterState *interp)
15491 {
15492 if (!_Py_IsMainInterpreter(interp)) {
15493 return;
15494 }
15495
15496 /* initialize the linebreak bloom filter */
15497 const Py_UCS2 linebreak[] = {
15498 0x000A, /* LINE FEED */
15499 0x000D, /* CARRIAGE RETURN */
15500 0x001C, /* FILE SEPARATOR */
15501 0x001D, /* GROUP SEPARATOR */
15502 0x001E, /* RECORD SEPARATOR */
15503 0x0085, /* NEXT LINE */
15504 0x2028, /* LINE SEPARATOR */
15505 0x2029, /* PARAGRAPH SEPARATOR */
15506 };
15507 bloom_linebreak = make_bloom_mask(
15508 PyUnicode_2BYTE_KIND, linebreak,
15509 Py_ARRAY_LENGTH(linebreak));
15510 }
15511
15512
15513 PyStatus
_PyUnicode_InitGlobalObjects(PyInterpreterState * interp)15514 _PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
15515 {
15516 if (!_Py_IsMainInterpreter(interp)) {
15517 return _PyStatus_OK();
15518 }
15519
15520 #ifdef Py_DEBUG
15521 assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
15522
15523 for (int i = 0; i < 256; i++) {
15524 assert(_PyUnicode_CheckConsistency(LATIN1(i), 1));
15525 }
15526 #endif
15527
15528 return _PyStatus_OK();
15529 }
15530
15531
15532 PyStatus
_PyUnicode_InitTypes(PyInterpreterState * interp)15533 _PyUnicode_InitTypes(PyInterpreterState *interp)
15534 {
15535 if (!_Py_IsMainInterpreter(interp)) {
15536 return _PyStatus_OK();
15537 }
15538
15539 if (PyType_Ready(&EncodingMapType) < 0) {
15540 goto error;
15541 }
15542 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15543 goto error;
15544 }
15545 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15546 goto error;
15547 }
15548 return _PyStatus_OK();
15549
15550 error:
15551 return _PyStatus_ERR("Can't initialize unicode types");
15552 }
15553
15554
15555 void
PyUnicode_InternInPlace(PyObject ** p)15556 PyUnicode_InternInPlace(PyObject **p)
15557 {
15558 PyObject *s = *p;
15559 #ifdef Py_DEBUG
15560 assert(s != NULL);
15561 assert(_PyUnicode_CHECK(s));
15562 #else
15563 if (s == NULL || !PyUnicode_Check(s)) {
15564 return;
15565 }
15566 #endif
15567
15568 /* If it's a subclass, we don't really know what putting
15569 it in the interned dict might do. */
15570 if (!PyUnicode_CheckExact(s)) {
15571 return;
15572 }
15573
15574 if (PyUnicode_CHECK_INTERNED(s)) {
15575 return;
15576 }
15577
15578 if (PyUnicode_READY(s) == -1) {
15579 PyErr_Clear();
15580 return;
15581 }
15582
15583 if (interned == NULL) {
15584 interned = PyDict_New();
15585 if (interned == NULL) {
15586 PyErr_Clear(); /* Don't leave an exception */
15587 return;
15588 }
15589 }
15590
15591 PyObject *t = PyDict_SetDefault(interned, s, s);
15592 if (t == NULL) {
15593 PyErr_Clear();
15594 return;
15595 }
15596
15597 if (t != s) {
15598 Py_INCREF(t);
15599 Py_SETREF(*p, t);
15600 return;
15601 }
15602
15603 /* The two references in interned dict (key and value) are not counted by
15604 refcnt. unicode_dealloc() and _PyUnicode_ClearInterned() take care of
15605 this. */
15606 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
15607 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
15608 }
15609
15610 void
PyUnicode_InternImmortal(PyObject ** p)15611 PyUnicode_InternImmortal(PyObject **p)
15612 {
15613 if (PyErr_WarnEx(PyExc_DeprecationWarning,
15614 "PyUnicode_InternImmortal() is deprecated; "
15615 "use PyUnicode_InternInPlace() instead", 1) < 0)
15616 {
15617 // The function has no return value, the exception cannot
15618 // be reported to the caller, so just log it.
15619 PyErr_WriteUnraisable(NULL);
15620 }
15621
15622 PyUnicode_InternInPlace(p);
15623 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
15624 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
15625 Py_INCREF(*p);
15626 }
15627 }
15628
15629 PyObject *
PyUnicode_InternFromString(const char * cp)15630 PyUnicode_InternFromString(const char *cp)
15631 {
15632 PyObject *s = PyUnicode_FromString(cp);
15633 if (s == NULL)
15634 return NULL;
15635 PyUnicode_InternInPlace(&s);
15636 return s;
15637 }
15638
15639
15640 void
_PyUnicode_ClearInterned(PyInterpreterState * interp)15641 _PyUnicode_ClearInterned(PyInterpreterState *interp)
15642 {
15643 if (!_Py_IsMainInterpreter(interp)) {
15644 // interned dict is shared by all interpreters
15645 return;
15646 }
15647
15648 if (interned == NULL) {
15649 return;
15650 }
15651 assert(PyDict_CheckExact(interned));
15652
15653 /* Interned unicode strings are not forcibly deallocated; rather, we give
15654 them their stolen references back, and then clear and DECREF the
15655 interned dict. */
15656
15657 #ifdef INTERNED_STATS
15658 fprintf(stderr, "releasing %zd interned strings\n",
15659 PyDict_GET_SIZE(interned));
15660
15661 Py_ssize_t immortal_size = 0, mortal_size = 0;
15662 #endif
15663 Py_ssize_t pos = 0;
15664 PyObject *s, *ignored_value;
15665 while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
15666 assert(PyUnicode_IS_READY(s));
15667
15668 switch (PyUnicode_CHECK_INTERNED(s)) {
15669 case SSTATE_INTERNED_IMMORTAL:
15670 Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
15671 #ifdef INTERNED_STATS
15672 immortal_size += PyUnicode_GET_LENGTH(s);
15673 #endif
15674 break;
15675 case SSTATE_INTERNED_MORTAL:
15676 // Restore the two references (key and value) ignored
15677 // by PyUnicode_InternInPlace().
15678 Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
15679 #ifdef INTERNED_STATS
15680 mortal_size += PyUnicode_GET_LENGTH(s);
15681 #endif
15682 break;
15683 case SSTATE_NOT_INTERNED:
15684 /* fall through */
15685 default:
15686 Py_UNREACHABLE();
15687 }
15688 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
15689 }
15690 #ifdef INTERNED_STATS
15691 fprintf(stderr,
15692 "total size of all interned strings: %zd/%zd mortal/immortal\n",
15693 mortal_size, immortal_size);
15694 #endif
15695
15696 PyDict_Clear(interned);
15697 Py_CLEAR(interned);
15698 }
15699
15700
15701 /********************* Unicode Iterator **************************/
15702
/* State of a str iterator; the same layout is used by both
   PyUnicodeIter_Type and _PyUnicodeASCIIIter_Type. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;    /* Index of the next character to return */
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
15708
15709 static void
unicodeiter_dealloc(unicodeiterobject * it)15710 unicodeiter_dealloc(unicodeiterobject *it)
15711 {
15712 _PyObject_GC_UNTRACK(it);
15713 Py_XDECREF(it->it_seq);
15714 PyObject_GC_Del(it);
15715 }
15716
15717 static int
unicodeiter_traverse(unicodeiterobject * it,visitproc visit,void * arg)15718 unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15719 {
15720 Py_VISIT(it->it_seq);
15721 return 0;
15722 }
15723
15724 static PyObject *
unicodeiter_next(unicodeiterobject * it)15725 unicodeiter_next(unicodeiterobject *it)
15726 {
15727 PyObject *seq;
15728
15729 assert(it != NULL);
15730 seq = it->it_seq;
15731 if (seq == NULL)
15732 return NULL;
15733 assert(_PyUnicode_CHECK(seq));
15734
15735 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15736 int kind = PyUnicode_KIND(seq);
15737 const void *data = PyUnicode_DATA(seq);
15738 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15739 it->it_index++;
15740 return unicode_char(chr);
15741 }
15742
15743 it->it_seq = NULL;
15744 Py_DECREF(seq);
15745 return NULL;
15746 }
15747
15748 static PyObject *
unicode_ascii_iter_next(unicodeiterobject * it)15749 unicode_ascii_iter_next(unicodeiterobject *it)
15750 {
15751 assert(it != NULL);
15752 PyObject *seq = it->it_seq;
15753 if (seq == NULL) {
15754 return NULL;
15755 }
15756 assert(_PyUnicode_CHECK(seq));
15757 assert(PyUnicode_IS_COMPACT_ASCII(seq));
15758 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15759 const void *data = ((void*)(_PyASCIIObject_CAST(seq) + 1));
15760 Py_UCS1 chr = (Py_UCS1)PyUnicode_READ(PyUnicode_1BYTE_KIND,
15761 data, it->it_index);
15762 it->it_index++;
15763 PyObject *item = (PyObject*)&_Py_SINGLETON(strings).ascii[chr];
15764 return Py_NewRef(item);
15765 }
15766 it->it_seq = NULL;
15767 Py_DECREF(seq);
15768 return NULL;
15769 }
15770
15771 static PyObject *
unicodeiter_len(unicodeiterobject * it,PyObject * Py_UNUSED (ignored))15772 unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
15773 {
15774 Py_ssize_t len = 0;
15775 if (it->it_seq)
15776 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
15777 return PyLong_FromSsize_t(len);
15778 }
15779
15780 PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15781
15782 static PyObject *
unicodeiter_reduce(unicodeiterobject * it,PyObject * Py_UNUSED (ignored))15783 unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
15784 {
15785 PyObject *iter = _PyEval_GetBuiltin(&_Py_ID(iter));
15786
15787 /* _PyEval_GetBuiltin can invoke arbitrary code,
15788 * call must be before access of iterator pointers.
15789 * see issue #101765 */
15790
15791 if (it->it_seq != NULL) {
15792 return Py_BuildValue("N(O)n", iter, it->it_seq, it->it_index);
15793 } else {
15794 PyObject *u = (PyObject *)_PyUnicode_New(0);
15795 if (u == NULL) {
15796 Py_XDECREF(iter);
15797 return NULL;
15798 }
15799 return Py_BuildValue("N(N)", iter, u);
15800 }
15801 }
15802
15803 PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15804
15805 static PyObject *
unicodeiter_setstate(unicodeiterobject * it,PyObject * state)15806 unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15807 {
15808 Py_ssize_t index = PyLong_AsSsize_t(state);
15809 if (index == -1 && PyErr_Occurred())
15810 return NULL;
15811 if (it->it_seq != NULL) {
15812 if (index < 0)
15813 index = 0;
15814 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15815 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15816 it->it_index = index;
15817 }
15818 Py_RETURN_NONE;
15819 }
15820
15821 PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15822
15823 static PyMethodDef unicodeiter_methods[] = {
15824 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
15825 length_hint_doc},
15826 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15827 reduce_doc},
15828 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15829 setstate_doc},
15830 {NULL, NULL} /* sentinel */
15831 };
15832
15833 PyTypeObject PyUnicodeIter_Type = {
15834 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15835 "str_iterator", /* tp_name */
15836 sizeof(unicodeiterobject), /* tp_basicsize */
15837 0, /* tp_itemsize */
15838 /* methods */
15839 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15840 0, /* tp_vectorcall_offset */
15841 0, /* tp_getattr */
15842 0, /* tp_setattr */
15843 0, /* tp_as_async */
15844 0, /* tp_repr */
15845 0, /* tp_as_number */
15846 0, /* tp_as_sequence */
15847 0, /* tp_as_mapping */
15848 0, /* tp_hash */
15849 0, /* tp_call */
15850 0, /* tp_str */
15851 PyObject_GenericGetAttr, /* tp_getattro */
15852 0, /* tp_setattro */
15853 0, /* tp_as_buffer */
15854 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15855 0, /* tp_doc */
15856 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15857 0, /* tp_clear */
15858 0, /* tp_richcompare */
15859 0, /* tp_weaklistoffset */
15860 PyObject_SelfIter, /* tp_iter */
15861 (iternextfunc)unicodeiter_next, /* tp_iternext */
15862 unicodeiter_methods, /* tp_methods */
15863 0,
15864 };
15865
/* Iterator type that unicode_iter() uses for compact ASCII strings.
   It has the same instance layout, deallocator, traversal and methods as
   PyUnicodeIter_Type; only the tp_iternext implementation differs. */
PyTypeObject _PyUnicodeASCIIIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    .tp_name = "str_ascii_iterator",
    .tp_basicsize = sizeof(unicodeiterobject),
    .tp_dealloc = (destructor)unicodeiter_dealloc,
    .tp_getattro = PyObject_GenericGetAttr,
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
    .tp_traverse = (traverseproc)unicodeiter_traverse,
    .tp_iter = PyObject_SelfIter,
    /* Returns entries of the static ASCII single-character strings. */
    .tp_iternext = (iternextfunc)unicode_ascii_iter_next,
    .tp_methods = unicodeiter_methods,
};
15878
15879 static PyObject *
unicode_iter(PyObject * seq)15880 unicode_iter(PyObject *seq)
15881 {
15882 unicodeiterobject *it;
15883
15884 if (!PyUnicode_Check(seq)) {
15885 PyErr_BadInternalCall();
15886 return NULL;
15887 }
15888 if (PyUnicode_READY(seq) == -1)
15889 return NULL;
15890 if (PyUnicode_IS_COMPACT_ASCII(seq)) {
15891 it = PyObject_GC_New(unicodeiterobject, &_PyUnicodeASCIIIter_Type);
15892 }
15893 else {
15894 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15895 }
15896 if (it == NULL)
15897 return NULL;
15898 it->it_index = 0;
15899 Py_INCREF(seq);
15900 it->it_seq = seq;
15901 _PyObject_GC_TRACK(it);
15902 return (PyObject *)it;
15903 }
15904
15905 static int
encode_wstr_utf8(wchar_t * wstr,char ** str,const char * name)15906 encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
15907 {
15908 int res;
15909 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15910 if (res == -2) {
15911 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15912 return -1;
15913 }
15914 if (res < 0) {
15915 PyErr_NoMemory();
15916 return -1;
15917 }
15918 return 0;
15919 }
15920
15921
15922 static int
config_get_codec_name(wchar_t ** config_encoding)15923 config_get_codec_name(wchar_t **config_encoding)
15924 {
15925 char *encoding;
15926 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15927 return -1;
15928 }
15929
15930 PyObject *name_obj = NULL;
15931 PyObject *codec = _PyCodec_Lookup(encoding);
15932 PyMem_RawFree(encoding);
15933
15934 if (!codec)
15935 goto error;
15936
15937 name_obj = PyObject_GetAttrString(codec, "name");
15938 Py_CLEAR(codec);
15939 if (!name_obj) {
15940 goto error;
15941 }
15942
15943 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15944 Py_DECREF(name_obj);
15945 if (wname == NULL) {
15946 goto error;
15947 }
15948
15949 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15950 if (raw_wname == NULL) {
15951 PyMem_Free(wname);
15952 PyErr_NoMemory();
15953 goto error;
15954 }
15955
15956 PyMem_RawFree(*config_encoding);
15957 *config_encoding = raw_wname;
15958
15959 PyMem_Free(wname);
15960 return 0;
15961
15962 error:
15963 Py_XDECREF(codec);
15964 Py_XDECREF(name_obj);
15965 return -1;
15966 }
15967
15968
15969 static PyStatus
init_stdio_encoding(PyInterpreterState * interp)15970 init_stdio_encoding(PyInterpreterState *interp)
15971 {
15972 /* Update the stdio encoding to the normalized Python codec name. */
15973 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
15974 if (config_get_codec_name(&config->stdio_encoding) < 0) {
15975 return _PyStatus_ERR("failed to get the Python codec name "
15976 "of the stdio encoding");
15977 }
15978 return _PyStatus_OK();
15979 }
15980
15981
15982 static int
init_fs_codec(PyInterpreterState * interp)15983 init_fs_codec(PyInterpreterState *interp)
15984 {
15985 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
15986
15987 _Py_error_handler error_handler;
15988 error_handler = get_error_handler_wide(config->filesystem_errors);
15989 if (error_handler == _Py_ERROR_UNKNOWN) {
15990 PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
15991 return -1;
15992 }
15993
15994 char *encoding, *errors;
15995 if (encode_wstr_utf8(config->filesystem_encoding,
15996 &encoding,
15997 "filesystem_encoding") < 0) {
15998 return -1;
15999 }
16000
16001 if (encode_wstr_utf8(config->filesystem_errors,
16002 &errors,
16003 "filesystem_errors") < 0) {
16004 PyMem_RawFree(encoding);
16005 return -1;
16006 }
16007
16008 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16009 PyMem_RawFree(fs_codec->encoding);
16010 fs_codec->encoding = encoding;
16011 /* encoding has been normalized by init_fs_encoding() */
16012 fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16013 PyMem_RawFree(fs_codec->errors);
16014 fs_codec->errors = errors;
16015 fs_codec->error_handler = error_handler;
16016
16017 #ifdef _Py_FORCE_UTF8_FS_ENCODING
16018 assert(fs_codec->utf8 == 1);
16019 #endif
16020
16021 /* At this point, PyUnicode_EncodeFSDefault() and
16022 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16023 the C implementation of the filesystem encoding. */
16024
16025 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16026 global configuration variables. */
16027 if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16028 fs_codec->errors) < 0) {
16029 PyErr_NoMemory();
16030 return -1;
16031 }
16032 return 0;
16033 }
16034
16035
16036 static PyStatus
init_fs_encoding(PyThreadState * tstate)16037 init_fs_encoding(PyThreadState *tstate)
16038 {
16039 PyInterpreterState *interp = tstate->interp;
16040
16041 /* Update the filesystem encoding to the normalized Python codec name.
16042 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16043 (Python codec name). */
16044 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
16045 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
16046 _Py_DumpPathConfig(tstate);
16047 return _PyStatus_ERR("failed to get the Python codec "
16048 "of the filesystem encoding");
16049 }
16050
16051 if (init_fs_codec(interp) < 0) {
16052 return _PyStatus_ERR("cannot initialize filesystem codec");
16053 }
16054 return _PyStatus_OK();
16055 }
16056
16057
16058 PyStatus
_PyUnicode_InitEncodings(PyThreadState * tstate)16059 _PyUnicode_InitEncodings(PyThreadState *tstate)
16060 {
16061 PyStatus status = init_fs_encoding(tstate);
16062 if (_PyStatus_EXCEPTION(status)) {
16063 return status;
16064 }
16065
16066 return init_stdio_encoding(tstate->interp);
16067 }
16068
16069
16070 static void
_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec * fs_codec)16071 _PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
16072 {
16073 PyMem_RawFree(fs_codec->encoding);
16074 fs_codec->encoding = NULL;
16075 fs_codec->utf8 = 0;
16076 PyMem_RawFree(fs_codec->errors);
16077 fs_codec->errors = NULL;
16078 fs_codec->error_handler = _Py_ERROR_UNKNOWN;
16079 }
16080
16081
#ifdef MS_WINDOWS
/* Switch the current interpreter to the legacy Windows filesystem
   encoding: "mbcs" with the "replace" error handler (PEP 529).

   The two configuration strings are replaced and the filesystem codec is
   re-initialized through init_fs_codec().  Return that function's
   result, or -1 with MemoryError set if the new strings cannot be
   allocated (the configuration is then left untouched). */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Set the filesystem encoding to mbcs/replace (PEP 529) */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (new_encoding == NULL || new_errors == NULL) {
        /* Release whichever of the two copies did succeed. */
        PyMem_RawFree(new_encoding);
        PyMem_RawFree(new_errors);
        PyErr_NoMemory();
        return -1;
    }

    PyMem_RawFree(config->filesystem_encoding);
    config->filesystem_encoding = new_encoding;

    PyMem_RawFree(config->filesystem_errors);
    config->filesystem_errors = new_errors;

    return init_fs_codec(interp);
}
#endif
16107
16108
#ifdef Py_DEBUG
/* Debug-only helper: return 1 while the `interned` dict does not exist
   (it is NULL before first use and after _PyUnicode_ClearInterned()),
   0 otherwise. */
static inline int
unicode_is_finalizing(void)
{
    return interned == NULL;
}
#endif
16116
16117
16118 void
_PyUnicode_FiniTypes(PyInterpreterState * interp)16119 _PyUnicode_FiniTypes(PyInterpreterState *interp)
16120 {
16121 if (!_Py_IsMainInterpreter(interp)) {
16122 return;
16123 }
16124
16125 _PyStaticType_Dealloc(&EncodingMapType);
16126 _PyStaticType_Dealloc(&PyFieldNameIter_Type);
16127 _PyStaticType_Dealloc(&PyFormatterIter_Type);
16128 }
16129
16130
/* Free the cached secondary representations (wstr, utf8) of a compact
   string object without freeing the object itself.

   A wstr pointer that refers to the character data stored directly
   behind the object header is not freed. */
static void
unicode_static_dealloc(PyObject *op)
{
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);

    assert(ascii->state.compact);

    if (ascii->state.ascii) {
        /* ASCII object: only the wstr cache is handled. */
        if (ascii->wstr != NULL) {
            PyObject_Free(ascii->wstr);
            ascii->wstr = NULL;
        }
        return;
    }

    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    void *inline_data = (void *)(compact + 1);

    if (ascii->wstr != NULL && ascii->wstr != inline_data) {
        PyObject_Free(ascii->wstr);
        ascii->wstr = NULL;
        compact->wstr_length = 0;
    }
    if (compact->utf8 != NULL) {
        PyObject_Free(compact->utf8);
        compact->utf8 = NULL;
        compact->utf8_length = 0;
    }
}
16158
16159
16160 void
_PyUnicode_Fini(PyInterpreterState * interp)16161 _PyUnicode_Fini(PyInterpreterState *interp)
16162 {
16163 struct _Py_unicode_state *state = &interp->unicode;
16164
16165 if (_Py_IsMainInterpreter(interp)) {
16166 // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
16167 assert(interned == NULL);
16168 // bpo-47182: force a unicodedata CAPI capsule re-import on
16169 // subsequent initialization of main interpreter.
16170 ucnhash_capi = NULL;
16171 }
16172
16173 _PyUnicode_FiniEncodings(&state->fs_codec);
16174
16175 unicode_clear_identifiers(state);
16176
16177 // Clear the single character singletons
16178 for (int i = 0; i < 128; i++) {
16179 unicode_static_dealloc((PyObject*)&_Py_SINGLETON(strings).ascii[i]);
16180 }
16181 for (int i = 0; i < 128; i++) {
16182 unicode_static_dealloc((PyObject*)&_Py_SINGLETON(strings).latin1[i]);
16183 }
16184 }
16185
16186
16187 void
_PyStaticUnicode_Dealloc(PyObject * op)16188 _PyStaticUnicode_Dealloc(PyObject *op)
16189 {
16190 unicode_static_dealloc(op);
16191 }
16192
16193
16194 /* A _string module, to export formatter_parser and formatter_field_name_split
16195 to the string.Formatter class implemented in Python. */
16196
16197 static PyMethodDef _string_methods[] = {
16198 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16199 METH_O, PyDoc_STR("split the argument as a field name")},
16200 {"formatter_parser", (PyCFunction) formatter_parser,
16201 METH_O, PyDoc_STR("parse the argument as a format string")},
16202 {NULL, NULL}
16203 };
16204
/* Definition of the `_string` module: it exposes the functions listed in
   _string_methods and declares no per-module state (m_size is 0). */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    .m_name = "_string",
    .m_doc = PyDoc_STR("string helper module"),
    .m_size = 0,
    .m_methods = _string_methods,
};
16212
16213 PyMODINIT_FUNC
PyInit__string(void)16214 PyInit__string(void)
16215 {
16216 return PyModuleDef_Init(&_string_module);
16217 }
16218
16219
16220 #ifdef __cplusplus
16221 }
16222 #endif
16223