1 #ifndef Py_CPYTHON_UNICODEOBJECT_H
2 # error "this header file must not be included directly"
3 #endif
4
5 /* Py_UNICODE was the native Unicode storage format (code unit) used by
6 Python and represents a single Unicode element in the Unicode type.
7 With PEP 393, Py_UNICODE is deprecated and replaced with a
8 typedef to wchar_t. */
9 Py_DEPRECATED(3.13) typedef wchar_t PY_UNICODE_TYPE;
10 Py_DEPRECATED(3.13) typedef wchar_t Py_UNICODE;
11
12
/* --- Internal Unicode Operations ---------------------------------------- */

15 // Static inline functions to work with surrogates
Py_UNICODE_IS_SURROGATE(Py_UCS4 ch)16 static inline int Py_UNICODE_IS_SURROGATE(Py_UCS4 ch) {
17 return (0xD800 <= ch && ch <= 0xDFFF);
18 }
// Return non-zero if ch is a high surrogate (U+D800..U+DBFF), 0 otherwise.
static inline int Py_UNICODE_IS_HIGH_SURROGATE(Py_UCS4 ch) {
    return (0xD800 <= ch && ch <= 0xDBFF);
}
// Return non-zero if ch is a low surrogate (U+DC00..U+DFFF), 0 otherwise.
static inline int Py_UNICODE_IS_LOW_SURROGATE(Py_UCS4 ch) {
    return (0xDC00 <= ch && ch <= 0xDFFF);
}

26 // Join two surrogate characters and return a single Py_UCS4 value.
Py_UNICODE_JOIN_SURROGATES(Py_UCS4 high,Py_UCS4 low)27 static inline Py_UCS4 Py_UNICODE_JOIN_SURROGATES(Py_UCS4 high, Py_UCS4 low) {
28 assert(Py_UNICODE_IS_HIGH_SURROGATE(high));
29 assert(Py_UNICODE_IS_LOW_SURROGATE(low));
30 return 0x10000 + (((high & 0x03FF) << 10) | (low & 0x03FF));
31 }
32
33 // High surrogate = top 10 bits added to 0xD800.
34 // The character must be in the range [U+10000; U+10ffff].
Py_UNICODE_HIGH_SURROGATE(Py_UCS4 ch)35 static inline Py_UCS4 Py_UNICODE_HIGH_SURROGATE(Py_UCS4 ch) {
36 assert(0x10000 <= ch && ch <= 0x10ffff);
37 return (0xD800 - (0x10000 >> 10) + (ch >> 10));
38 }
39
40 // Low surrogate = bottom 10 bits added to 0xDC00.
41 // The character must be in the range [U+10000; U+10ffff].
Py_UNICODE_LOW_SURROGATE(Py_UCS4 ch)42 static inline Py_UCS4 Py_UNICODE_LOW_SURROGATE(Py_UCS4 ch) {
43 assert(0x10000 <= ch && ch <= 0x10ffff);
44 return (0xDC00 + (ch & 0x3FF));
45 }
46
47
/* --- Unicode Type ------------------------------------------------------- */

50 /* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
51 structure. state.ascii and state.compact are set, and the data
52 immediately follow the structure. utf8_length can be found
53 in the length field; the utf8 pointer is equal to the data pointer. */
54 typedef struct {
55 /* There are 4 forms of Unicode strings:
56
57 - compact ascii:
58
59 * structure = PyASCIIObject
60 * test: PyUnicode_IS_COMPACT_ASCII(op)
61 * kind = PyUnicode_1BYTE_KIND
62 * compact = 1
63 * ascii = 1
64 * (length is the length of the utf8)
65 * (data starts just after the structure)
66 * (since ASCII is decoded from UTF-8, the utf8 string are the data)
67
68 - compact:
69
70 * structure = PyCompactUnicodeObject
71 * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
72 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
73 PyUnicode_4BYTE_KIND
74 * compact = 1
75 * ascii = 0
76 * utf8 is not shared with data
77 * utf8_length = 0 if utf8 is NULL
78 * (data starts just after the structure)
79
80 - legacy string:
81
82 * structure = PyUnicodeObject structure
83 * test: !PyUnicode_IS_COMPACT(op)
84 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
85 PyUnicode_4BYTE_KIND
86 * compact = 0
87 * data.any is not NULL
88 * utf8 is shared and utf8_length = length with data.any if ascii = 1
89 * utf8_length = 0 if utf8 is NULL
90
91 Compact strings use only one memory block (structure + characters),
92 whereas legacy strings use one block for the structure and one block
93 for characters.
94
95 Legacy strings are created by subclasses of Unicode.
96
97 See also _PyUnicode_CheckConsistency().
98 */
99 PyObject_HEAD
100 Py_ssize_t length; /* Number of code points in the string */
101 Py_hash_t hash; /* Hash value; -1 if not set */
102 struct {
103 /* If interned is non-zero, the two references from the
104 dictionary to this object are *not* counted in ob_refcnt.
105 The possible values here are:
106 0: Not Interned
107 1: Interned
108 2: Interned and Immortal
109 3: Interned, Immortal, and Static
110 This categorization allows the runtime to determine the right
111 cleanup mechanism at runtime shutdown. */
112 unsigned int interned:2;
113 /* Character size:
114
115 - PyUnicode_1BYTE_KIND (1):
116
117 * character type = Py_UCS1 (8 bits, unsigned)
118 * all characters are in the range U+0000-U+00FF (latin1)
119 * if ascii is set, all characters are in the range U+0000-U+007F
120 (ASCII), otherwise at least one character is in the range
121 U+0080-U+00FF
122
123 - PyUnicode_2BYTE_KIND (2):
124
125 * character type = Py_UCS2 (16 bits, unsigned)
126 * all characters are in the range U+0000-U+FFFF (BMP)
127 * at least one character is in the range U+0100-U+FFFF
128
129 - PyUnicode_4BYTE_KIND (4):
130
131 * character type = Py_UCS4 (32 bits, unsigned)
132 * all characters are in the range U+0000-U+10FFFF
133 * at least one character is in the range U+10000-U+10FFFF
134 */
135 unsigned int kind:3;
136 /* Compact is with respect to the allocation scheme. Compact unicode
137 objects only require one memory block while non-compact objects use
138 one block for the PyUnicodeObject struct and another for its data
139 buffer. */
140 unsigned int compact:1;
141 /* The string only contains characters in the range U+0000-U+007F (ASCII)
142 and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
143 set, use the PyASCIIObject structure. */
144 unsigned int ascii:1;
145 /* The object is statically allocated. */
146 unsigned int statically_allocated:1;
147 /* Padding to ensure that PyUnicode_DATA() is always aligned to
148 4 bytes (see issue #19537 on m68k). */
149 unsigned int :24;
150 } state;
151 } PyASCIIObject;
152
153 /* Non-ASCII strings allocated through PyUnicode_New use the
154 PyCompactUnicodeObject structure. state.compact is set, and the data
155 immediately follow the structure. */
156 typedef struct {
157 PyASCIIObject _base;
158 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
159 * terminating \0. */
160 char *utf8; /* UTF-8 representation (null-terminated) */
161 } PyCompactUnicodeObject;
162
163 /* Object format for Unicode subclasses. */
164 typedef struct {
165 PyCompactUnicodeObject _base;
166 union {
167 void *any;
168 Py_UCS1 *latin1;
169 Py_UCS2 *ucs2;
170 Py_UCS4 *ucs4;
171 } data; /* Canonical, smallest-form Unicode buffer */
172 } PyUnicodeObject;
173
174
/* Cast 'op' to a pointer to one of the Unicode object structures.

   Each macro first asserts PyUnicode_Check(op) and then casts with
   _Py_CAST(). 'op' appears twice in the expansion, so it is evaluated twice
   when assertions are enabled: do not pass an expression with side
   effects. */
#define _PyASCIIObject_CAST(op) \
    (assert(PyUnicode_Check(op)), \
     _Py_CAST(PyASCIIObject*, (op)))
#define _PyCompactUnicodeObject_CAST(op) \
    (assert(PyUnicode_Check(op)), \
     _Py_CAST(PyCompactUnicodeObject*, (op)))
#define _PyUnicodeObject_CAST(op) \
    (assert(PyUnicode_Check(op)), \
     _Py_CAST(PyUnicodeObject*, (op)))


/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */

/* Values for PyASCIIObject.state: */

/* Interning state, as stored in the 2-bit state.interned field. */
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1
#define SSTATE_INTERNED_IMMORTAL 2
#define SSTATE_INTERNED_IMMORTAL_STATIC 3

196 /* Use only if you know it's a string */
PyUnicode_CHECK_INTERNED(PyObject * op)197 static inline unsigned int PyUnicode_CHECK_INTERNED(PyObject *op) {
198 return _PyASCIIObject_CAST(op)->state.interned;
199 }
200 #define PyUnicode_CHECK_INTERNED(op) PyUnicode_CHECK_INTERNED(_PyObject_CAST(op))
201
202 /* For backward compatibility */
PyUnicode_IS_READY(PyObject * Py_UNUSED (op))203 static inline unsigned int PyUnicode_IS_READY(PyObject* Py_UNUSED(op)) {
204 return 1;
205 }
206 #define PyUnicode_IS_READY(op) PyUnicode_IS_READY(_PyObject_CAST(op))
207
208 /* Return true if the string contains only ASCII characters, or 0 if not. The
209 string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be
210 ready. */
PyUnicode_IS_ASCII(PyObject * op)211 static inline unsigned int PyUnicode_IS_ASCII(PyObject *op) {
212 return _PyASCIIObject_CAST(op)->state.ascii;
213 }
214 #define PyUnicode_IS_ASCII(op) PyUnicode_IS_ASCII(_PyObject_CAST(op))
215
216 /* Return true if the string is compact or 0 if not.
217 No type checks or Ready calls are performed. */
PyUnicode_IS_COMPACT(PyObject * op)218 static inline unsigned int PyUnicode_IS_COMPACT(PyObject *op) {
219 return _PyASCIIObject_CAST(op)->state.compact;
220 }
221 #define PyUnicode_IS_COMPACT(op) PyUnicode_IS_COMPACT(_PyObject_CAST(op))
222
223 /* Return true if the string is a compact ASCII string (use PyASCIIObject
224 structure), or 0 if not. No type checks or Ready calls are performed. */
PyUnicode_IS_COMPACT_ASCII(PyObject * op)225 static inline int PyUnicode_IS_COMPACT_ASCII(PyObject *op) {
226 return (_PyASCIIObject_CAST(op)->state.ascii && PyUnicode_IS_COMPACT(op));
227 }
228 #define PyUnicode_IS_COMPACT_ASCII(op) PyUnicode_IS_COMPACT_ASCII(_PyObject_CAST(op))
229
/* Character width of a string's buffer. Each constant equals the size in
   bytes of one character: 1 (Py_UCS1), 2 (Py_UCS2) or 4 (Py_UCS4). */
enum PyUnicode_Kind {
/* Return values of the PyUnicode_KIND() function: */
    PyUnicode_1BYTE_KIND = 1,
    PyUnicode_2BYTE_KIND = 2,
    PyUnicode_4BYTE_KIND = 4
};

// PyUnicode_KIND(): Return one of the PyUnicode_*_KIND values defined above
// (the state.kind field of the string).
//
// gh-89653: Converting this macro to a static inline function would introduce
// new compiler warnings on "kind < PyUnicode_KIND(str)" (compare signed and
// unsigned numbers) where kind type is an int or on
// "unsigned int kind = PyUnicode_KIND(str)" (cast signed to unsigned).
#define PyUnicode_KIND(op) _Py_RVALUE(_PyASCIIObject_CAST(op)->state.kind)

245 /* Return a void pointer to the raw unicode buffer. */
_PyUnicode_COMPACT_DATA(PyObject * op)246 static inline void* _PyUnicode_COMPACT_DATA(PyObject *op) {
247 if (PyUnicode_IS_ASCII(op)) {
248 return _Py_STATIC_CAST(void*, (_PyASCIIObject_CAST(op) + 1));
249 }
250 return _Py_STATIC_CAST(void*, (_PyCompactUnicodeObject_CAST(op) + 1));
251 }
252
/* Return a void pointer to the raw unicode buffer of a non-compact string,
   i.e. the data.any pointer of the PyUnicodeObject. The string must not be
   compact and its buffer must not be NULL; both are only verified by
   assert(). */
static inline void* _PyUnicode_NONCOMPACT_DATA(PyObject *op) {
    void *data;
    assert(!PyUnicode_IS_COMPACT(op));
    data = _PyUnicodeObject_CAST(op)->data.any;
    assert(data != NULL);
    return data;
}

/* Return a void pointer to the raw unicode buffer of the string, using the
   compact or the non-compact layout depending on PyUnicode_IS_COMPACT(op). */
static inline void* PyUnicode_DATA(PyObject *op) {
    if (PyUnicode_IS_COMPACT(op)) {
        return _PyUnicode_COMPACT_DATA(op);
    }
    return _PyUnicode_NONCOMPACT_DATA(op);
}
#define PyUnicode_DATA(op) PyUnicode_DATA(_PyObject_CAST(op))

/* Return pointers to the canonical representation cast to unsigned char,
   Py_UCS2, or Py_UCS4 for direct character access.
   No checks are performed, use PyUnicode_KIND() before to ensure
   these will work correctly. */

#define PyUnicode_1BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS1*, PyUnicode_DATA(op))
#define PyUnicode_2BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS2*, PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS4*, PyUnicode_DATA(op))

278 /* Returns the length of the unicode string. */
PyUnicode_GET_LENGTH(PyObject * op)279 static inline Py_ssize_t PyUnicode_GET_LENGTH(PyObject *op) {
280 return _PyASCIIObject_CAST(op)->length;
281 }
282 #define PyUnicode_GET_LENGTH(op) PyUnicode_GET_LENGTH(_PyObject_CAST(op))
283
284 /* Write into the canonical representation, this function does not do any sanity
285 checks and is intended for usage in loops. The caller should cache the
286 kind and data pointers obtained from other function calls.
287 index is the index in the string (starts at 0) and value is the new
288 code point value which should be written to that location. */
PyUnicode_WRITE(int kind,void * data,Py_ssize_t index,Py_UCS4 value)289 static inline void PyUnicode_WRITE(int kind, void *data,
290 Py_ssize_t index, Py_UCS4 value)
291 {
292 assert(index >= 0);
293 if (kind == PyUnicode_1BYTE_KIND) {
294 assert(value <= 0xffU);
295 _Py_STATIC_CAST(Py_UCS1*, data)[index] = _Py_STATIC_CAST(Py_UCS1, value);
296 }
297 else if (kind == PyUnicode_2BYTE_KIND) {
298 assert(value <= 0xffffU);
299 _Py_STATIC_CAST(Py_UCS2*, data)[index] = _Py_STATIC_CAST(Py_UCS2, value);
300 }
301 else {
302 assert(kind == PyUnicode_4BYTE_KIND);
303 assert(value <= 0x10ffffU);
304 _Py_STATIC_CAST(Py_UCS4*, data)[index] = value;
305 }
306 }
307 #define PyUnicode_WRITE(kind, data, index, value) \
308 PyUnicode_WRITE(_Py_STATIC_CAST(int, kind), _Py_CAST(void*, data), \
309 (index), _Py_STATIC_CAST(Py_UCS4, value))
310
311 /* Read a code point from the string's canonical representation. No checks
312 or ready calls are performed. */
PyUnicode_READ(int kind,const void * data,Py_ssize_t index)313 static inline Py_UCS4 PyUnicode_READ(int kind,
314 const void *data, Py_ssize_t index)
315 {
316 assert(index >= 0);
317 if (kind == PyUnicode_1BYTE_KIND) {
318 return _Py_STATIC_CAST(const Py_UCS1*, data)[index];
319 }
320 if (kind == PyUnicode_2BYTE_KIND) {
321 return _Py_STATIC_CAST(const Py_UCS2*, data)[index];
322 }
323 assert(kind == PyUnicode_4BYTE_KIND);
324 return _Py_STATIC_CAST(const Py_UCS4*, data)[index];
325 }
326 #define PyUnicode_READ(kind, data, index) \
327 PyUnicode_READ(_Py_STATIC_CAST(int, kind), \
328 _Py_STATIC_CAST(const void*, data), \
329 (index))
330
331 /* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
332 calls PyUnicode_KIND() and might call it twice. For single reads, use
333 PyUnicode_READ_CHAR, for multiple consecutive reads callers should
334 cache kind and use PyUnicode_READ instead. */
PyUnicode_READ_CHAR(PyObject * unicode,Py_ssize_t index)335 static inline Py_UCS4 PyUnicode_READ_CHAR(PyObject *unicode, Py_ssize_t index)
336 {
337 int kind;
338
339 assert(index >= 0);
340 // Tolerate reading the NUL character at str[len(str)]
341 assert(index <= PyUnicode_GET_LENGTH(unicode));
342
343 kind = PyUnicode_KIND(unicode);
344 if (kind == PyUnicode_1BYTE_KIND) {
345 return PyUnicode_1BYTE_DATA(unicode)[index];
346 }
347 if (kind == PyUnicode_2BYTE_KIND) {
348 return PyUnicode_2BYTE_DATA(unicode)[index];
349 }
350 assert(kind == PyUnicode_4BYTE_KIND);
351 return PyUnicode_4BYTE_DATA(unicode)[index];
352 }
353 #define PyUnicode_READ_CHAR(unicode, index) \
354 PyUnicode_READ_CHAR(_PyObject_CAST(unicode), (index))
355
356 /* Return a maximum character value which is suitable for creating another
357 string based on op. This is always an approximation but more efficient
358 than iterating over the string. */
PyUnicode_MAX_CHAR_VALUE(PyObject * op)359 static inline Py_UCS4 PyUnicode_MAX_CHAR_VALUE(PyObject *op)
360 {
361 int kind;
362
363 if (PyUnicode_IS_ASCII(op)) {
364 return 0x7fU;
365 }
366
367 kind = PyUnicode_KIND(op);
368 if (kind == PyUnicode_1BYTE_KIND) {
369 return 0xffU;
370 }
371 if (kind == PyUnicode_2BYTE_KIND) {
372 return 0xffffU;
373 }
374 assert(kind == PyUnicode_4BYTE_KIND);
375 return 0x10ffffU;
376 }
377 #define PyUnicode_MAX_CHAR_VALUE(op) \
378 PyUnicode_MAX_CHAR_VALUE(_PyObject_CAST(op))
379
380
/* === Public API ========================================================= */

383 /* With PEP 393, this is the recommended way to allocate a new unicode object.
384 This function will allocate the object and its buffer in a single memory
385 block. Objects created using this function are not resizable. */
386 PyAPI_FUNC(PyObject*) PyUnicode_New(
387 Py_ssize_t size, /* Number of code points in the new string */
388 Py_UCS4 maxchar /* maximum code point value in the string */
389 );
390
391 /* For backward compatibility */
PyUnicode_READY(PyObject * Py_UNUSED (op))392 static inline int PyUnicode_READY(PyObject* Py_UNUSED(op))
393 {
394 return 0;
395 }
396 #define PyUnicode_READY(op) PyUnicode_READY(_PyObject_CAST(op))
397
398 /* Copy character from one unicode object into another, this function performs
399 character conversion when necessary and falls back to memcpy() if possible.
400
401 Fail if to is too small (smaller than *how_many* or smaller than
402 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
403 kind(to), or if *to* has more than 1 reference.
404
405 Return the number of written character, or return -1 and raise an exception
406 on error.
407
408 Pseudo-code:
409
410 how_many = min(how_many, len(from) - from_start)
411 to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
412 return how_many
413
414 Note: The function doesn't write a terminating null character.
415 */
416 PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
417 PyObject *to,
418 Py_ssize_t to_start,
419 PyObject *from,
420 Py_ssize_t from_start,
421 Py_ssize_t how_many
422 );
423
424 /* Fill a string with a character: write fill_char into
425 unicode[start:start+length].
426
427 Fail if fill_char is bigger than the string maximum character, or if the
428 string has more than 1 reference.
429
430 Return the number of written character, or return -1 and raise an exception
431 on error. */
432 PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill(
433 PyObject *unicode,
434 Py_ssize_t start,
435 Py_ssize_t length,
436 Py_UCS4 fill_char
437 );
438
439 /* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
440 Scan the string to find the maximum character. */
441 PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
442 int kind,
443 const void *buffer,
444 Py_ssize_t size);
445
446
/* --- _PyUnicodeWriter API ----------------------------------------------- */

449 typedef struct {
450 PyObject *buffer;
451 void *data;
452 int kind;
453 Py_UCS4 maxchar;
454 Py_ssize_t size;
455 Py_ssize_t pos;
456
457 /* minimum number of allocated characters (default: 0) */
458 Py_ssize_t min_length;
459
460 /* minimum character (default: 127, ASCII) */
461 Py_UCS4 min_char;
462
463 /* If non-zero, overallocate the buffer (default: 0). */
464 unsigned char overallocate;
465
466 /* If readonly is 1, buffer is a shared string (cannot be modified)
467 and size is set to 0. */
468 unsigned char readonly;
469 } _PyUnicodeWriter ;
470
471 // Initialize a Unicode writer.
472 //
473 // By default, the minimum buffer size is 0 character and overallocation is
474 // disabled. Set min_length, min_char and overallocate attributes to control
475 // the allocation of the buffer.
476 PyAPI_FUNC(void)
477 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer);
478
/* Prepare the buffer to write 'length' characters
   with the specified maximum character.

   Evaluates to 0 without any call when MAXCHAR does not exceed the writer's
   maxchar and LENGTH fits in the remaining space (size - pos), or when
   LENGTH is 0; otherwise calls _PyUnicodeWriter_PrepareInternal().

   This is a macro: WRITER, LENGTH and MAXCHAR may each be evaluated more
   than once, so do not pass expressions with side effects.

   Return 0 on success, raise an exception and return -1 on error. */
#define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR)             \
    (((MAXCHAR) <= (WRITER)->maxchar                                  \
      && (LENGTH) <= (WRITER)->size - (WRITER)->pos)                  \
     ? 0                                                              \
     : (((LENGTH) == 0)                                               \
        ? 0                                                           \
        : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR))))

491 /* Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro
492 instead. */
493 PyAPI_FUNC(int)
494 _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
495 Py_ssize_t length, Py_UCS4 maxchar);
496
/* Prepare the buffer to have at least the kind KIND.
   For example, kind=PyUnicode_2BYTE_KIND ensures that the writer will
   support characters in range U+0000-U+FFFF.

   Evaluates to 0 without any call when the writer's kind is already at
   least KIND; otherwise calls _PyUnicodeWriter_PrepareKindInternal().

   This is a macro: WRITER and KIND may each be evaluated twice, so do not
   pass expressions with side effects.

   Return 0 on success, raise an exception and return -1 on error. */
#define _PyUnicodeWriter_PrepareKind(WRITER, KIND)                    \
    ((KIND) <= (WRITER)->kind                                         \
     ? 0                                                              \
     : _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND)))

507 /* Don't call this function directly, use the _PyUnicodeWriter_PrepareKind()
508 macro instead. */
509 PyAPI_FUNC(int)
510 _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
511 int kind);
512
513 /* Append a Unicode character.
514 Return 0 on success, raise an exception and return -1 on error. */
515 PyAPI_FUNC(int)
516 _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer,
517 Py_UCS4 ch
518 );
519
520 /* Append a Unicode string.
521 Return 0 on success, raise an exception and return -1 on error. */
522 PyAPI_FUNC(int)
523 _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer,
524 PyObject *str /* Unicode string */
525 );
526
527 /* Append a substring of a Unicode string.
528 Return 0 on success, raise an exception and return -1 on error. */
529 PyAPI_FUNC(int)
530 _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer,
531 PyObject *str, /* Unicode string */
532 Py_ssize_t start,
533 Py_ssize_t end
534 );
535
536 /* Append an ASCII-encoded byte string.
537 Return 0 on success, raise an exception and return -1 on error. */
538 PyAPI_FUNC(int)
539 _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
540 const char *str, /* ASCII-encoded byte string */
541 Py_ssize_t len /* number of bytes, or -1 if unknown */
542 );
543
544 /* Append a latin1-encoded byte string.
545 Return 0 on success, raise an exception and return -1 on error. */
546 PyAPI_FUNC(int)
547 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
548 const char *str, /* latin1-encoded byte string */
549 Py_ssize_t len /* length in bytes */
550 );
551
552 /* Get the value of the writer as a Unicode string. Clear the
553 buffer of the writer. Raise an exception and return NULL
554 on error. */
555 PyAPI_FUNC(PyObject *)
556 _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer);
557
558 /* Deallocate memory of a writer (clear its internal buffer). */
559 PyAPI_FUNC(void)
560 _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer);
561
562
/* --- Manage the default encoding ---------------------------------------- */

565 /* Returns a pointer to the default encoding (UTF-8) of the
566 Unicode object unicode.
567
568 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
569 in the unicodeobject.
570
571 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
572 support the previous internal function with the same behaviour.
573
574 Use of this API is DEPRECATED since no size information can be
575 extracted from the returned data.
576 */
577
578 PyAPI_FUNC(const char *) PyUnicode_AsUTF8(PyObject *unicode);
579
580 // Alias kept for backward compatibility
581 #define _PyUnicode_AsString PyUnicode_AsUTF8
582
583
/* === Characters Type APIs =============================================== */

586 /* These should not be used directly. Use the Py_UNICODE_IS* and
587 Py_UNICODE_TO* macros instead.
588
589 These APIs are implemented in Objects/unicodectype.c.
590
591 */
592
593 PyAPI_FUNC(int) _PyUnicode_IsLowercase(
594 Py_UCS4 ch /* Unicode character */
595 );
596
597 PyAPI_FUNC(int) _PyUnicode_IsUppercase(
598 Py_UCS4 ch /* Unicode character */
599 );
600
601 PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
602 Py_UCS4 ch /* Unicode character */
603 );
604
605 PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
606 const Py_UCS4 ch /* Unicode character */
607 );
608
609 PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
610 const Py_UCS4 ch /* Unicode character */
611 );
612
613 PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
614 Py_UCS4 ch /* Unicode character */
615 );
616
617 PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
618 Py_UCS4 ch /* Unicode character */
619 );
620
621 PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
622 Py_UCS4 ch /* Unicode character */
623 );
624
625 PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
626 Py_UCS4 ch /* Unicode character */
627 );
628
629 PyAPI_FUNC(int) _PyUnicode_ToDigit(
630 Py_UCS4 ch /* Unicode character */
631 );
632
633 PyAPI_FUNC(double) _PyUnicode_ToNumeric(
634 Py_UCS4 ch /* Unicode character */
635 );
636
637 PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
638 Py_UCS4 ch /* Unicode character */
639 );
640
641 PyAPI_FUNC(int) _PyUnicode_IsDigit(
642 Py_UCS4 ch /* Unicode character */
643 );
644
645 PyAPI_FUNC(int) _PyUnicode_IsNumeric(
646 Py_UCS4 ch /* Unicode character */
647 );
648
649 PyAPI_FUNC(int) _PyUnicode_IsPrintable(
650 Py_UCS4 ch /* Unicode character */
651 );
652
653 PyAPI_FUNC(int) _PyUnicode_IsAlpha(
654 Py_UCS4 ch /* Unicode character */
655 );
656
657 // Helper array used by Py_UNICODE_ISSPACE().
658 PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
659
660 // Since splitting on whitespace is an important use case, and
661 // whitespace in most situations is solely ASCII whitespace, we
662 // optimize for the common case by using a quick look-up table
663 // _Py_ascii_whitespace (see below) with an inlined check.
Py_UNICODE_ISSPACE(Py_UCS4 ch)664 static inline int Py_UNICODE_ISSPACE(Py_UCS4 ch) {
665 if (ch < 128) {
666 return _Py_ascii_whitespace[ch];
667 }
668 return _PyUnicode_IsWhitespace(ch);
669 }
670
// Character classification and conversion macros. Each one expands to a
// call of the corresponding _PyUnicode_* function.
#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)

#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)

#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)

// Return non-zero if ch is alphanumeric, that is if any of
// Py_UNICODE_ISALPHA(), Py_UNICODE_ISDECIMAL(), Py_UNICODE_ISDIGIT() or
// Py_UNICODE_ISNUMERIC() is true for it; return 0 otherwise. The checks are
// made in that order and stop at the first one that succeeds.
static inline int Py_UNICODE_ISALNUM(Py_UCS4 ch) {
   return (Py_UNICODE_ISALPHA(ch)
           || Py_UNICODE_ISDECIMAL(ch)
           || Py_UNICODE_ISDIGIT(ch)
           || Py_UNICODE_ISNUMERIC(ch));
}


/* === Misc functions ===================================================== */

701 // Return an interned Unicode object for an Identifier; may fail if there is no
702 // memory.
703 PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);
704