• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #ifndef Py_CPYTHON_UNICODEOBJECT_H
2 #  error "this header file must not be included directly"
3 #endif
4 
5 /* Py_UNICODE was the native Unicode storage format (code unit) used by
6    Python and represents a single Unicode element in the Unicode type.
7    With PEP 393, Py_UNICODE is deprecated and replaced with a
8    typedef to wchar_t. */
9 Py_DEPRECATED(3.13) typedef wchar_t PY_UNICODE_TYPE;
10 Py_DEPRECATED(3.13) typedef wchar_t Py_UNICODE;
11 
12 
13 /* --- Internal Unicode Operations ---------------------------------------- */
14 
15 // Static inline functions to work with surrogates
Py_UNICODE_IS_SURROGATE(Py_UCS4 ch)16 static inline int Py_UNICODE_IS_SURROGATE(Py_UCS4 ch) {
17     return (0xD800 <= ch && ch <= 0xDFFF);
18 }
// Return non-zero if ch lies in the high-surrogate range U+D800-U+DBFF,
// 0 otherwise.
static inline int Py_UNICODE_IS_HIGH_SURROGATE(Py_UCS4 ch) {
    return (0xD800 <= ch && ch <= 0xDBFF);
}
// Return non-zero if ch lies in the low-surrogate range U+DC00-U+DFFF,
// 0 otherwise.
static inline int Py_UNICODE_IS_LOW_SURROGATE(Py_UCS4 ch) {
    return (0xDC00 <= ch && ch <= 0xDFFF);
}

26 // Join two surrogate characters and return a single Py_UCS4 value.
Py_UNICODE_JOIN_SURROGATES(Py_UCS4 high,Py_UCS4 low)27 static inline Py_UCS4 Py_UNICODE_JOIN_SURROGATES(Py_UCS4 high, Py_UCS4 low)  {
28     assert(Py_UNICODE_IS_HIGH_SURROGATE(high));
29     assert(Py_UNICODE_IS_LOW_SURROGATE(low));
30     return 0x10000 + (((high & 0x03FF) << 10) | (low & 0x03FF));
31 }
32 
33 // High surrogate = top 10 bits added to 0xD800.
34 // The character must be in the range [U+10000; U+10ffff].
Py_UNICODE_HIGH_SURROGATE(Py_UCS4 ch)35 static inline Py_UCS4 Py_UNICODE_HIGH_SURROGATE(Py_UCS4 ch) {
36     assert(0x10000 <= ch && ch <= 0x10ffff);
37     return (0xD800 - (0x10000 >> 10) + (ch >> 10));
38 }
39 
40 // Low surrogate = bottom 10 bits added to 0xDC00.
41 // The character must be in the range [U+10000; U+10ffff].
Py_UNICODE_LOW_SURROGATE(Py_UCS4 ch)42 static inline Py_UCS4 Py_UNICODE_LOW_SURROGATE(Py_UCS4 ch) {
43     assert(0x10000 <= ch && ch <= 0x10ffff);
44     return (0xDC00 + (ch & 0x3FF));
45 }
46 
47 
48 /* --- Unicode Type ------------------------------------------------------- */
49 
50 /* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
51    structure. state.ascii and state.compact are set, and the data
52    immediately follow the structure. utf8_length can be found
53    in the length field; the utf8 pointer is equal to the data pointer. */
54 typedef struct {
55     /* There are 4 forms of Unicode strings:
56 
57        - compact ascii:
58 
59          * structure = PyASCIIObject
60          * test: PyUnicode_IS_COMPACT_ASCII(op)
61          * kind = PyUnicode_1BYTE_KIND
62          * compact = 1
63          * ascii = 1
64          * (length is the length of the utf8)
65          * (data starts just after the structure)
66          * (since ASCII is decoded from UTF-8, the utf8 string are the data)
67 
68        - compact:
69 
70          * structure = PyCompactUnicodeObject
71          * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
72          * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
73            PyUnicode_4BYTE_KIND
74          * compact = 1
75          * ascii = 0
76          * utf8 is not shared with data
77          * utf8_length = 0 if utf8 is NULL
78          * (data starts just after the structure)
79 
80        - legacy string:
81 
82          * structure = PyUnicodeObject structure
83          * test: !PyUnicode_IS_COMPACT(op)
84          * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
85            PyUnicode_4BYTE_KIND
86          * compact = 0
87          * data.any is not NULL
88          * utf8 is shared and utf8_length = length with data.any if ascii = 1
89          * utf8_length = 0 if utf8 is NULL
90 
91        Compact strings use only one memory block (structure + characters),
92        whereas legacy strings use one block for the structure and one block
93        for characters.
94 
95        Legacy strings are created by subclasses of Unicode.
96 
97        See also _PyUnicode_CheckConsistency().
98     */
99     PyObject_HEAD
100     Py_ssize_t length;          /* Number of code points in the string */
101     Py_hash_t hash;             /* Hash value; -1 if not set */
102     struct {
103         /* If interned is non-zero, the two references from the
104            dictionary to this object are *not* counted in ob_refcnt.
105            The possible values here are:
106                0: Not Interned
107                1: Interned
108                2: Interned and Immortal
109                3: Interned, Immortal, and Static
110            This categorization allows the runtime to determine the right
111            cleanup mechanism at runtime shutdown. */
112         unsigned int interned:2;
113         /* Character size:
114 
115            - PyUnicode_1BYTE_KIND (1):
116 
117              * character type = Py_UCS1 (8 bits, unsigned)
118              * all characters are in the range U+0000-U+00FF (latin1)
119              * if ascii is set, all characters are in the range U+0000-U+007F
120                (ASCII), otherwise at least one character is in the range
121                U+0080-U+00FF
122 
123            - PyUnicode_2BYTE_KIND (2):
124 
125              * character type = Py_UCS2 (16 bits, unsigned)
126              * all characters are in the range U+0000-U+FFFF (BMP)
127              * at least one character is in the range U+0100-U+FFFF
128 
129            - PyUnicode_4BYTE_KIND (4):
130 
131              * character type = Py_UCS4 (32 bits, unsigned)
132              * all characters are in the range U+0000-U+10FFFF
133              * at least one character is in the range U+10000-U+10FFFF
134          */
135         unsigned int kind:3;
136         /* Compact is with respect to the allocation scheme. Compact unicode
137            objects only require one memory block while non-compact objects use
138            one block for the PyUnicodeObject struct and another for its data
139            buffer. */
140         unsigned int compact:1;
141         /* The string only contains characters in the range U+0000-U+007F (ASCII)
142            and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
143            set, use the PyASCIIObject structure. */
144         unsigned int ascii:1;
145         /* The object is statically allocated. */
146         unsigned int statically_allocated:1;
147         /* Padding to ensure that PyUnicode_DATA() is always aligned to
148            4 bytes (see issue #19537 on m68k). */
149         unsigned int :24;
150     } state;
151 } PyASCIIObject;
152 
153 /* Non-ASCII strings allocated through PyUnicode_New use the
154    PyCompactUnicodeObject structure. state.compact is set, and the data
155    immediately follow the structure. */
156 typedef struct {
157     PyASCIIObject _base;
158     Py_ssize_t utf8_length;     /* Number of bytes in utf8, excluding the
159                                  * terminating \0. */
160     char *utf8;                 /* UTF-8 representation (null-terminated) */
161 } PyCompactUnicodeObject;
162 
163 /* Object format for Unicode subclasses. */
164 typedef struct {
165     PyCompactUnicodeObject _base;
166     union {
167         void *any;
168         Py_UCS1 *latin1;
169         Py_UCS2 *ucs2;
170         Py_UCS4 *ucs4;
171     } data;                     /* Canonical, smallest-form Unicode buffer */
172 } PyUnicodeObject;
173 
174 
/* Cast op to a pointer to one of the Unicode structures.  Each macro first
   asserts PyUnicode_Check(op), so the type is verified only in builds where
   assert() is active. */
#define _PyASCIIObject_CAST(op) \
    (assert(PyUnicode_Check(op)), \
     _Py_CAST(PyASCIIObject*, (op)))
#define _PyCompactUnicodeObject_CAST(op) \
    (assert(PyUnicode_Check(op)), \
     _Py_CAST(PyCompactUnicodeObject*, (op)))
#define _PyUnicodeObject_CAST(op) \
    (assert(PyUnicode_Check(op)), \
     _Py_CAST(PyUnicodeObject*, (op)))


/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */

/* Values for PyASCIIObject.state: */

/* Interning state (values of the state.interned bit field). */
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1
#define SSTATE_INTERNED_IMMORTAL 2
#define SSTATE_INTERNED_IMMORTAL_STATIC 3

196 /* Use only if you know it's a string */
PyUnicode_CHECK_INTERNED(PyObject * op)197 static inline unsigned int PyUnicode_CHECK_INTERNED(PyObject *op) {
198     return _PyASCIIObject_CAST(op)->state.interned;
199 }
200 #define PyUnicode_CHECK_INTERNED(op) PyUnicode_CHECK_INTERNED(_PyObject_CAST(op))
201 
202 /* For backward compatibility */
PyUnicode_IS_READY(PyObject * Py_UNUSED (op))203 static inline unsigned int PyUnicode_IS_READY(PyObject* Py_UNUSED(op)) {
204     return 1;
205 }
206 #define PyUnicode_IS_READY(op) PyUnicode_IS_READY(_PyObject_CAST(op))
207 
208 /* Return true if the string contains only ASCII characters, or 0 if not. The
209    string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be
210    ready. */
PyUnicode_IS_ASCII(PyObject * op)211 static inline unsigned int PyUnicode_IS_ASCII(PyObject *op) {
212     return _PyASCIIObject_CAST(op)->state.ascii;
213 }
214 #define PyUnicode_IS_ASCII(op) PyUnicode_IS_ASCII(_PyObject_CAST(op))
215 
216 /* Return true if the string is compact or 0 if not.
217    No type checks or Ready calls are performed. */
PyUnicode_IS_COMPACT(PyObject * op)218 static inline unsigned int PyUnicode_IS_COMPACT(PyObject *op) {
219     return _PyASCIIObject_CAST(op)->state.compact;
220 }
221 #define PyUnicode_IS_COMPACT(op) PyUnicode_IS_COMPACT(_PyObject_CAST(op))
222 
223 /* Return true if the string is a compact ASCII string (use PyASCIIObject
224    structure), or 0 if not.  No type checks or Ready calls are performed. */
PyUnicode_IS_COMPACT_ASCII(PyObject * op)225 static inline int PyUnicode_IS_COMPACT_ASCII(PyObject *op) {
226     return (_PyASCIIObject_CAST(op)->state.ascii && PyUnicode_IS_COMPACT(op));
227 }
228 #define PyUnicode_IS_COMPACT_ASCII(op) PyUnicode_IS_COMPACT_ASCII(_PyObject_CAST(op))
229 
enum PyUnicode_Kind {
/* Return values of the PyUnicode_KIND() function.  Each value equals the
   number of bytes per character given by its name. */
    PyUnicode_1BYTE_KIND = 1,
    PyUnicode_2BYTE_KIND = 2,
    PyUnicode_4BYTE_KIND = 4
};

// PyUnicode_KIND(): Return one of the PyUnicode_*_KIND values defined above
// (the state.kind field of the string).
//
// gh-89653: Converting this macro to a static inline function would introduce
// new compiler warnings on "kind < PyUnicode_KIND(str)" (compare signed and
// unsigned numbers) where kind type is an int or on
// "unsigned int kind = PyUnicode_KIND(str)" (cast signed to unsigned).
#define PyUnicode_KIND(op) _Py_RVALUE(_PyASCIIObject_CAST(op)->state.kind)

245 /* Return a void pointer to the raw unicode buffer. */
_PyUnicode_COMPACT_DATA(PyObject * op)246 static inline void* _PyUnicode_COMPACT_DATA(PyObject *op) {
247     if (PyUnicode_IS_ASCII(op)) {
248         return _Py_STATIC_CAST(void*, (_PyASCIIObject_CAST(op) + 1));
249     }
250     return _Py_STATIC_CAST(void*, (_PyCompactUnicodeObject_CAST(op) + 1));
251 }
252 
/* Return the data.any pointer of a non-compact string.  Both the
   "not compact" precondition and the non-NULL result are checked with
   assert() only. */
static inline void* _PyUnicode_NONCOMPACT_DATA(PyObject *op) {
    void *data;
    assert(!PyUnicode_IS_COMPACT(op));
    data = _PyUnicodeObject_CAST(op)->data.any;
    assert(data != NULL);
    return data;
}

/* Return a void pointer to the character buffer of op, choosing between
   the compact and the non-compact layout based on the compact flag. */
static inline void* PyUnicode_DATA(PyObject *op) {
    if (PyUnicode_IS_COMPACT(op)) {
        return _PyUnicode_COMPACT_DATA(op);
    }
    return _PyUnicode_NONCOMPACT_DATA(op);
}
#define PyUnicode_DATA(op) PyUnicode_DATA(_PyObject_CAST(op))

/* Return pointers to the canonical representation cast to Py_UCS1,
   Py_UCS2, or Py_UCS4 for direct character access.
   No checks are performed; use PyUnicode_KIND() beforehand to make sure
   the chosen macro matches the string's kind. */

#define PyUnicode_1BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS1*, PyUnicode_DATA(op))
#define PyUnicode_2BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS2*, PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS4*, PyUnicode_DATA(op))

278 /* Returns the length of the unicode string. */
PyUnicode_GET_LENGTH(PyObject * op)279 static inline Py_ssize_t PyUnicode_GET_LENGTH(PyObject *op) {
280     return _PyASCIIObject_CAST(op)->length;
281 }
282 #define PyUnicode_GET_LENGTH(op) PyUnicode_GET_LENGTH(_PyObject_CAST(op))
283 
284 /* Write into the canonical representation, this function does not do any sanity
285    checks and is intended for usage in loops.  The caller should cache the
286    kind and data pointers obtained from other function calls.
287    index is the index in the string (starts at 0) and value is the new
288    code point value which should be written to that location. */
PyUnicode_WRITE(int kind,void * data,Py_ssize_t index,Py_UCS4 value)289 static inline void PyUnicode_WRITE(int kind, void *data,
290                                    Py_ssize_t index, Py_UCS4 value)
291 {
292     assert(index >= 0);
293     if (kind == PyUnicode_1BYTE_KIND) {
294         assert(value <= 0xffU);
295         _Py_STATIC_CAST(Py_UCS1*, data)[index] = _Py_STATIC_CAST(Py_UCS1, value);
296     }
297     else if (kind == PyUnicode_2BYTE_KIND) {
298         assert(value <= 0xffffU);
299         _Py_STATIC_CAST(Py_UCS2*, data)[index] = _Py_STATIC_CAST(Py_UCS2, value);
300     }
301     else {
302         assert(kind == PyUnicode_4BYTE_KIND);
303         assert(value <= 0x10ffffU);
304         _Py_STATIC_CAST(Py_UCS4*, data)[index] = value;
305     }
306 }
307 #define PyUnicode_WRITE(kind, data, index, value) \
308     PyUnicode_WRITE(_Py_STATIC_CAST(int, kind), _Py_CAST(void*, data), \
309                     (index), _Py_STATIC_CAST(Py_UCS4, value))
310 
311 /* Read a code point from the string's canonical representation.  No checks
312    or ready calls are performed. */
PyUnicode_READ(int kind,const void * data,Py_ssize_t index)313 static inline Py_UCS4 PyUnicode_READ(int kind,
314                                      const void *data, Py_ssize_t index)
315 {
316     assert(index >= 0);
317     if (kind == PyUnicode_1BYTE_KIND) {
318         return _Py_STATIC_CAST(const Py_UCS1*, data)[index];
319     }
320     if (kind == PyUnicode_2BYTE_KIND) {
321         return _Py_STATIC_CAST(const Py_UCS2*, data)[index];
322     }
323     assert(kind == PyUnicode_4BYTE_KIND);
324     return _Py_STATIC_CAST(const Py_UCS4*, data)[index];
325 }
326 #define PyUnicode_READ(kind, data, index) \
327     PyUnicode_READ(_Py_STATIC_CAST(int, kind), \
328                    _Py_STATIC_CAST(const void*, data), \
329                    (index))
330 
331 /* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
332    calls PyUnicode_KIND() and might call it twice.  For single reads, use
333    PyUnicode_READ_CHAR, for multiple consecutive reads callers should
334    cache kind and use PyUnicode_READ instead. */
PyUnicode_READ_CHAR(PyObject * unicode,Py_ssize_t index)335 static inline Py_UCS4 PyUnicode_READ_CHAR(PyObject *unicode, Py_ssize_t index)
336 {
337     int kind;
338 
339     assert(index >= 0);
340     // Tolerate reading the NUL character at str[len(str)]
341     assert(index <= PyUnicode_GET_LENGTH(unicode));
342 
343     kind = PyUnicode_KIND(unicode);
344     if (kind == PyUnicode_1BYTE_KIND) {
345         return PyUnicode_1BYTE_DATA(unicode)[index];
346     }
347     if (kind == PyUnicode_2BYTE_KIND) {
348         return PyUnicode_2BYTE_DATA(unicode)[index];
349     }
350     assert(kind == PyUnicode_4BYTE_KIND);
351     return PyUnicode_4BYTE_DATA(unicode)[index];
352 }
353 #define PyUnicode_READ_CHAR(unicode, index) \
354     PyUnicode_READ_CHAR(_PyObject_CAST(unicode), (index))
355 
356 /* Return a maximum character value which is suitable for creating another
357    string based on op.  This is always an approximation but more efficient
358    than iterating over the string. */
PyUnicode_MAX_CHAR_VALUE(PyObject * op)359 static inline Py_UCS4 PyUnicode_MAX_CHAR_VALUE(PyObject *op)
360 {
361     int kind;
362 
363     if (PyUnicode_IS_ASCII(op)) {
364         return 0x7fU;
365     }
366 
367     kind = PyUnicode_KIND(op);
368     if (kind == PyUnicode_1BYTE_KIND) {
369        return 0xffU;
370     }
371     if (kind == PyUnicode_2BYTE_KIND) {
372         return 0xffffU;
373     }
374     assert(kind == PyUnicode_4BYTE_KIND);
375     return 0x10ffffU;
376 }
377 #define PyUnicode_MAX_CHAR_VALUE(op) \
378     PyUnicode_MAX_CHAR_VALUE(_PyObject_CAST(op))
379 
380 
381 /* === Public API ========================================================= */
382 
383 /* With PEP 393, this is the recommended way to allocate a new unicode object.
384    This function will allocate the object and its buffer in a single memory
385    block.  Objects created using this function are not resizable. */
386 PyAPI_FUNC(PyObject*) PyUnicode_New(
387     Py_ssize_t size,            /* Number of code points in the new string */
388     Py_UCS4 maxchar             /* maximum code point value in the string */
389     );
390 
391 /* For backward compatibility */
PyUnicode_READY(PyObject * Py_UNUSED (op))392 static inline int PyUnicode_READY(PyObject* Py_UNUSED(op))
393 {
394     return 0;
395 }
396 #define PyUnicode_READY(op) PyUnicode_READY(_PyObject_CAST(op))
397 
398 /* Copy character from one unicode object into another, this function performs
399    character conversion when necessary and falls back to memcpy() if possible.
400 
401    Fail if to is too small (smaller than *how_many* or smaller than
402    len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
403    kind(to), or if *to* has more than 1 reference.
404 
405    Return the number of written character, or return -1 and raise an exception
406    on error.
407 
408    Pseudo-code:
409 
410        how_many = min(how_many, len(from) - from_start)
411        to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
412        return how_many
413 
414    Note: The function doesn't write a terminating null character.
415    */
416 PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
417     PyObject *to,
418     Py_ssize_t to_start,
419     PyObject *from,
420     Py_ssize_t from_start,
421     Py_ssize_t how_many
422     );
423 
424 /* Fill a string with a character: write fill_char into
425    unicode[start:start+length].
426 
427    Fail if fill_char is bigger than the string maximum character, or if the
428    string has more than 1 reference.
429 
430    Return the number of written character, or return -1 and raise an exception
431    on error. */
432 PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill(
433     PyObject *unicode,
434     Py_ssize_t start,
435     Py_ssize_t length,
436     Py_UCS4 fill_char
437     );
438 
439 /* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
440    Scan the string to find the maximum character. */
441 PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
442     int kind,
443     const void *buffer,
444     Py_ssize_t size);
445 
446 
447 /* --- _PyUnicodeWriter API ----------------------------------------------- */
448 
449 typedef struct {
450     PyObject *buffer;
451     void *data;
452     int kind;
453     Py_UCS4 maxchar;
454     Py_ssize_t size;
455     Py_ssize_t pos;
456 
457     /* minimum number of allocated characters (default: 0) */
458     Py_ssize_t min_length;
459 
460     /* minimum character (default: 127, ASCII) */
461     Py_UCS4 min_char;
462 
463     /* If non-zero, overallocate the buffer (default: 0). */
464     unsigned char overallocate;
465 
466     /* If readonly is 1, buffer is a shared string (cannot be modified)
467        and size is set to 0. */
468     unsigned char readonly;
469 } _PyUnicodeWriter ;
470 
471 // Initialize a Unicode writer.
472 //
473 // By default, the minimum buffer size is 0 character and overallocation is
474 // disabled. Set min_length, min_char and overallocate attributes to control
475 // the allocation of the buffer.
476 PyAPI_FUNC(void)
477 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer);
478 
479 /* Prepare the buffer to write 'length' characters
480    with the specified maximum character.
481 
482    Return 0 on success, raise an exception and return -1 on error. */
483 #define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR)             \
484     (((MAXCHAR) <= (WRITER)->maxchar                                  \
485       && (LENGTH) <= (WRITER)->size - (WRITER)->pos)                  \
486      ? 0                                                              \
487      : (((LENGTH) == 0)                                               \
488         ? 0                                                           \
489         : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR))))
490 
491 /* Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro
492    instead. */
493 PyAPI_FUNC(int)
494 _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
495                                  Py_ssize_t length, Py_UCS4 maxchar);
496 
497 /* Prepare the buffer to have at least the kind KIND.
498    For example, kind=PyUnicode_2BYTE_KIND ensures that the writer will
499    support characters in range U+000-U+FFFF.
500 
501    Return 0 on success, raise an exception and return -1 on error. */
502 #define _PyUnicodeWriter_PrepareKind(WRITER, KIND)                    \
503     ((KIND) <= (WRITER)->kind                                         \
504      ? 0                                                              \
505      : _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND)))
506 
507 /* Don't call this function directly, use the _PyUnicodeWriter_PrepareKind()
508    macro instead. */
509 PyAPI_FUNC(int)
510 _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
511                                      int kind);
512 
513 /* Append a Unicode character.
514    Return 0 on success, raise an exception and return -1 on error. */
515 PyAPI_FUNC(int)
516 _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer,
517     Py_UCS4 ch
518     );
519 
520 /* Append a Unicode string.
521    Return 0 on success, raise an exception and return -1 on error. */
522 PyAPI_FUNC(int)
523 _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer,
524     PyObject *str               /* Unicode string */
525     );
526 
527 /* Append a substring of a Unicode string.
528    Return 0 on success, raise an exception and return -1 on error. */
529 PyAPI_FUNC(int)
530 _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer,
531     PyObject *str,              /* Unicode string */
532     Py_ssize_t start,
533     Py_ssize_t end
534     );
535 
536 /* Append an ASCII-encoded byte string.
537    Return 0 on success, raise an exception and return -1 on error. */
538 PyAPI_FUNC(int)
539 _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
540     const char *str,           /* ASCII-encoded byte string */
541     Py_ssize_t len             /* number of bytes, or -1 if unknown */
542     );
543 
544 /* Append a latin1-encoded byte string.
545    Return 0 on success, raise an exception and return -1 on error. */
546 PyAPI_FUNC(int)
547 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
548     const char *str,           /* latin1-encoded byte string */
549     Py_ssize_t len             /* length in bytes */
550     );
551 
552 /* Get the value of the writer as a Unicode string. Clear the
553    buffer of the writer. Raise an exception and return NULL
554    on error. */
555 PyAPI_FUNC(PyObject *)
556 _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer);
557 
558 /* Deallocate memory of a writer (clear its internal buffer). */
559 PyAPI_FUNC(void)
560 _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer);
561 
562 
563 /* --- Manage the default encoding ---------------------------------------- */
564 
565 /* Returns a pointer to the default encoding (UTF-8) of the
566    Unicode object unicode.
567 
568    Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
569    in the unicodeobject.
570 
571    _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
572    support the previous internal function with the same behaviour.
573 
574    Use of this API is DEPRECATED since no size information can be
575    extracted from the returned data.
576 */
577 
578 PyAPI_FUNC(const char *) PyUnicode_AsUTF8(PyObject *unicode);
579 
580 // Alias kept for backward compatibility
581 #define _PyUnicode_AsString PyUnicode_AsUTF8
582 
583 
584 /* === Characters Type APIs =============================================== */
585 
586 /* These should not be used directly. Use the Py_UNICODE_IS* and
587    Py_UNICODE_TO* macros instead.
588 
589    These APIs are implemented in Objects/unicodectype.c.
590 
591 */
592 
593 PyAPI_FUNC(int) _PyUnicode_IsLowercase(
594     Py_UCS4 ch       /* Unicode character */
595     );
596 
597 PyAPI_FUNC(int) _PyUnicode_IsUppercase(
598     Py_UCS4 ch       /* Unicode character */
599     );
600 
601 PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
602     Py_UCS4 ch       /* Unicode character */
603     );
604 
605 PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
606     const Py_UCS4 ch         /* Unicode character */
607     );
608 
609 PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
610     const Py_UCS4 ch         /* Unicode character */
611     );
612 
613 PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
614     Py_UCS4 ch       /* Unicode character */
615     );
616 
617 PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
618     Py_UCS4 ch       /* Unicode character */
619     );
620 
621 PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
622     Py_UCS4 ch       /* Unicode character */
623     );
624 
625 PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
626     Py_UCS4 ch       /* Unicode character */
627     );
628 
629 PyAPI_FUNC(int) _PyUnicode_ToDigit(
630     Py_UCS4 ch       /* Unicode character */
631     );
632 
633 PyAPI_FUNC(double) _PyUnicode_ToNumeric(
634     Py_UCS4 ch       /* Unicode character */
635     );
636 
637 PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
638     Py_UCS4 ch       /* Unicode character */
639     );
640 
641 PyAPI_FUNC(int) _PyUnicode_IsDigit(
642     Py_UCS4 ch       /* Unicode character */
643     );
644 
645 PyAPI_FUNC(int) _PyUnicode_IsNumeric(
646     Py_UCS4 ch       /* Unicode character */
647     );
648 
649 PyAPI_FUNC(int) _PyUnicode_IsPrintable(
650     Py_UCS4 ch       /* Unicode character */
651     );
652 
653 PyAPI_FUNC(int) _PyUnicode_IsAlpha(
654     Py_UCS4 ch       /* Unicode character */
655     );
656 
657 // Helper array used by Py_UNICODE_ISSPACE().
658 PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
659 
660 // Since splitting on whitespace is an important use case, and
661 // whitespace in most situations is solely ASCII whitespace, we
662 // optimize for the common case by using a quick look-up table
663 // _Py_ascii_whitespace (see below) with an inlined check.
Py_UNICODE_ISSPACE(Py_UCS4 ch)664 static inline int Py_UNICODE_ISSPACE(Py_UCS4 ch) {
665     if (ch < 128) {
666         return _Py_ascii_whitespace[ch];
667     }
668     return _PyUnicode_IsWhitespace(ch);
669 }
670 
// Character classification and conversion macros; each one simply forwards
// to the matching _PyUnicode_* function.
#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)

#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)

#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)

// Return non-zero if ch is alphabetic, a decimal, a digit or numeric
// according to the corresponding Py_UNICODE_IS* checks, 0 otherwise.
static inline int Py_UNICODE_ISALNUM(Py_UCS4 ch) {
    return (Py_UNICODE_ISALPHA(ch)
            || Py_UNICODE_ISDECIMAL(ch)
            || Py_UNICODE_ISDIGIT(ch)
            || Py_UNICODE_ISNUMERIC(ch));
}


699 /* === Misc functions ===================================================== */
700 
701 // Return an interned Unicode object for an Identifier; may fail if there is no
702 // memory.
703 PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);
704