• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #ifndef Py_UNICODEOBJECT_H
2 #define Py_UNICODEOBJECT_H
3 
4 /*
5 
6 Unicode implementation based on original code by Fredrik Lundh,
7 modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
8 Unicode Integration Proposal. (See
9 http://www.egenix.com/files/python/unicode-proposal.txt).
10 
11 Copyright (c) Corporation for National Research Initiatives.
12 
13 
14  Original header:
15  --------------------------------------------------------------------
16 
17  * Yet another Unicode string type for Python.  This type supports the
18  * 16-bit Basic Multilingual Plane (BMP) only.
19  *
20  * Written by Fredrik Lundh, January 1999.
21  *
22  * Copyright (c) 1999 by Secret Labs AB.
23  * Copyright (c) 1999 by Fredrik Lundh.
24  *
25  * fredrik@pythonware.com
26  * http://www.pythonware.com
27  *
28  * --------------------------------------------------------------------
29  * This Unicode String Type is
30  *
31  * Copyright (c) 1999 by Secret Labs AB
32  * Copyright (c) 1999 by Fredrik Lundh
33  *
34  * By obtaining, using, and/or copying this software and/or its
35  * associated documentation, you agree that you have read, understood,
36  * and will comply with the following terms and conditions:
37  *
38  * Permission to use, copy, modify, and distribute this software and its
39  * associated documentation for any purpose and without fee is hereby
40  * granted, provided that the above copyright notice appears in all
41  * copies, and that both that copyright notice and this permission notice
42  * appear in supporting documentation, and that the name of Secret Labs
43  * AB or the author not be used in advertising or publicity pertaining to
44  * distribution of the software without specific, written prior
45  * permission.
46  *
47  * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
48  * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
49  * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
50  * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
51  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
52  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
53  * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
54  * -------------------------------------------------------------------- */
55 
56 /* === Internal API ======================================================= */
57 
58 /* --- Internal Unicode Format -------------------------------------------- */
59 
60 /* Python 3.x requires unicode */
61 #define Py_USING_UNICODE
62 
63 #ifndef SIZEOF_WCHAR_T
64 #error Must define SIZEOF_WCHAR_T
65 #endif
66 
67 #define Py_UNICODE_SIZE SIZEOF_WCHAR_T
68 
69 /* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
70    Otherwise, Unicode strings are stored as UCS-2 (with limited support
71    for UTF-16) */
72 
73 #if Py_UNICODE_SIZE >= 4
74 #define Py_UNICODE_WIDE
75 #endif
76 
77 /* Set these flags if the platform has "wchar.h" and the
78    wchar_t type is a 16-bit unsigned type */
79 /* #define HAVE_WCHAR_H */
80 /* #define HAVE_USABLE_WCHAR_T */
81 
82 /* If the compiler provides a wchar_t type we try to support it
83    through the interface functions PyUnicode_FromWideChar(),
84    PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
85 
86 #ifdef HAVE_USABLE_WCHAR_T
87 # ifndef HAVE_WCHAR_H
88 #  define HAVE_WCHAR_H
89 # endif
90 #endif
91 
92 /* Py_UCS4, Py_UCS2 and Py_UCS1 are the 32-, 16- and 8-bit unsigned
93    integer typedefs for the respective unicode representations. */
94 typedef uint32_t Py_UCS4;
95 typedef uint16_t Py_UCS2;
96 typedef uint8_t Py_UCS1;
97 
98 #ifdef __cplusplus
99 extern "C" {
100 #endif
101 
102 
103 PyAPI_DATA(PyTypeObject) PyUnicode_Type;
104 PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
105 
/* Type checks for Unicode objects.
   PyUnicode_Check(op) tests the type of op for the
   Py_TPFLAGS_UNICODE_SUBCLASS flag, so it presumably also accepts
   subclasses of PyUnicode_Type (confirm against PyType_FastSubclass).
   PyUnicode_CheckExact(op) compares the type of op against
   PyUnicode_Type itself via Py_IS_TYPE. */
106 #define PyUnicode_Check(op) \
107     PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
108 #define PyUnicode_CheckExact(op) Py_IS_TYPE((op), &PyUnicode_Type)
109 
110 /* --- Constants ---------------------------------------------------------- */
111 
112 /* This Unicode character will be used as replacement character during
113    decoding if the errors argument is set to "replace". Note: the
114    Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
115    Unicode 3.0. */
116 
117 #define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
118 
119 /* === Public API ========================================================= */
120 
121 /* Create a Unicode object from size bytes of the UTF-8 encoded buffer u. */
122 PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
123     const char *u,             /* UTF-8 encoded string */
124     Py_ssize_t size            /* size of buffer */
125     );
126 
127 /* Create a Unicode object from the null-terminated UTF-8 encoded
128    bytes u.  The size is determined with strlen(). */
129 PyAPI_FUNC(PyObject*) PyUnicode_FromString(
130     const char *u              /* UTF-8 encoded string */
131     );
132 
133 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
/* Return the part of str selected by the character indices start and end.
   NOTE(review): presumably start is inclusive and end exclusive, as in
   str[start:end] -- confirm against the C API documentation. */
134 PyAPI_FUNC(PyObject*) PyUnicode_Substring(
135     PyObject *str,
136     Py_ssize_t start,
137     Py_ssize_t end);
138 #endif
139 
140 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
141 /* Copy the string into a UCS4 buffer including the null character if copy_null
142    is set. Return NULL and raise an exception on error. Raise a SystemError if
143    the buffer is smaller than the string. Return buffer on success.
144 
145    buflen is the length of the buffer in (Py_UCS4) characters. */
146 PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
147     PyObject *unicode,
148     Py_UCS4* buffer,
149     Py_ssize_t buflen,
150     int copy_null);
151 
152 /* Copy the string into a UCS4 buffer. A new buffer is allocated using
153    PyMem_Malloc; if this fails, NULL is returned with a memory error
154    exception set. */
155 PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
156 #endif
157 
158 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
159 /* Get the length of the Unicode object. */
160 
161 PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
162     PyObject *unicode
163 );
164 #endif
165 
166 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
167 /* Read a character from the string. */
168 
169 PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
170     PyObject *unicode,
171     Py_ssize_t index
172     );
173 
174 /* Write a character to the string. The string must have been created through
175    PyUnicode_New, must not be shared, and must not have been hashed yet.
176 
177    Return 0 on success, -1 on error. */
178 
179 PyAPI_FUNC(int) PyUnicode_WriteChar(
180     PyObject *unicode,
181     Py_ssize_t index,
182     Py_UCS4 character
183     );
184 #endif
185 
186 /* Resize a Unicode object. The length is the number of codepoints.
187 
188    *unicode is modified to point to the new (resized) object and 0
189    returned on success.
190 
191    Try to resize the string in place (which is usually faster than allocating
192    a new string and copy characters), or create a new string.
193 
194    Error handling is implemented as follows: an exception is set, -1
195    is returned and *unicode left untouched.
196 
197    WARNING: The function doesn't check string content, the result may not be a
198             string in canonical representation. */
199 
200 PyAPI_FUNC(int) PyUnicode_Resize(
201     PyObject **unicode,         /* Pointer to the Unicode object */
202     Py_ssize_t length           /* New length */
203     );
204 
205 /* Decode obj to a Unicode object.
206 
207    bytes, bytearray and other bytes-like objects are decoded according to the
208    given encoding and error handler. The encoding and error handler can be
209    NULL to have the interface use UTF-8 and "strict".
210 
211    All other objects (including Unicode objects) raise an exception.
212 
213    The API returns NULL in case of an error. The caller is responsible
214    for decref'ing the returned objects.
215 
216 */
217 
218 PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
219     PyObject *obj,              /* Object */
220     const char *encoding,       /* encoding */
221     const char *errors          /* error handling */
222     );
223 
224 /* Copy an instance of a Unicode subtype to a new true Unicode object if
225    necessary. If obj is already a true Unicode object (not a subtype), return
226    the reference with *incremented* refcount.
227 
228    The API returns NULL in case of an error. The caller is responsible
229    for decref'ing the returned objects.
230 
231 */
232 
233 PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
234     PyObject *obj      /* Object */
235     );
236 
/* Create a Unicode object from an ASCII-encoded format string and its
   arguments.  PyUnicode_FromFormatV() takes the arguments as a va_list,
   PyUnicode_FromFormat() takes them as variadic arguments.
   NOTE(review): the set of supported format conversions is not visible
   here -- see the C API documentation. */
237 PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
238     const char *format,   /* ASCII-encoded string  */
239     va_list vargs
240     );
241 PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
242     const char *format,   /* ASCII-encoded string  */
243     ...
244     );
245 
/* String interning.  PyUnicode_InternInPlace() takes a pointer to an object
   reference (presumably replacing it with the interned string -- confirm);
   PyUnicode_InternFromString() takes a UTF-8 encoded C string. */
246 PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
247 PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
248     const char *u              /* UTF-8 encoded string */
249     );
250 
251 /* --- wchar_t support for platforms which support it --------------------- */
252 
253 #ifdef HAVE_WCHAR_H
254 
255 /* Create a Unicode Object from the wchar_t buffer w of the given
256    size.
257 
258    The buffer is copied into the new object. */
259 
260 PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
261     const wchar_t *w,           /* wchar_t buffer */
262     Py_ssize_t size             /* size of buffer */
263     );
264 
265 /* Copies the Unicode Object contents into the wchar_t buffer w.  At
266    most size wchar_t characters are copied.
267 
268    Note that the resulting wchar_t string may or may not be
269    0-terminated.  It is the responsibility of the caller to make sure
270    that the wchar_t string is 0-terminated in case this is required by
271    the application.
272 
273    Returns the number of wchar_t characters copied (excluding a
274    possibly trailing 0-termination character) or -1 in case of an
275    error. */
276 
277 PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
278     PyObject *unicode,          /* Unicode object */
279     wchar_t *w,                 /* wchar_t buffer */
280     Py_ssize_t size             /* size of buffer */
281     );
282 
283 /* Convert the Unicode object to a wide character string. The output string
284    always ends with a nul character. If size is not NULL, write the number of
285    wide characters (excluding the null character) into *size.
286 
287    Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
288    on success. On error, returns NULL, *size is undefined and raises a
289    MemoryError. */
290 
291 PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
292     PyObject *unicode,          /* Unicode object */
293     Py_ssize_t *size            /* number of characters of the result */
294     );
295 
296 #endif
297 
298 /* --- Unicode ordinals --------------------------------------------------- */
299 
300 /* Create a Unicode Object from the given Unicode code point ordinal.
301 
302    The ordinal must be in range(0x110000). A ValueError is
303    raised in case it is not.
304 
305 */
306 
307 PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
308 
309 /* === Builtin Codecs =====================================================
310 
311    Many of these APIs take two arguments encoding and errors. These
312    parameters encoding and errors have the same semantics as the ones
313    of the builtin str() API.
314 
315    Setting encoding to NULL causes the default encoding (UTF-8) to be used.
316 
317    Error handling is set by errors which may also be set to NULL
318    meaning to use the default handling defined for the codec. Default
319    error handling for all builtin codecs is "strict" (ValueErrors are
320    raised).
321 
322    The codecs all use a similar interface. Only deviations from the
323    generic ones are documented.
324 
325 */
326 
327 /* --- Manage the default encoding ---------------------------------------- */
328 
329 /* Returns "utf-8".  */
330 PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
331 
332 /* --- Generic Codecs ----------------------------------------------------- */
333 
334 /* Create a Unicode object by decoding the encoded string s of the
335    given size. */
336 
337 PyAPI_FUNC(PyObject*) PyUnicode_Decode(
338     const char *s,              /* encoded string */
339     Py_ssize_t size,            /* size of buffer */
340     const char *encoding,       /* encoding */
341     const char *errors          /* error handling */
342     );
343 
344 /* Decode a Unicode object unicode and return the result as Python
345    object.
346 
347    This API is DEPRECATED. The only supported standard encoding is rot13.
348    Use PyCodec_Decode() to decode with rot13 and non-standard codecs
349    that decode from str. */
350 
351 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
352     PyObject *unicode,          /* Unicode object */
353     const char *encoding,       /* encoding */
354     const char *errors          /* error handling */
355     );
356 
357 /* Decode a Unicode object unicode and return the result as Unicode
358    object.
359 
360    This API is DEPRECATED. The only supported standard encoding is rot13.
361    Use PyCodec_Decode() to decode with rot13 and non-standard codecs
362    that decode from str to str. */
363 
364 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
365     PyObject *unicode,          /* Unicode object */
366     const char *encoding,       /* encoding */
367     const char *errors          /* error handling */
368     );
369 
370 /* Encodes a Unicode object and returns the result as Python
371    object.
372 
373    This API is DEPRECATED.  It is superseded by PyUnicode_AsEncodedString()
374    since all standard encodings (except rot13) encode str to bytes.
375    Use PyCodec_Encode() for encoding with rot13 and non-standard codecs
376    that encode from str to non-bytes. */
377 
378 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
379     PyObject *unicode,          /* Unicode object */
380     const char *encoding,       /* encoding */
381     const char *errors          /* error handling */
382     );
383 
384 /* Encodes a Unicode object and returns the result as Python bytes
385    object. */
386 
387 PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
388     PyObject *unicode,          /* Unicode object */
389     const char *encoding,       /* encoding */
390     const char *errors          /* error handling */
391     );
392 
393 /* Encodes a Unicode object and returns the result as Unicode
394    object.
395 
396    This API is DEPRECATED.  The only supported standard encoding is rot13.
397    Use PyCodec_Encode() to encode with rot13 and non-standard codecs
398    that encode from str to str. */
399 
400 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
401     PyObject *unicode,          /* Unicode object */
402     const char *encoding,       /* encoding */
403     const char *errors          /* error handling */
404     );
405 
406 /* Build an encoding map. */
407 
408 PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
409     PyObject* string            /* 256 character map */
410    );
411 
412 /* --- UTF-7 Codecs ------------------------------------------------------- */
413 
414 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
415     const char *string,         /* UTF-7 encoded string */
416     Py_ssize_t length,          /* size of string */
417     const char *errors          /* error handling */
418     );
419 
420 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
421     const char *string,         /* UTF-7 encoded string */
422     Py_ssize_t length,          /* size of string */
423     const char *errors,         /* error handling */
424     Py_ssize_t *consumed        /* bytes consumed */
425     );
426 
427 /* --- UTF-8 Codecs ------------------------------------------------------- */
428 
429 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
430     const char *string,         /* UTF-8 encoded string */
431     Py_ssize_t length,          /* size of string */
432     const char *errors          /* error handling */
433     );
434 
435 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
436     const char *string,         /* UTF-8 encoded string */
437     Py_ssize_t length,          /* size of string */
438     const char *errors,         /* error handling */
439     Py_ssize_t *consumed        /* bytes consumed */
440     );
441 
442 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
443     PyObject *unicode           /* Unicode object */
444     );
445 
446 /* Returns a pointer to the default encoding (UTF-8) of the
447    Unicode object unicode and the size of the encoded representation
448    in bytes stored in *size.
449 
450    In case of an error, no *size is set.
451 
452    This function caches the UTF-8 encoded string in the unicodeobject
453    and subsequent calls will return the same string.  The memory is released
454    when the unicodeobject is deallocated.
455 */
456 
457 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030A0000
458 PyAPI_FUNC(const char *) PyUnicode_AsUTF8AndSize(
459     PyObject *unicode,
460     Py_ssize_t *size);
461 #endif
462 
463 /* --- UTF-32 Codecs ------------------------------------------------------ */
464 
465 /* Decodes length bytes from a UTF-32 encoded buffer string and returns
466    the corresponding Unicode object.
467 
468    errors (if non-NULL) defines the error handling. It defaults
469    to "strict".
470 
471    If byteorder is non-NULL, the decoder starts decoding using the
472    given byte order:
473 
474     *byteorder == -1: little endian
475     *byteorder == 0:  native order
476     *byteorder == 1:  big endian
477 
478    In native mode, the first four bytes of the stream are checked for a
479    BOM mark. If found, the BOM mark is analysed, the byte order
480    adjusted and the BOM skipped.  In the other modes, no BOM mark
481    interpretation is done. After completion, *byteorder is set to the
482    current byte order at the end of input data.
483 
484    If byteorder is NULL, the codec starts in native order mode.
485 
486 */
487 
488 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
489     const char *string,         /* UTF-32 encoded string */
490     Py_ssize_t length,          /* size of string */
491     const char *errors,         /* error handling */
492     int *byteorder              /* pointer to byteorder to use
493                                    0=native;-1=LE,1=BE; updated on
494                                    exit */
495     );
496 
497 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
498     const char *string,         /* UTF-32 encoded string */
499     Py_ssize_t length,          /* size of string */
500     const char *errors,         /* error handling */
501     int *byteorder,             /* pointer to byteorder to use
502                                    0=native;-1=LE,1=BE; updated on
503                                    exit */
504     Py_ssize_t *consumed        /* bytes consumed */
505     );
506 
507 /* Returns a Python bytes object using the UTF-32 encoding in native byte
508    order. The output always starts with a BOM mark.  */
509 
510 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
511     PyObject *unicode           /* Unicode object */
512     );
513 
514 /* Returns a Python string object holding the UTF-32 encoded value of
515    the Unicode data.
516 
517    If byteorder is not 0, output is written according to the following
518    byte order:
519 
520    byteorder == -1: little endian
521    byteorder == 0:  native byte order (writes a BOM mark)
522    byteorder == 1:  big endian
523 
524    If byteorder is 0, the output string will always start with the
525    Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
526    prepended.
527 
528 */
529 
530 /* --- UTF-16 Codecs ------------------------------------------------------ */
531 
532 /* Decodes length bytes from a UTF-16 encoded buffer string and returns
533    the corresponding Unicode object.
534 
535    errors (if non-NULL) defines the error handling. It defaults
536    to "strict".
537 
538    If byteorder is non-NULL, the decoder starts decoding using the
539    given byte order:
540 
541     *byteorder == -1: little endian
542     *byteorder == 0:  native order
543     *byteorder == 1:  big endian
544 
545    In native mode, the first two bytes of the stream are checked for a
546    BOM mark. If found, the BOM mark is analysed, the byte order
547    adjusted and the BOM skipped.  In the other modes, no BOM mark
548    interpretation is done. After completion, *byteorder is set to the
549    current byte order at the end of input data.
550 
551    If byteorder is NULL, the codec starts in native order mode.
552 
553 */
554 
555 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
556     const char *string,         /* UTF-16 encoded string */
557     Py_ssize_t length,          /* size of string */
558     const char *errors,         /* error handling */
559     int *byteorder              /* pointer to byteorder to use
560                                    0=native;-1=LE,1=BE; updated on
561                                    exit */
562     );
563 
564 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
565     const char *string,         /* UTF-16 encoded string */
566     Py_ssize_t length,          /* size of string */
567     const char *errors,         /* error handling */
568     int *byteorder,             /* pointer to byteorder to use
569                                    0=native;-1=LE,1=BE; updated on
570                                    exit */
571     Py_ssize_t *consumed        /* bytes consumed */
572     );
573 
574 /* Returns a Python bytes object using the UTF-16 encoding in native byte
575    order. The output always starts with a BOM mark.  */
576 
577 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
578     PyObject *unicode           /* Unicode object */
579     );
580 
581 /* --- Unicode-Escape Codecs ---------------------------------------------- */
582 
583 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
584     const char *string,         /* Unicode-Escape encoded string */
585     Py_ssize_t length,          /* size of string */
586     const char *errors          /* error handling */
587     );
588 
589 PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
590     PyObject *unicode           /* Unicode object */
591     );
592 
593 /* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
594 
595 PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
596     const char *string,         /* Raw-Unicode-Escape encoded string */
597     Py_ssize_t length,          /* size of string */
598     const char *errors          /* error handling */
599     );
600 
601 PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
602     PyObject *unicode           /* Unicode object */
603     );
604 
605 /* --- Latin-1 Codecs -----------------------------------------------------
606 
607    Note: Latin-1 corresponds to the first 256 Unicode ordinals. */
608 
609 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
610     const char *string,         /* Latin-1 encoded string */
611     Py_ssize_t length,          /* size of string */
612     const char *errors          /* error handling */
613     );
614 
615 PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
616     PyObject *unicode           /* Unicode object */
617     );
618 
619 /* --- ASCII Codecs -------------------------------------------------------
620 
621    Only 7-bit ASCII data is expected. All other codes generate errors.
622 
623 */
624 
625 PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
626     const char *string,         /* ASCII encoded string */
627     Py_ssize_t length,          /* size of string */
628     const char *errors          /* error handling */
629     );
630 
631 PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
632     PyObject *unicode           /* Unicode object */
633     );
634 
635 /* --- Character Map Codecs -----------------------------------------------
636 
637    This codec uses mappings to encode and decode characters.
638 
639    Decoding mappings must map byte ordinals (integers in the range from 0 to
640    255) to Unicode strings, integers (which are then interpreted as Unicode
641    ordinals) or None.  Unmapped data bytes (ones which cause a LookupError)
642    as well as mapped to None, 0xFFFE or '\ufffe' are treated as "undefined
643    mapping" and cause an error.
644 
645    Encoding mappings must map Unicode ordinal integers to bytes objects,
646    integers in the range from 0 to 255 or None.  Unmapped character
647    ordinals (ones which cause a LookupError) as well as mapped to
648    None are treated as "undefined mapping" and cause an error.
649 
650 */
651 
652 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
653     const char *string,         /* Encoded string */
654     Py_ssize_t length,          /* size of string */
655     PyObject *mapping,          /* decoding mapping */
656     const char *errors          /* error handling */
657     );
658 
659 PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
660     PyObject *unicode,          /* Unicode object */
661     PyObject *mapping           /* encoding mapping */
662     );
663 
664 /* --- MBCS codecs for Windows -------------------------------------------- */
665 
666 #ifdef MS_WINDOWS
667 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
668     const char *string,         /* MBCS encoded string */
669     Py_ssize_t length,          /* size of string */
670     const char *errors          /* error handling */
671     );
672 
673 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
674     const char *string,         /* MBCS encoded string */
675     Py_ssize_t length,          /* size of string */
676     const char *errors,         /* error handling */
677     Py_ssize_t *consumed        /* bytes consumed */
678     );
679 
680 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
681 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
682     int code_page,              /* code page number */
683     const char *string,         /* encoded string */
684     Py_ssize_t length,          /* size of string */
685     const char *errors,         /* error handling */
686     Py_ssize_t *consumed        /* bytes consumed */
687     );
688 #endif
689 
690 PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
691     PyObject *unicode           /* Unicode object */
692     );
693 
694 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
695 PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
696     int code_page,              /* code page number */
697     PyObject *unicode,          /* Unicode object */
698     const char *errors          /* error handling */
699     );
700 #endif
701 
702 #endif /* MS_WINDOWS */
703 
704 /* --- Locale encoding --------------------------------------------------- */
705 
706 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
707 /* Decode a string from the current locale encoding. The error handler is
708    selected by *errors*: "strict" or "surrogateescape" (PEP 383, used to
709    escape undecodable bytes); NULL is treated as "strict". If a byte sequence
710    can be decoded as a surrogate character and the 'surrogateescape' error
711    handler is selected, the byte sequence is escaped using that error handler
712    instead of being decoded. *str* must end with a null character but cannot
713    contain embedded null characters. */
714 
715 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
716     const char *str,
717     Py_ssize_t len,
718     const char *errors);
719 
720 /* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string
721    length using strlen(). */
722 
723 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
724     const char *str,
725     const char *errors);
726 
727 /* Encode a Unicode object to the current locale encoding. The error handler
728    is selected by *errors*: "strict" or "surrogateescape" (NULL is treated as
729    "strict"). Return a bytes object. The string cannot contain embedded null
730    characters. */
731 
732 PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
733     PyObject *unicode,
734     const char *errors
735     );
736 #endif
737 
738 /* --- File system encoding ---------------------------------------------- */
739 
740 /* ParseTuple converter: encode str objects to bytes using
741    PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
742 
743 PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
744 
745 /* ParseTuple converter: decode bytes objects to unicode using
746    PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
747 
748 PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
749 
750 /* Decode a null-terminated string from the Python filesystem encoding
751    and error handler.
752 
753    If the string length is known, use PyUnicode_DecodeFSDefaultAndSize(). */
754 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
755     const char *s               /* encoded string */
756     );
757 
758 /* Decode a string from the Python filesystem encoding and error handler. */
759 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
760     const char *s,               /* encoded string */
761     Py_ssize_t size              /* size */
762     );
763 
764 /* Encode a Unicode object to the Python filesystem encoding and error handler.
765    Return bytes. */
766 PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
767     PyObject *unicode
768     );
769 
770 /* --- Methods & Slots ----------------------------------------------------
771 
772    These are capable of handling Unicode objects and strings on input
773    (we refer to them as strings in the descriptions) and return
774    Unicode objects or integers as appropriate. */
775 
776 /* Concat two strings giving a new Unicode string. */
777 
778 PyAPI_FUNC(PyObject*) PyUnicode_Concat(
779     PyObject *left,             /* Left string */
780     PyObject *right             /* Right string */
781     );
782 
783 /* Concat two strings and put the result in *pleft
784    (sets *pleft to NULL on error) */
785 
786 PyAPI_FUNC(void) PyUnicode_Append(
787     PyObject **pleft,           /* Pointer to left string */
788     PyObject *right             /* Right string */
789     );
790 
791 /* Concat two strings, put the result in *pleft and drop the right object
792    (sets *pleft to NULL on error) */
793 
794 PyAPI_FUNC(void) PyUnicode_AppendAndDel(
795     PyObject **pleft,           /* Pointer to left string */
796     PyObject *right             /* Right string */
797     );
798 
799 /* Split a string giving a list of Unicode strings.
800 
801    If sep is NULL, splitting will be done at all whitespace
802    substrings. Otherwise, splits occur at the given separator.
803 
804    At most maxsplit splits will be done. If negative, no limit is set.
805 
806    Separators are not included in the resulting list.
807 
808 */
809 
810 PyAPI_FUNC(PyObject*) PyUnicode_Split(
811     PyObject *s,                /* String to split */
812     PyObject *sep,              /* String separator */
813     Py_ssize_t maxsplit         /* Maxsplit count */
814     );
815 
816 /* Ditto, but split at line breaks.
817 
818    CRLF is considered to be one line break. Line breaks are not
819    included in the resulting list unless keepends is nonzero. */
820 
821 PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
822     PyObject *s,                /* String to split */
823     int keepends                /* If true, line end markers are included */
824     );
825 
826 /* Partition a string using a given separator. */
827 
828 PyAPI_FUNC(PyObject*) PyUnicode_Partition(
829     PyObject *s,                /* String to partition */
830     PyObject *sep               /* String separator */
831     );
832 
833 /* Partition a string using a given separator, searching from the end of the
834    string. */
835 
836 PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
837     PyObject *s,                /* String to partition */
838     PyObject *sep               /* String separator */
839     );
840 
841 /* Split a string giving a list of Unicode strings.
842 
843    If sep is NULL, splitting will be done at all whitespace
844    substrings. Otherwise, splits occur at the given separator.
845 
846    At most maxsplit splits will be done; if maxsplit is negative, no
847    limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
848    the end of the string.
849 
850    Separators are not included in the resulting list.
851 
852 */
853 
854 PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
855     PyObject *s,                /* String to split */
856     PyObject *sep,              /* String separator */
857     Py_ssize_t maxsplit         /* Maxsplit count */
858     );
859 
860 /* Translate a string by applying a character mapping table to it and
861    return the resulting Unicode object.
862 
863    The mapping table must map Unicode ordinal integers to Unicode strings,
864    Unicode ordinal integers or None (causing deletion of the character).
865 
866    Mapping tables may be dictionaries or sequences. Unmapped character
867    ordinals (ones which cause a LookupError) are left untouched and
868    are copied as-is.
869 
870 */
871 
872 PyAPI_FUNC(PyObject *) PyUnicode_Translate(
873     PyObject *str,              /* String */
874     PyObject *table,            /* Translate table */
875     const char *errors          /* error handling */
876     );
877 
878 /* Join a sequence of strings using the given separator and return
879    the resulting Unicode string. */
880 
881 PyAPI_FUNC(PyObject*) PyUnicode_Join(
882     PyObject *separator,        /* Separator string */
883     PyObject *seq               /* Sequence object */
884     );
885 
886 /* Return 1 if substr matches str[start:end] at the given tail end, 0
887    otherwise. Return -1 if an error occurred. */
888 
889 PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
890     PyObject *str,              /* String */
891     PyObject *substr,           /* Prefix or Suffix string */
892     Py_ssize_t start,           /* Start index */
893     Py_ssize_t end,             /* Stop index */
894     int direction               /* Tail end: -1 prefix, +1 suffix */
895     );
896 
897 /* Return the first position of substr in str[start:end] using the
898    given search direction or -1 if not found. -2 is returned in case
899    an error occurred and an exception is set. */
900 
901 PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
902     PyObject *str,              /* String */
903     PyObject *substr,           /* Substring to find */
904     Py_ssize_t start,           /* Start index */
905     Py_ssize_t end,             /* Stop index */
906     int direction               /* Find direction: +1 forward, -1 backward */
907     );
908 
909 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
910 /* Like PyUnicode_Find, but search for single character only. */
911 PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
912     PyObject *str,              /* String */
913     Py_UCS4 ch,                 /* Character to find */
914     Py_ssize_t start,           /* Start index */
915     Py_ssize_t end,             /* Stop index */
916     int direction               /* Find direction: +1 forward, -1 backward */
917     );
918 #endif
919 
920 /* Count the number of occurrences of substr in str[start:end]. */
921 
922 PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
923     PyObject *str,              /* String */
924     PyObject *substr,           /* Substring to count */
925     Py_ssize_t start,           /* Start index */
926     Py_ssize_t end              /* Stop index */
927     );
928 
929 /* Replace at most maxcount occurrences of substr in str with replstr
930    and return the resulting Unicode object. */
931 
932 PyAPI_FUNC(PyObject *) PyUnicode_Replace(
933     PyObject *str,              /* String */
934     PyObject *substr,           /* Substring to find */
935     PyObject *replstr,          /* Replacement string */
936     Py_ssize_t maxcount         /* Max. number of replacements to apply;
937                                    -1 = all */
938     );
939 
940 /* Compare two strings and return -1, 0, 1 for less than, equal,
941    greater than resp.
942    Raise an exception and return -1 on error. */
943 
944 PyAPI_FUNC(int) PyUnicode_Compare(
945     PyObject *left,             /* Left string */
946     PyObject *right             /* Right string */
947     );
948 
949 /* Compare a Unicode object with C string and return -1, 0, 1 for less than,
950    equal, and greater than, respectively.  It is best to pass only
951    ASCII-encoded strings, but the function interprets the input string as
952    ISO-8859-1 if it contains non-ASCII characters.
953    This function does not raise exceptions. */
954 
955 PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
956     PyObject *left,             /* Unicode object */
957     const char *right           /* ASCII-encoded string */
958     );
959 
960 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030D0000
961 /* Compare a Unicode object with UTF-8 encoded C string.
962    Return 1 if they are equal, or 0 otherwise.
963    This function does not raise exceptions. */
964 
965 PyAPI_FUNC(int) PyUnicode_EqualToUTF8(PyObject *, const char *);
966 PyAPI_FUNC(int) PyUnicode_EqualToUTF8AndSize(PyObject *, const char *, Py_ssize_t);
967 #endif
968 
969 /* Rich compare two strings and return one of the following:
970 
971    - NULL in case an exception was raised
972    - Py_True or Py_False for successful comparisons
973    - Py_NotImplemented in case the type combination is unknown
974 
975    Possible values for op:
976 
977      Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
978 
979 */
980 
981 PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
982     PyObject *left,             /* Left string */
983     PyObject *right,            /* Right string */
984     int op                      /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
985     );
986 
987 /* Apply an argument tuple or dictionary to a format string and return
988    the resulting Unicode string. */
989 
990 PyAPI_FUNC(PyObject *) PyUnicode_Format(
991     PyObject *format,           /* Format string */
992     PyObject *args              /* Argument tuple or dictionary */
993     );
994 
995 /* Checks whether element is contained in container and returns 1/0
996    accordingly.
997 
998    element has to coerce to a one element Unicode string. -1 is
999    returned in case of an error. */
1000 
1001 PyAPI_FUNC(int) PyUnicode_Contains(
1002     PyObject *container,        /* Container string */
1003     PyObject *element           /* Element string */
1004     );
1005 
1006 /* Checks whether argument is a valid identifier. */
1007 
1008 PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1009 
1010 /* === Characters Type APIs =============================================== */
1011 
1012 #ifndef Py_LIMITED_API
1013 #  define Py_CPYTHON_UNICODEOBJECT_H
1014 #  include "cpython/unicodeobject.h"
1015 #  undef Py_CPYTHON_UNICODEOBJECT_H
1016 #endif
1017 
1018 #ifdef __cplusplus
1019 }
1020 #endif
1021 #endif /* !Py_UNICODEOBJECT_H */
1022