• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #ifndef Py_UNICODEOBJECT_H
2 #define Py_UNICODEOBJECT_H
3 
4 #include <stdarg.h>
5 
6 /*
7 
8 Unicode implementation based on original code by Fredrik Lundh,
9 modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
10 Unicode Integration Proposal. (See
11 http://www.egenix.com/files/python/unicode-proposal.txt).
12 
13 Copyright (c) Corporation for National Research Initiatives.
14 
15 
16  Original header:
17  --------------------------------------------------------------------
18 
19  * Yet another Unicode string type for Python.  This type supports the
20  * 16-bit Basic Multilingual Plane (BMP) only.
21  *
22  * Written by Fredrik Lundh, January 1999.
23  *
24  * Copyright (c) 1999 by Secret Labs AB.
25  * Copyright (c) 1999 by Fredrik Lundh.
26  *
27  * fredrik@pythonware.com
28  * http://www.pythonware.com
29  *
30  * --------------------------------------------------------------------
31  * This Unicode String Type is
32  *
33  * Copyright (c) 1999 by Secret Labs AB
34  * Copyright (c) 1999 by Fredrik Lundh
35  *
36  * By obtaining, using, and/or copying this software and/or its
37  * associated documentation, you agree that you have read, understood,
38  * and will comply with the following terms and conditions:
39  *
40  * Permission to use, copy, modify, and distribute this software and its
41  * associated documentation for any purpose and without fee is hereby
42  * granted, provided that the above copyright notice appears in all
43  * copies, and that both that copyright notice and this permission notice
44  * appear in supporting documentation, and that the name of Secret Labs
45  * AB or the author not be used in advertising or publicity pertaining to
46  * distribution of the software without specific, written prior
47  * permission.
48  *
49  * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50  * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51  * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52  * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55  * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56  * -------------------------------------------------------------------- */
57 
58 #include <ctype.h>
59 
60 /* === Internal API ======================================================= */
61 
62 /* --- Internal Unicode Format -------------------------------------------- */
63 
64 /* Python 3.x requires unicode */
65 #define Py_USING_UNICODE
66 
67 #ifndef SIZEOF_WCHAR_T
68 #error Must define SIZEOF_WCHAR_T
69 #endif
70 
71 #define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72 
73 /* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74    Otherwise, Unicode strings are stored as UCS-2 (with limited support
75    for UTF-16) */
76 
77 #if Py_UNICODE_SIZE >= 4
78 #define Py_UNICODE_WIDE
79 #endif
80 
81 /* Set these flags if the platform has "wchar.h" and the
82    wchar_t type is a 16-bit unsigned type */
83 /* #define HAVE_WCHAR_H */
84 /* #define HAVE_USABLE_WCHAR_T */
85 
86 /* If the compiler provides a wchar_t type we try to support it
87    through the interface functions PyUnicode_FromWideChar(),
88    PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
89 
90 #ifdef HAVE_USABLE_WCHAR_T
91 # ifndef HAVE_WCHAR_H
92 #  define HAVE_WCHAR_H
93 # endif
94 #endif
95 
96 #ifdef HAVE_WCHAR_H
97 #  include <wchar.h>
98 #endif
99 
100 /* Py_UCS4, Py_UCS2 and Py_UCS1 are typedefs for the respective
101    unicode representations (unsigned 32-, 16- and 8-bit integers).
   NOTE(review): none of the #include lines visible in this header
   obviously provides the fixed-width integer types; presumably the
   including file supplies <stdint.h> first -- confirm. */
102 typedef uint32_t Py_UCS4;
103 typedef uint16_t Py_UCS2;
104 typedef uint8_t Py_UCS1;
105 
106 #ifdef __cplusplus
107 extern "C" {
108 #endif
109 
110 
/* Type objects exported as data. PyUnicode_Type is the type that
   PyUnicode_CheckExact() below compares against.
   NOTE(review): PyUnicodeIter_Type is presumably the type of the
   iterator over Unicode objects -- confirm. */
111 PyAPI_DATA(PyTypeObject) PyUnicode_Type;
112 PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
113 
/* Type checks for Unicode objects.

   PyUnicode_Check(op) passes the type of op (Py_TYPE(op)) to
   PyType_FastSubclass() together with Py_TPFLAGS_UNICODE_SUBCLASS, so it
   is expected to accept Unicode objects and instances of subtypes.

   PyUnicode_CheckExact(op) uses Py_IS_TYPE() against &PyUnicode_Type, so
   it is expected to accept exact Unicode objects only, not subtypes.

   Neither macro visibly guards against a NULL op; presumably op must be a
   valid object pointer -- TODO confirm. */
114 #define PyUnicode_Check(op) \
115                  PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
116 #define PyUnicode_CheckExact(op) Py_IS_TYPE(op, &PyUnicode_Type)
117 
118 /* --- Constants ---------------------------------------------------------- */
119 
120 /* This Unicode character will be used as replacement character during
121    decoding if the errors argument is set to "replace". Note: the
122    Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
123    Unicode 3.0. */
124 
125 #define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
126 
127 /* === Public API ========================================================= */
128 
129 /* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
130 PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
131     const char *u,             /* UTF-8 encoded string */
132     Py_ssize_t size            /* size of buffer */
133     );
134 
135 /* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
136    UTF-8 encoded bytes.  The size is determined with strlen(). */
137 PyAPI_FUNC(PyObject*) PyUnicode_FromString(
138     const char *u              /* UTF-8 encoded string */
139     );
140 
141 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
/* Only declared when Py_LIMITED_API is undefined or at least 0x03030000.

   NOTE(review): undocumented in this header. Going by the function and
   parameter names, this presumably returns the part of str between the
   character indices start and end -- confirm the index semantics
   (inclusive/exclusive, negative values) against the C API docs. */
142 PyAPI_FUNC(PyObject*) PyUnicode_Substring(
143     PyObject *str,
144     Py_ssize_t start,
145     Py_ssize_t end);
146 #endif
147 
148 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
149 /* Copy the string into a UCS4 buffer including the null character if copy_null
150    is set. Return NULL and raise an exception on error. Raise a SystemError if
151    the buffer is smaller than the string. Return buffer on success.
152 
153    buflen is the length of the buffer in (Py_UCS4) characters. */
154 PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
155     PyObject *unicode,
156     Py_UCS4* buffer,
157     Py_ssize_t buflen,
158     int copy_null);
159 
160 /* Copy the string into a UCS4 buffer. A new buffer is allocated using
161    PyMem_Malloc; if this fails, NULL is returned with a memory error
162    exception set. */
163 PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
164 #endif
165 
166 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
167 /* Get the length of the Unicode object. */
168 
169 PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
170     PyObject *unicode
171 );
172 #endif
173 
174 /* Get the number of Py_UNICODE units in the
175    string representation. */
176 
177 Py_DEPRECATED(3.3) PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
178     PyObject *unicode           /* Unicode object */
179     );
180 
181 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
182 /* Read a character from the string. */
183 
184 PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
185     PyObject *unicode,
186     Py_ssize_t index
187     );
188 
189 /* Write a character to the string. The string must have been created through
190    PyUnicode_New, must not be shared, and must not have been hashed yet.
191 
192    Return 0 on success, -1 on error. */
193 
194 PyAPI_FUNC(int) PyUnicode_WriteChar(
195     PyObject *unicode,
196     Py_ssize_t index,
197     Py_UCS4 character
198     );
199 #endif
200 
201 /* Resize a Unicode object. The length is the number of characters, except
202    if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length
203    is the number of Py_UNICODE characters.
204 
205    *unicode is modified to point to the new (resized) object and 0
206    returned on success.
207 
208    Try to resize the string in place (which is usually faster than allocating
209    a new string and copy characters), or create a new string.
210 
211    Error handling is implemented as follows: an exception is set, -1
212    is returned and *unicode left untouched.
213 
214    WARNING: The function doesn't check string content, the result may not be a
215             string in canonical representation. */
216 
217 PyAPI_FUNC(int) PyUnicode_Resize(
218     PyObject **unicode,         /* Pointer to the Unicode object */
219     Py_ssize_t length           /* New length */
220     );
221 
222 /* Decode obj to a Unicode object.
223 
224    bytes, bytearray and other bytes-like objects are decoded according to the
225    given encoding and error handler. The encoding and error handler can be
226    NULL to have the interface use UTF-8 and "strict".
227 
228    All other objects (including Unicode objects) raise an exception.
229 
230    The API returns NULL in case of an error. The caller is responsible
231    for decref'ing the returned objects.
232 
233 */
234 
235 PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
236     PyObject *obj,              /* Object */
237     const char *encoding,       /* encoding */
238     const char *errors          /* error handling */
239     );
240 
241 /* Copy an instance of a Unicode subtype to a new true Unicode object if
242    necessary. If obj is already a true Unicode object (not a subtype), return
243    the reference with *incremented* refcount.
244 
245    The API returns NULL in case of an error. The caller is responsible
246    for decref'ing the returned objects.
247 
248 */
249 
250 PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
251     PyObject *obj      /* Object */
252     );
253 
/* Build a Unicode object from an ASCII-encoded format string plus
   arguments. PyUnicode_FromFormatV() takes the arguments as a va_list,
   PyUnicode_FromFormat() takes them as variadic arguments.

   NOTE(review): the set of supported format codes is not described in
   this header; presumably printf-like -- confirm against the C API
   documentation before relying on a particular conversion. */
254 PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
255     const char *format,   /* ASCII-encoded string  */
256     va_list vargs
257     );
258 PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
259     const char *format,   /* ASCII-encoded string  */
260     ...
261     );
262 
/* String interning API.

   PyUnicode_InternInPlace() and PyUnicode_InternImmortal() return void and
   take a pointer to an object reference. NOTE(review): presumably the
   reference is replaced in place by the interned object -- confirm
   ownership and refcount rules against the C API documentation.

   PyUnicode_InternFromString() takes a UTF-8 encoded C string and returns
   an object pointer. */
263 PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
264 PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
265 PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
266     const char *u              /* UTF-8 encoded string */
267     );
268 
269 /* Use only if you know it's a string: the macro casts op to
   PyASCIIObject* without any type check and reads the interned field
   of its state member. */
270 #define PyUnicode_CHECK_INTERNED(op) \
271     (((PyASCIIObject *)(op))->state.interned)
272 
273 /* --- wchar_t support for platforms which support it --------------------- */
274 
275 #ifdef HAVE_WCHAR_H
276 
277 /* Create a Unicode Object from the wchar_t buffer w of the given
278    size.
279 
280    The buffer is copied into the new object. */
281 
282 PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
283     const wchar_t *w,           /* wchar_t buffer */
284     Py_ssize_t size             /* size of buffer */
285     );
286 
287 /* Copies the Unicode Object contents into the wchar_t buffer w.  At
288    most size wchar_t characters are copied.
289 
290    Note that the resulting wchar_t string may or may not be
291    0-terminated.  It is the responsibility of the caller to make sure
292    that the wchar_t string is 0-terminated in case this is required by
293    the application.
294 
295    Returns the number of wchar_t characters copied (excluding a
296    possibly trailing 0-termination character) or -1 in case of an
297    error. */
298 
299 PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
300     PyObject *unicode,          /* Unicode object */
301     wchar_t *w,                 /* wchar_t buffer */
302     Py_ssize_t size             /* size of buffer */
303     );
304 
305 /* Convert the Unicode object to a wide character string. The output string
306    always ends with a nul character. If size is not NULL, write the number of
307    wide characters (excluding the null character) into *size.
308 
309    Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
310    on success. On error, returns NULL and raises a MemoryError; *size is
311    undefined in that case. */
312 
313 PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
314     PyObject *unicode,          /* Unicode object */
315     Py_ssize_t *size            /* number of characters of the result */
316     );
317 
318 #endif
319 
320 /* --- Unicode ordinals --------------------------------------------------- */
321 
322 /* Create a Unicode Object from the given Unicode code point ordinal.
323 
324    The ordinal must be in range(0x110000). A ValueError is
325    raised in case it is not.
326 
327 */
328 
329 PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
330 
331 /* === Builtin Codecs =====================================================
332 
333    Many of these APIs take two arguments encoding and errors. These
334    parameters encoding and errors have the same semantics as the ones
335    of the builtin str() API.
336 
337    Setting encoding to NULL causes the default encoding (UTF-8) to be used.
338 
339    Error handling is set by errors which may also be set to NULL
340    meaning to use the default handling defined for the codec. Default
341    error handling for all builtin codecs is "strict" (ValueErrors are
342    raised).
343 
344    The codecs all use a similar interface. Only deviation from the
345    generic ones are documented.
346 
347 */
348 
349 /* --- Manage the default encoding ---------------------------------------- */
350 
351 /* Returns "utf-8".  */
352 PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
353 
354 /* --- Generic Codecs ----------------------------------------------------- */
355 
356 /* Create a Unicode object by decoding the encoded string s of the
357    given size. */
358 
359 PyAPI_FUNC(PyObject*) PyUnicode_Decode(
360     const char *s,              /* encoded string */
361     Py_ssize_t size,            /* size of buffer */
362     const char *encoding,       /* encoding */
363     const char *errors          /* error handling */
364     );
365 
366 /* Decode a Unicode object unicode and return the result as Python
367    object.
368 
369    This API is DEPRECATED. The only supported standard encoding is rot13.
370    Use PyCodec_Decode() to decode with rot13 and non-standard codecs
371    that decode from str. */
372 
373 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
374     PyObject *unicode,          /* Unicode object */
375     const char *encoding,       /* encoding */
376     const char *errors          /* error handling */
377     );
378 
379 /* Decode a Unicode object unicode and return the result as Unicode
380    object.
381 
382    This API is DEPRECATED. The only supported standard encoding is rot13.
383    Use PyCodec_Decode() to decode with rot13 and non-standard codecs
384    that decode from str to str. */
385 
386 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
387     PyObject *unicode,          /* Unicode object */
388     const char *encoding,       /* encoding */
389     const char *errors          /* error handling */
390     );
391 
392 /* Encodes a Unicode object and returns the result as Python
393    object.
394 
395    This API is DEPRECATED.  It is superseded by PyUnicode_AsEncodedString()
396    since all standard encodings (except rot13) encode str to bytes.
397    Use PyCodec_Encode() for encoding with rot13 and non-standard codecs
398    that encode from str to non-bytes. */
399 
400 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
401     PyObject *unicode,          /* Unicode object */
402     const char *encoding,       /* encoding */
403     const char *errors          /* error handling */
404     );
405 
406 /* Encodes a Unicode object and returns the result as Python bytes
407    object. */
408 
409 PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
410     PyObject *unicode,          /* Unicode object */
411     const char *encoding,       /* encoding */
412     const char *errors          /* error handling */
413     );
414 
415 /* Encodes a Unicode object and returns the result as Unicode
416    object.
417 
418    This API is DEPRECATED.  The only supported standard encoding is rot13.
419    Use PyCodec_Encode() to encode with rot13 and non-standard codecs
420    that encode from str to str. */
421 
422 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
423     PyObject *unicode,          /* Unicode object */
424     const char *encoding,       /* encoding */
425     const char *errors          /* error handling */
426     );
427 
428 /* Build an encoding map. */
429 
430 PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
431     PyObject* string            /* 256 character map */
432    );
433 
434 /* --- UTF-7 Codecs ------------------------------------------------------- */
435 
/* Decode length bytes of the UTF-7 encoded buffer string. errors follows
   the convention in the "Builtin Codecs" comment earlier in this header
   (NULL selects the codec's default handling). NOTE(review): presumably
   returns a new Unicode object, or NULL with an exception set on error
   -- confirm against the C API documentation. */
436 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
437     const char *string,         /* UTF-7 encoded string */
438     Py_ssize_t length,          /* size of string */
439     const char *errors          /* error handling */
440     );
441 
/* Variant of PyUnicode_DecodeUTF7() with an extra consumed output
   parameter (bytes consumed). NOTE(review): presumably, when consumed is
   non-NULL, an incomplete trailing sequence is not treated as an error
   and *consumed receives the number of bytes decoded -- confirm. */
442 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
443     const char *string,         /* UTF-7 encoded string */
444     Py_ssize_t length,          /* size of string */
445     const char *errors,         /* error handling */
446     Py_ssize_t *consumed        /* bytes consumed */
447     );
448 
449 /* --- UTF-8 Codecs ------------------------------------------------------- */
450 
/* Decode length bytes of the UTF-8 encoded buffer string. errors follows
   the convention in the "Builtin Codecs" comment earlier in this header
   (NULL selects the codec's default handling). NOTE(review): presumably
   returns a new Unicode object, or NULL with an exception set on error
   -- confirm against the C API documentation. */
451 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
452     const char *string,         /* UTF-8 encoded string */
453     Py_ssize_t length,          /* size of string */
454     const char *errors          /* error handling */
455     );
456 
/* Variant of PyUnicode_DecodeUTF8() with an extra consumed output
   parameter (bytes consumed). NOTE(review): presumably, when consumed is
   non-NULL, an incomplete trailing sequence is not treated as an error
   and *consumed receives the number of bytes decoded -- confirm. */
457 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
458     const char *string,         /* UTF-8 encoded string */
459     Py_ssize_t length,          /* size of string */
460     const char *errors,         /* error handling */
461     Py_ssize_t *consumed        /* bytes consumed */
462     );
463 
/* Encode the Unicode object unicode using UTF-8. NOTE(review): presumably
   returns a new bytes object, or NULL with an exception set on error --
   confirm against the C API documentation. */
464 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
465     PyObject *unicode           /* Unicode object */
466     );
467 
468 /* --- UTF-32 Codecs ------------------------------------------------------ */
469 
470 /* Decodes length bytes from a UTF-32 encoded buffer string and returns
471    the corresponding Unicode object.
472 
473    errors (if non-NULL) defines the error handling. It defaults
474    to "strict".
475 
476    If byteorder is non-NULL, the decoder starts decoding using the
477    given byte order:
478 
479     *byteorder == -1: little endian
480     *byteorder == 0:  native order
481     *byteorder == 1:  big endian
482 
483    In native mode, the first four bytes of the stream are checked for a
484    BOM mark. If found, the BOM mark is analysed, the byte order
485    adjusted and the BOM skipped.  In the other modes, no BOM mark
486    interpretation is done. After completion, *byteorder is set to the
487    current byte order at the end of input data.
488 
489    If byteorder is NULL, the codec starts in native order mode.
490 
491 */
492 
493 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
494     const char *string,         /* UTF-32 encoded string */
495     Py_ssize_t length,          /* size of string */
496     const char *errors,         /* error handling */
497     int *byteorder              /* pointer to byteorder to use
498                                    0=native;-1=LE,1=BE; updated on
499                                    exit */
500     );
501 
502 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
503     const char *string,         /* UTF-32 encoded string */
504     Py_ssize_t length,          /* size of string */
505     const char *errors,         /* error handling */
506     int *byteorder,             /* pointer to byteorder to use
507                                    0=native;-1=LE,1=BE; updated on
508                                    exit */
509     Py_ssize_t *consumed        /* bytes consumed */
510     );
511 
512 /* Returns a Python string using the UTF-32 encoding in native byte
513    order. The string always starts with a BOM mark.  */
514 
515 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
516     PyObject *unicode           /* Unicode object */
517     );
518 
519 /* Returns a Python string object holding the UTF-32 encoded value of
520    the Unicode data.
521 
522    If byteorder is not 0, output is written according to the following
523    byte order:
524 
525    byteorder == -1: little endian
526    byteorder == 0:  native byte order (writes a BOM mark)
527    byteorder == 1:  big endian
528 
529    If byteorder is 0, the output string will always start with the
530    Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
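531    prepended.
532 
   NOTE(review): no declaration follows this comment in this header; the
   text appears to describe a UTF-32 encoder taking a byteorder argument
   that is declared elsewhere -- confirm, and keep the comment next to
   that declaration.
533 */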
534 
535 /* --- UTF-16 Codecs ------------------------------------------------------ */
536 
537 /* Decodes length bytes from a UTF-16 encoded buffer string and returns
538    the corresponding Unicode object.
539 
540    errors (if non-NULL) defines the error handling. It defaults
541    to "strict".
542 
543    If byteorder is non-NULL, the decoder starts decoding using the
544    given byte order:
545 
546     *byteorder == -1: little endian
547     *byteorder == 0:  native order
548     *byteorder == 1:  big endian
549 
550    In native mode, the first two bytes of the stream are checked for a
551    BOM mark. If found, the BOM mark is analysed, the byte order
552    adjusted and the BOM skipped.  In the other modes, no BOM mark
553    interpretation is done. After completion, *byteorder is set to the
554    current byte order at the end of input data.
555 
556    If byteorder is NULL, the codec starts in native order mode.
557 
558 */
559 
560 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
561     const char *string,         /* UTF-16 encoded string */
562     Py_ssize_t length,          /* size of string */
563     const char *errors,         /* error handling */
564     int *byteorder              /* pointer to byteorder to use
565                                    0=native;-1=LE,1=BE; updated on
566                                    exit */
567     );
568 
569 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
570     const char *string,         /* UTF-16 encoded string */
571     Py_ssize_t length,          /* size of string */
572     const char *errors,         /* error handling */
573     int *byteorder,             /* pointer to byteorder to use
574                                    0=native;-1=LE,1=BE; updated on
575                                    exit */
576     Py_ssize_t *consumed        /* bytes consumed */
577     );
578 
579 /* Returns a Python string using the UTF-16 encoding in native byte
580    order. The string always starts with a BOM mark.  */
581 
582 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
583     PyObject *unicode           /* Unicode object */
584     );
585 
586 /* --- Unicode-Escape Codecs ---------------------------------------------- */
587 
/* Decode length bytes of the Unicode-Escape encoded buffer string. errors
   follows the convention in the "Builtin Codecs" comment earlier in this
   header. NOTE(review): presumably returns a new Unicode object, or NULL
   with an exception set on error -- confirm. */
588 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
589     const char *string,         /* Unicode-Escape encoded string */
590     Py_ssize_t length,          /* size of string */
591     const char *errors          /* error handling */
592     );
593 
/* Encode the Unicode object unicode using Unicode-Escape. NOTE(review):
   presumably returns a new bytes object, or NULL on error -- confirm. */
594 PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
595     PyObject *unicode           /* Unicode object */
596     );
597 
598 /* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
599 
/* Decode length bytes of the Raw-Unicode-Escape encoded buffer string.
   errors follows the convention in the "Builtin Codecs" comment earlier in
   this header. NOTE(review): presumably returns a new Unicode object, or
   NULL with an exception set on error -- confirm. */
600 PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
601     const char *string,         /* Raw-Unicode-Escape encoded string */
602     Py_ssize_t length,          /* size of string */
603     const char *errors          /* error handling */
604     );
605 
/* Encode the Unicode object unicode using Raw-Unicode-Escape.
   NOTE(review): presumably returns a new bytes object, or NULL on error
   -- confirm. */
606 PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
607     PyObject *unicode           /* Unicode object */
608     );
609 
610 /* --- Latin-1 Codecs -----------------------------------------------------
611 
612    Note: Latin-1 corresponds to the first 256 Unicode ordinals. */
613 
/* Decode length bytes of the Latin-1 encoded buffer string. errors follows
   the convention in the "Builtin Codecs" comment earlier in this header.
   NOTE(review): presumably returns a new Unicode object, or NULL with an
   exception set on error -- confirm. */
614 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
615     const char *string,         /* Latin-1 encoded string */
616     Py_ssize_t length,          /* size of string */
617     const char *errors          /* error handling */
618     );
619 
/* Encode the Unicode object unicode using Latin-1. NOTE(review):
   presumably returns a new bytes object, or NULL on error (e.g. for
   ordinals outside the first 256) -- confirm. */
620 PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
621     PyObject *unicode           /* Unicode object */
622     );
623 
624 /* --- ASCII Codecs -------------------------------------------------------
625 
626    Only 7-bit ASCII data is accepted. All other codes generate errors.
627 
628 */
629 
/* Decode length bytes of the ASCII encoded buffer string. errors follows
   the convention in the "Builtin Codecs" comment earlier in this header.
   NOTE(review): presumably returns a new Unicode object, or NULL with an
   exception set on error -- confirm. */
630 PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
631     const char *string,         /* ASCII encoded string */
632     Py_ssize_t length,          /* size of string */
633     const char *errors          /* error handling */
634     );
635 
/* Encode the Unicode object unicode using ASCII. NOTE(review): presumably
   returns a new bytes object, or NULL with an exception set on error --
   confirm. */
636 PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
637     PyObject *unicode           /* Unicode object */
638     );
639 
640 /* --- Character Map Codecs -----------------------------------------------
641 
642    This codec uses mappings to encode and decode characters.
643 
644    Decoding mappings must map byte ordinals (integers in the range from 0 to
645    255) to Unicode strings, integers (which are then interpreted as Unicode
646    ordinals) or None.  Unmapped data bytes (ones which cause a LookupError)
647    as well as mapped to None, 0xFFFE or '\ufffe' are treated as "undefined
648    mapping" and cause an error.
649 
650    Encoding mappings must map Unicode ordinal integers to bytes objects,
651    integers in the range from 0 to 255 or None.  Unmapped character
652    ordinals (ones which cause a LookupError) as well as mapped to
653    None are treated as "undefined mapping" and cause an error.
654 
655 */
656 
657 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
658     const char *string,         /* Encoded string */
659     Py_ssize_t length,          /* size of string */
660     PyObject *mapping,          /* decoding mapping */
661     const char *errors          /* error handling */
662     );
663 
664 PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
665     PyObject *unicode,          /* Unicode object */
666     PyObject *mapping           /* encoding mapping */
667     );
668 
669 /* --- MBCS codecs for Windows -------------------------------------------- */
670 
671 #ifdef MS_WINDOWS
672 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
673     const char *string,         /* MBCS encoded string */
674     Py_ssize_t length,          /* size of string */
675     const char *errors          /* error handling */
676     );
677 
678 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
679     const char *string,         /* MBCS encoded string */
680     Py_ssize_t length,          /* size of string */
681     const char *errors,         /* error handling */
682     Py_ssize_t *consumed        /* bytes consumed */
683     );
684 
685 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
686 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
687     int code_page,              /* code page number */
688     const char *string,         /* encoded string */
689     Py_ssize_t length,          /* size of string */
690     const char *errors,         /* error handling */
691     Py_ssize_t *consumed        /* bytes consumed */
692     );
693 #endif
694 
695 PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
696     PyObject *unicode           /* Unicode object */
697     );
698 
699 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
700 PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
701     int code_page,              /* code page number */
702     PyObject *unicode,          /* Unicode object */
703     const char *errors          /* error handling */
704     );
705 #endif
706 
707 #endif /* MS_WINDOWS */
708 
709 /* --- Locale encoding --------------------------------------------------- */
710 
711 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
712 /* Decode a string from the current locale encoding. The decoder is strict if
713    *errors* is "strict" (or NULL); if *errors* is "surrogateescape" it uses the
714    'surrogateescape' error handler (PEP 383) to escape undecodable bytes. If a
715    byte sequence can be decoded as a surrogate character and the
716    'surrogateescape' error handler is used, the byte sequence is escaped
717    instead of being decoded. *str* must end with a null character but cannot
718    contain embedded null characters. */
719 
720 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
721     const char *str,
722     Py_ssize_t len,
723     const char *errors);
724 
725 /* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string
726    length using strlen(). */
727 
728 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
729     const char *str,
730     const char *errors);
731 
732 /* Encode a Unicode object to the current locale encoding. The encoder is
733    strict if *errors* is "strict" (or NULL); if *errors* is "surrogateescape"
734    the "surrogateescape" error handler is used. Return a bytes object. The string
735    cannot contain embedded null characters. */
736 
737 PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
738     PyObject *unicode,
739     const char *errors
740     );
741 #endif
742 
743 /* --- File system encoding ---------------------------------------------- */
744 
745 /* ParseTuple converter: encode str objects to bytes using
746    PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
747 
748 PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
749 
750 /* ParseTuple converter: decode bytes objects to unicode using
751    PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
752 
753 PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
754 
755 /* Decode a null-terminated string using Py_FileSystemDefaultEncoding
756    and the "surrogateescape" error handler.
757 
758    If Py_FileSystemDefaultEncoding is not set, fall back to the locale
759    encoding.
760 
761    Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
762 */
763 
764 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
765     const char *s               /* encoded string */
766     );
767 
768 /* Decode a string using Py_FileSystemDefaultEncoding
769    and the "surrogateescape" error handler.
770 
771    If Py_FileSystemDefaultEncoding is not set, fall back to the locale
772    encoding.
773 */
774 
775 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
776     const char *s,               /* encoded string */
777     Py_ssize_t size              /* size */
778     );
779 
780 /* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
781    "surrogateescape" error handler, and return bytes.
782 
783    If Py_FileSystemDefaultEncoding is not set, fall back to the locale
784    encoding.
785 */
786 
787 PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
788     PyObject *unicode
789     );
790 
791 /* --- Methods & Slots ----------------------------------------------------
792 
793    These are capable of handling Unicode objects and strings on input
794    (we refer to them as strings in the descriptions) and return
795    Unicode objects or integers as appropriate. */
796 
797 /* Concat two strings giving a new Unicode string. */
798 
799 PyAPI_FUNC(PyObject*) PyUnicode_Concat(
800     PyObject *left,             /* Left string */
801     PyObject *right             /* Right string */
802     );
803 
804 /* Concat two strings and put the result in *pleft
805    (sets *pleft to NULL on error) */
806 
807 PyAPI_FUNC(void) PyUnicode_Append(
808     PyObject **pleft,           /* Pointer to left string */
809     PyObject *right             /* Right string */
810     );
811 
812 /* Concat two strings, put the result in *pleft and drop the right object
813    (sets *pleft to NULL on error) */
814 
815 PyAPI_FUNC(void) PyUnicode_AppendAndDel(
816     PyObject **pleft,           /* Pointer to left string */
817     PyObject *right             /* Right string */
818     );
819 
820 /* Split a string, giving a list of Unicode strings.
821 
822    If sep is NULL, splitting is done at all whitespace substrings.
823    Otherwise, splits occur at the given separator.
824 
825    At most maxsplit splits are done. If maxsplit is negative, no limit is set.
826 
827    Separators are not included in the resulting list.
828 
829 */
830 
831 PyAPI_FUNC(PyObject*) PyUnicode_Split(
832     PyObject *s,                /* String to split */
833     PyObject *sep,              /* String separator, or NULL for whitespace */
834     Py_ssize_t maxsplit         /* Maxsplit count; negative means no limit */
835     );
836 
837 /* Split a string at line breaks, giving a list of Unicode strings.
838 
839    CRLF is considered to be one line break. Line breaks are not
840    included in the resulting list unless keepends is true. */
841 
842 PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
843     PyObject *s,                /* String to split */
844     int keepends                /* If true, line end markers are included */
845     );
846 
847 /* Partition the string s using the given separator sep. */
848 
849 PyAPI_FUNC(PyObject*) PyUnicode_Partition(
850     PyObject *s,                /* String to partition */
851     PyObject *sep               /* String separator */
852     );
853 
854 /* Partition the string s using the given separator sep, searching from
855    the end of the string. */
856 
857 PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
858     PyObject *s,                /* String to partition */
859     PyObject *sep               /* String separator */
860     );
861 
862 /* Split a string, giving a list of Unicode strings.
863 
864    If sep is NULL, splitting is done at all whitespace substrings.
865    Otherwise, splits occur at the given separator.
866 
867    At most maxsplit splits are done; if maxsplit is negative, no limit
868    is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from the end
869    of the string.
870 
871    Separators are not included in the resulting list.
872 
873 */
874 
875 PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
876     PyObject *s,                /* String to split */
877     PyObject *sep,              /* String separator, or NULL for whitespace */
878     Py_ssize_t maxsplit         /* Maxsplit count; negative means no limit */
879     );
880 
881 /* Translate a string by applying a character mapping table to it and
882    return the resulting Unicode object.
883 
884    The mapping table must map Unicode ordinal integers to Unicode strings,
885    Unicode ordinal integers, or None (which deletes the character).
886 
887    Mapping tables may be dictionaries or sequences. Unmapped character
888    ordinals (those whose lookup causes a LookupError) are left untouched
889    and are copied as-is.
890 
891 */
892 
893 PyAPI_FUNC(PyObject *) PyUnicode_Translate(
894     PyObject *str,              /* String */
895     PyObject *table,            /* Translation table (dictionary or sequence) */
896     const char *errors          /* Error handling */
897     );
898 
899 /* Join a sequence of strings using the given separator and return
900    the resulting Unicode string. */
901 
902 PyAPI_FUNC(PyObject*) PyUnicode_Join(
903     PyObject *separator,        /* Separator string */
904     PyObject *seq               /* Sequence of strings to join */
905     );
906 
907 /* Return 1 if substr matches str[start:end] at the tail end selected by
908    direction, 0 otherwise. NOTE(review): error return not documented here. */
909 
910 PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
911     PyObject *str,              /* String */
912     PyObject *substr,           /* Prefix or Suffix string */
913     Py_ssize_t start,           /* Start index */
914     Py_ssize_t end,             /* Stop index */
915     int direction               /* Tail end: -1 prefix, +1 suffix */
916     );
917 
918 /* Return the first position of substr in str[start:end] using the
919    given search direction, or -1 if not found. -2 is returned if
920    an error occurred and an exception is set. */
921 
922 PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
923     PyObject *str,              /* String */
924     PyObject *substr,           /* Substring to find */
925     Py_ssize_t start,           /* Start index */
926     Py_ssize_t end,             /* Stop index */
927     int direction               /* Find direction: +1 forward, -1 backward */
928     );
929 
930 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
931 /* Like PyUnicode_Find, but search for a single character only. */
932 PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
933     PyObject *str,              /* String */
934     Py_UCS4 ch,                 /* Character to find */
935     Py_ssize_t start,           /* Start index */
936     Py_ssize_t end,             /* Stop index */
937     int direction               /* Find direction, as in PyUnicode_Find */
938     );
939 #endif
940 
941 /* Count the occurrences of substr within the slice str[start:end]. */
942 
943 PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
944     PyObject *str,              /* String */
945     PyObject *substr,           /* Substring to count */
946     Py_ssize_t start,           /* Start index */
947     Py_ssize_t end              /* Stop index */
948     );
949 
950 /* Replace at most maxcount occurrences of substr in str with replstr
951    and return the resulting Unicode object. */
952 
953 PyAPI_FUNC(PyObject *) PyUnicode_Replace(
954     PyObject *str,              /* String */
955     PyObject *substr,           /* Substring to find */
956     PyObject *replstr,          /* Replacement string */
957     Py_ssize_t maxcount         /* Max. number of replacements to apply;
958                                    -1 = all */
959     );
960 
961 /* Compare two strings and return -1, 0, 1 for less than, equal, and
962    greater than, respectively.
963    On error, raise an exception and return -1 (same value as less than). */
964 
965 PyAPI_FUNC(int) PyUnicode_Compare(
966     PyObject *left,             /* Left string */
967     PyObject *right             /* Right string */
968     );
969 
970 /* Compare a Unicode object with a C string and return -1, 0, 1 for less
971    than, equal, and greater than, respectively.  It is best to pass only
972    ASCII-encoded strings, but the function interprets the input string as
973    ISO-8859-1 if it contains non-ASCII characters.
974    This function does not raise exceptions. */
975 
976 PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
977     PyObject *left,             /* Unicode object */
978     const char *right           /* C string; ASCII-encoded is best */
979     );
980 
981 /* Rich-compare two strings and return one of the following:
982 
983    - NULL if an exception was raised
984    - Py_True or Py_False for successful comparisons
985    - Py_NotImplemented if the type combination is unknown
986 
987    Possible values for op:
988 
989      Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
990 
991 */
992 
993 PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
994     PyObject *left,             /* Left string */
995     PyObject *right,            /* Right string */
996     int op                      /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
997     );
998 
999 /* Apply the arguments in args (a tuple or dictionary) to a format string
1000    and return the resulting Unicode string. */
1001 
1002 PyAPI_FUNC(PyObject *) PyUnicode_Format(
1003     PyObject *format,           /* Format string */
1004     PyObject *args              /* Argument tuple or dictionary */
1005     );
1006 
1007 /* Check whether element is contained in container and return 1 or 0
1008    accordingly.
1009 
1010    element has to coerce to a one element Unicode string. -1 is
1011    returned in case of an error. */
1012 
1013 PyAPI_FUNC(int) PyUnicode_Contains(
1014     PyObject *container,        /* Container string */
1015     PyObject *element           /* Element string */
1016     );
1017 
1018 /* Check whether the argument s is a valid identifier. */
1019 
1020 PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1021 
1022 /* === Characters Type APIs =============================================== */
1023 
1024 #ifndef Py_LIMITED_API
1025 #  define Py_CPYTHON_UNICODEOBJECT_H
1026 #  include  "cpython/unicodeobject.h"
1027 #  undef Py_CPYTHON_UNICODEOBJECT_H
1028 #endif
1029 
1030 #ifdef __cplusplus
1031 }
1032 #endif
1033 #endif /* !Py_UNICODEOBJECT_H */
1034