• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #ifndef Py_UNICODEOBJECT_H
2 #define Py_UNICODEOBJECT_H
3 
4 #include <stdarg.h>
5 
6 /*
7 
8 Unicode implementation based on original code by Fredrik Lundh,
9 modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
10 Unicode Integration Proposal. (See
11 http://www.egenix.com/files/python/unicode-proposal.txt).
12 
13 Copyright (c) Corporation for National Research Initiatives.
14 
15 
16  Original header:
17  --------------------------------------------------------------------
18 
19  * Yet another Unicode string type for Python.  This type supports the
20  * 16-bit Basic Multilingual Plane (BMP) only.
21  *
22  * Written by Fredrik Lundh, January 1999.
23  *
24  * Copyright (c) 1999 by Secret Labs AB.
25  * Copyright (c) 1999 by Fredrik Lundh.
26  *
27  * fredrik@pythonware.com
28  * http://www.pythonware.com
29  *
30  * --------------------------------------------------------------------
31  * This Unicode String Type is
32  *
33  * Copyright (c) 1999 by Secret Labs AB
34  * Copyright (c) 1999 by Fredrik Lundh
35  *
36  * By obtaining, using, and/or copying this software and/or its
37  * associated documentation, you agree that you have read, understood,
38  * and will comply with the following terms and conditions:
39  *
40  * Permission to use, copy, modify, and distribute this software and its
41  * associated documentation for any purpose and without fee is hereby
42  * granted, provided that the above copyright notice appears in all
43  * copies, and that both that copyright notice and this permission notice
44  * appear in supporting documentation, and that the name of Secret Labs
45  * AB or the author not be used in advertising or publicity pertaining to
46  * distribution of the software without specific, written prior
47  * permission.
48  *
49  * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50  * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51  * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52  * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55  * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56  * -------------------------------------------------------------------- */
57 
58 #include <ctype.h>
59 
60 /* === Internal API ======================================================= */
61 
62 /* --- Internal Unicode Format -------------------------------------------- */
63 
64 /* Python 3.x requires unicode */
65 #define Py_USING_UNICODE
66 
67 #ifndef SIZEOF_WCHAR_T
68 #error Must define SIZEOF_WCHAR_T
69 #endif
70 
71 #define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72 
73 /* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74    Otherwise, Unicode strings are stored as UCS-2 (with limited support
75    for UTF-16) */
76 
77 #if Py_UNICODE_SIZE >= 4
78 #define Py_UNICODE_WIDE
79 #endif
80 
81 /* Set these flags if the platform has "wchar.h" and the
82    wchar_t type is a 16-bit unsigned type */
83 /* #define HAVE_WCHAR_H */
84 /* #define HAVE_USABLE_WCHAR_T */
85 
86 /* If the compiler provides a wchar_t type we try to support it
87    through the interface functions PyUnicode_FromWideChar(),
88    PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
89 
90 #ifdef HAVE_USABLE_WCHAR_T
91 # ifndef HAVE_WCHAR_H
92 #  define HAVE_WCHAR_H
93 # endif
94 #endif
95 
96 #ifdef HAVE_WCHAR_H
97 #  include <wchar.h>
98 #endif
99 
100 /* Py_UCS4, Py_UCS2 and Py_UCS1 are typedefs for the respective
101    unicode representations: unsigned 32-, 16- and 8-bit integers. */
102 typedef uint32_t Py_UCS4;
103 typedef uint16_t Py_UCS2;
104 typedef uint8_t Py_UCS1;
105 
106 #ifdef __cplusplus
107 extern "C" {
108 #endif
109 
110 
111 PyAPI_DATA(PyTypeObject) PyUnicode_Type;
112 PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
113 
/* Test whether the type of op carries Py_TPFLAGS_UNICODE_SUBCLASS, using
   PyType_FastSubclass.  NOTE(review): presumably true for str and for any
   subclass of it -- confirm. */
114 #define PyUnicode_Check(op) \
115                  PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
/* Test op against &PyUnicode_Type using Py_IS_TYPE, i.e. without the
   subclass-flag test used by PyUnicode_Check. */
116 #define PyUnicode_CheckExact(op) Py_IS_TYPE(op, &PyUnicode_Type)
117 
118 /* --- Constants ---------------------------------------------------------- */
119 
120 /* This Unicode character will be used as replacement character during
121    decoding if the errors argument is set to "replace". Note: the
122    Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
123    Unicode 3.0. */
124 
125 #define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
126 
127 /* === Public API ========================================================= */
128 
129 /* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
130 PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
131     const char *u,             /* UTF-8 encoded string */
132     Py_ssize_t size            /* size of buffer */
133     );
134 
135 /* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
136    UTF-8 encoded bytes.  The size is determined with strlen(). */
137 PyAPI_FUNC(PyObject*) PyUnicode_FromString(
138     const char *u              /* UTF-8 encoded string */
139     );
140 
141 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
/* Takes a string object and two positions, start and end, and returns a
   PyObject*.  NOTE(review): presumably returns the substring
   str[start:end] as a new reference, NULL on error -- confirm against the
   implementation. */
142 PyAPI_FUNC(PyObject*) PyUnicode_Substring(
143     PyObject *str,
144     Py_ssize_t start,
145     Py_ssize_t end);
146 #endif
147 
148 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
149 /* Copy the string into a UCS4 buffer including the null character if copy_null
150    is set. Return NULL and raise an exception on error. Raise a SystemError if
151    the buffer is smaller than the string. Return buffer on success.
152 
153    buflen is the length of the buffer in (Py_UCS4) characters. */
154 PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
155     PyObject *unicode,
156     Py_UCS4* buffer,
157     Py_ssize_t buflen,
158     int copy_null);
159 
160 /* Copy the string into a UCS4 buffer. A new buffer is allocated using
161    PyMem_Malloc; if this fails, NULL is returned with a memory error
162    exception set. */
163 PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
164 #endif
165 
166 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
167 /* Get the length of the Unicode object. */
168 
169 PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
170     PyObject *unicode
171 );
172 #endif
173 
174 /* Get the number of Py_UNICODE units in the
175    string representation. */
176 
177 Py_DEPRECATED(3.3) PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
178     PyObject *unicode           /* Unicode object */
179     );
180 
181 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
182 /* Read a character from the string. */
183 
184 PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
185     PyObject *unicode,
186     Py_ssize_t index
187     );
188 
189 /* Write a character to the string. The string must have been created through
190    PyUnicode_New, must not be shared, and must not have been hashed yet.
191 
192    Return 0 on success, -1 on error. */
193 
194 PyAPI_FUNC(int) PyUnicode_WriteChar(
195     PyObject *unicode,
196     Py_ssize_t index,
197     Py_UCS4 character
198     );
199 #endif
200 
201 /* Resize a Unicode object. The length is the number of characters, except
202    if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length
203    is the number of Py_UNICODE characters.
204 
205    *unicode is modified to point to the new (resized) object and 0
206    returned on success.
207 
208    Try to resize the string in place (which is usually faster than allocating
209    a new string and copy characters), or create a new string.
210 
211    Error handling is implemented as follows: an exception is set, -1
212    is returned and *unicode left untouched.
213 
214    WARNING: The function doesn't check string content, the result may not be a
215             string in canonical representation. */
216 
217 PyAPI_FUNC(int) PyUnicode_Resize(
218     PyObject **unicode,         /* Pointer to the Unicode object */
219     Py_ssize_t length           /* New length */
220     );
221 
222 /* Decode obj to a Unicode object.
223 
224    bytes, bytearray and other bytes-like objects are decoded according to the
225    given encoding and error handler. The encoding and error handler can be
226    NULL to have the interface use UTF-8 and "strict".
227 
228    All other objects (including Unicode objects) raise an exception.
229 
230    The API returns NULL in case of an error. The caller is responsible
231    for decref'ing the returned objects.
232 
233 */
234 
235 PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
236     PyObject *obj,              /* Object */
237     const char *encoding,       /* encoding */
238     const char *errors          /* error handling */
239     );
240 
241 /* Copy an instance of a Unicode subtype to a new true Unicode object if
242    necessary. If obj is already a true Unicode object (not a subtype), return
243    the reference with *incremented* refcount.
244 
245    The API returns NULL in case of an error. The caller is responsible
246    for decref'ing the returned objects.
247 
248 */
249 
250 PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
251     PyObject *obj      /* Object */
252     );
253 
254 PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
255     const char *format,   /* ASCII-encoded string  */
256     va_list vargs
257     );
258 PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
259     const char *format,   /* ASCII-encoded string  */
260     ...
261     );
262 
263 PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
264 PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
265     const char *u              /* UTF-8 encoded string */
266     );
267 
268 // PyUnicode_InternImmortal() is deprecated since Python 3.10
269 // and will be removed in Python 3.12. Use PyUnicode_InternInPlace() instead.
270 Py_DEPRECATED(3.10) PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
271 
272 /* Read the state.interned field of op.  op is cast to PyASCIIObject*
    without any type check, so use only if you know it's a string */
273 #define PyUnicode_CHECK_INTERNED(op) \
274     (((PyASCIIObject *)(op))->state.interned)
275 
276 /* --- wchar_t support for platforms which support it --------------------- */
277 
278 #ifdef HAVE_WCHAR_H
279 
280 /* Create a Unicode Object from the wchar_t buffer w of the given
281    size.
282 
283    The buffer is copied into the new object. */
284 
285 PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
286     const wchar_t *w,           /* wchar_t buffer */
287     Py_ssize_t size             /* size of buffer */
288     );
289 
290 /* Copies the Unicode Object contents into the wchar_t buffer w.  At
291    most size wchar_t characters are copied.
292 
293    Note that the resulting wchar_t string may or may not be
294    0-terminated.  It is the responsibility of the caller to make sure
295    that the wchar_t string is 0-terminated in case this is required by
296    the application.
297 
298    Returns the number of wchar_t characters copied (excluding a
299    possibly trailing 0-termination character) or -1 in case of an
300    error. */
301 
302 PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
303     PyObject *unicode,          /* Unicode object */
304     wchar_t *w,                 /* wchar_t buffer */
305     Py_ssize_t size             /* size of buffer */
306     );
307 
308 /* Convert the Unicode object to a wide character string. The output string
309    always ends with a nul character. If size is not NULL, write the number of
310    wide characters (excluding the null character) into *size.
311 
312    Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
313    on success. On error, returns NULL, *size is undefined and raises a
314    MemoryError. */
315 
316 PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
317     PyObject *unicode,          /* Unicode object */
318     Py_ssize_t *size            /* number of characters of the result */
319     );
320 
321 #endif
322 
323 /* --- Unicode ordinals --------------------------------------------------- */
324 
325 /* Create a Unicode Object from the given Unicode code point ordinal.
326 
327    The ordinal must be in range(0x110000). A ValueError is
328    raised in case it is not.
329 
330 */
331 
332 PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
333 
334 /* === Builtin Codecs =====================================================
335 
336    Many of these APIs take two arguments encoding and errors. These
337    parameters encoding and errors have the same semantics as the ones
338    of the builtin str() API.
339 
340    Setting encoding to NULL causes the default encoding (UTF-8) to be used.
341 
342    Error handling is set by errors which may also be set to NULL
343    meaning to use the default handling defined for the codec. Default
344    error handling for all builtin codecs is "strict" (ValueErrors are
345    raised).
346 
347    The codecs all use a similar interface. Only deviation from the
348    generic ones are documented.
349 
350 */
351 
352 /* --- Manage the default encoding ---------------------------------------- */
353 
354 /* Returns "utf-8".  */
355 PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
356 
357 /* --- Generic Codecs ----------------------------------------------------- */
358 
359 /* Create a Unicode object by decoding the encoded string s of the
360    given size. */
361 
362 PyAPI_FUNC(PyObject*) PyUnicode_Decode(
363     const char *s,              /* encoded string */
364     Py_ssize_t size,            /* size of buffer */
365     const char *encoding,       /* encoding */
366     const char *errors          /* error handling */
367     );
368 
369 /* Decode a Unicode object unicode and return the result as Python
370    object.
371 
372    This API is DEPRECATED. The only supported standard encoding is rot13.
373    Use PyCodec_Decode() to decode with rot13 and non-standard codecs
374    that decode from str. */
375 
376 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
377     PyObject *unicode,          /* Unicode object */
378     const char *encoding,       /* encoding */
379     const char *errors          /* error handling */
380     );
381 
382 /* Decode a Unicode object unicode and return the result as Unicode
383    object.
384 
385    This API is DEPRECATED. The only supported standard encoding is rot13.
386    Use PyCodec_Decode() to decode with rot13 and non-standard codecs
387    that decode from str to str. */
388 
389 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
390     PyObject *unicode,          /* Unicode object */
391     const char *encoding,       /* encoding */
392     const char *errors          /* error handling */
393     );
394 
395 /* Encodes a Unicode object and returns the result as Python
396    object.
397 
398    This API is DEPRECATED.  It is superseded by PyUnicode_AsEncodedString()
399    since all standard encodings (except rot13) encode str to bytes.
400    Use PyCodec_Encode() for encoding with rot13 and non-standard codecs
401    that encode from str to non-bytes. */
402 
403 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
404     PyObject *unicode,          /* Unicode object */
405     const char *encoding,       /* encoding */
406     const char *errors          /* error handling */
407     );
408 
409 /* Encodes a Unicode object and returns the result as Python string
410    object. */
411 
412 PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
413     PyObject *unicode,          /* Unicode object */
414     const char *encoding,       /* encoding */
415     const char *errors          /* error handling */
416     );
417 
418 /* Encodes a Unicode object and returns the result as Unicode
419    object.
420 
421    This API is DEPRECATED.  The only supported standard encoding is rot13.
422    Use PyCodec_Encode() to encode with rot13 and non-standard codecs
423    that encode from str to str. */
424 
425 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
426     PyObject *unicode,          /* Unicode object */
427     const char *encoding,       /* encoding */
428     const char *errors          /* error handling */
429     );
430 
431 /* Build an encoding map. */
432 
433 PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
434     PyObject* string            /* 256 character map */
435    );
436 
437 /* --- UTF-7 Codecs ------------------------------------------------------- */
438 
439 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
440     const char *string,         /* UTF-7 encoded string */
441     Py_ssize_t length,          /* size of string */
442     const char *errors          /* error handling */
443     );
444 
445 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
446     const char *string,         /* UTF-7 encoded string */
447     Py_ssize_t length,          /* size of string */
448     const char *errors,         /* error handling */
449     Py_ssize_t *consumed        /* bytes consumed */
450     );
451 
452 /* --- UTF-8 Codecs ------------------------------------------------------- */
453 
454 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
455     const char *string,         /* UTF-8 encoded string */
456     Py_ssize_t length,          /* size of string */
457     const char *errors          /* error handling */
458     );
459 
460 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
461     const char *string,         /* UTF-8 encoded string */
462     Py_ssize_t length,          /* size of string */
463     const char *errors,         /* error handling */
464     Py_ssize_t *consumed        /* bytes consumed */
465     );
466 
467 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
468     PyObject *unicode           /* Unicode object */
469     );
470 
471 /* Returns a pointer to the default encoding (UTF-8) of the
472    Unicode object unicode and the size of the encoded representation
473    in bytes stored in *size.
474 
475    In case of an error, no *size is set.
476 
477    This function caches the UTF-8 encoded string in the unicodeobject
478    and subsequent calls will return the same string.  The memory is released
479    when the unicodeobject is deallocated.
480 */
481 
482 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030A0000
483 PyAPI_FUNC(const char *) PyUnicode_AsUTF8AndSize(
484     PyObject *unicode,
485     Py_ssize_t *size);
486 #endif
487 
488 /* --- UTF-32 Codecs ------------------------------------------------------ */
489 
490 /* Decodes length bytes from a UTF-32 encoded buffer string and returns
491    the corresponding Unicode object.
492 
493    errors (if non-NULL) defines the error handling. It defaults
494    to "strict".
495 
496    If byteorder is non-NULL, the decoder starts decoding using the
497    given byte order:
498 
499     *byteorder == -1: little endian
500     *byteorder == 0:  native order
501     *byteorder == 1:  big endian
502 
503    In native mode, the first four bytes of the stream are checked for a
504    BOM mark. If found, the BOM mark is analysed, the byte order
505    adjusted and the BOM skipped.  In the other modes, no BOM mark
506    interpretation is done. After completion, *byteorder is set to the
507    current byte order at the end of input data.
508 
509    If byteorder is NULL, the codec starts in native order mode.
510 
511 */
512 
513 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
514     const char *string,         /* UTF-32 encoded string */
515     Py_ssize_t length,          /* size of string */
516     const char *errors,         /* error handling */
517     int *byteorder              /* pointer to byteorder to use
518                                    0=native;-1=LE,1=BE; updated on
519                                    exit */
520     );
521 
522 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
523     const char *string,         /* UTF-32 encoded string */
524     Py_ssize_t length,          /* size of string */
525     const char *errors,         /* error handling */
526     int *byteorder,             /* pointer to byteorder to use
527                                    0=native;-1=LE,1=BE; updated on
528                                    exit */
529     Py_ssize_t *consumed        /* bytes consumed */
530     );
531 
532 /* Returns a Python string using the UTF-32 encoding in native byte
533    order. The string always starts with a BOM mark.  */
534 
535 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
536     PyObject *unicode           /* Unicode object */
537     );
538 
539 /* Returns a Python string object holding the UTF-32 encoded value of
540    the Unicode data.
541 
542    If byteorder is not 0, output is written according to the following
543    byte order:
544 
545    byteorder == -1: little endian
546    byteorder == 0:  native byte order (writes a BOM mark)
547    byteorder == 1:  big endian
548 
549    If byteorder is 0, the output string will always start with the
550    Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
551    prepended.
552 
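   NOTE(review): no declaration follows this comment in this header; confirm
   whether the function it describes was removed and this comment is stale.
553 */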
554 
555 /* --- UTF-16 Codecs ------------------------------------------------------ */
556 
557 /* Decodes length bytes from a UTF-16 encoded buffer string and returns
558    the corresponding Unicode object.
559 
560    errors (if non-NULL) defines the error handling. It defaults
561    to "strict".
562 
563    If byteorder is non-NULL, the decoder starts decoding using the
564    given byte order:
565 
566     *byteorder == -1: little endian
567     *byteorder == 0:  native order
568     *byteorder == 1:  big endian
569 
570    In native mode, the first two bytes of the stream are checked for a
571    BOM mark. If found, the BOM mark is analysed, the byte order
572    adjusted and the BOM skipped.  In the other modes, no BOM mark
573    interpretation is done. After completion, *byteorder is set to the
574    current byte order at the end of input data.
575 
576    If byteorder is NULL, the codec starts in native order mode.
577 
578 */
579 
580 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
581     const char *string,         /* UTF-16 encoded string */
582     Py_ssize_t length,          /* size of string */
583     const char *errors,         /* error handling */
584     int *byteorder              /* pointer to byteorder to use
585                                    0=native;-1=LE,1=BE; updated on
586                                    exit */
587     );
588 
589 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
590     const char *string,         /* UTF-16 encoded string */
591     Py_ssize_t length,          /* size of string */
592     const char *errors,         /* error handling */
593     int *byteorder,             /* pointer to byteorder to use
594                                    0=native;-1=LE,1=BE; updated on
595                                    exit */
596     Py_ssize_t *consumed        /* bytes consumed */
597     );
598 
599 /* Returns a Python string using the UTF-16 encoding in native byte
600    order. The string always starts with a BOM mark.  */
601 
602 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
603     PyObject *unicode           /* Unicode object */
604     );
605 
606 /* --- Unicode-Escape Codecs ---------------------------------------------- */
607 
608 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
609     const char *string,         /* Unicode-Escape encoded string */
610     Py_ssize_t length,          /* size of string */
611     const char *errors          /* error handling */
612     );
613 
614 PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
615     PyObject *unicode           /* Unicode object */
616     );
617 
618 /* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
619 
620 PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
621     const char *string,         /* Raw-Unicode-Escape encoded string */
622     Py_ssize_t length,          /* size of string */
623     const char *errors          /* error handling */
624     );
625 
626 PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
627     PyObject *unicode           /* Unicode object */
628     );
629 
630 /* --- Latin-1 Codecs -----------------------------------------------------
631 
632    Note: Latin-1 corresponds to the first 256 Unicode ordinals. */
633 
634 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
635     const char *string,         /* Latin-1 encoded string */
636     Py_ssize_t length,          /* size of string */
637     const char *errors          /* error handling */
638     );
639 
640 PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
641     PyObject *unicode           /* Unicode object */
642     );
643 
644 /* --- ASCII Codecs -------------------------------------------------------
645 
646    Only 7-bit ASCII data is accepted. All other codes generate errors.
647 
648 */
649 
650 PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
651     const char *string,         /* ASCII encoded string */
652     Py_ssize_t length,          /* size of string */
653     const char *errors          /* error handling */
654     );
655 
656 PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
657     PyObject *unicode           /* Unicode object */
658     );
659 
660 /* --- Character Map Codecs -----------------------------------------------
661 
662    This codec uses mappings to encode and decode characters.
663 
664    Decoding mappings must map byte ordinals (integers in the range from 0 to
665    255) to Unicode strings, integers (which are then interpreted as Unicode
666    ordinals) or None.  Unmapped data bytes (ones which cause a LookupError)
667    as well as mapped to None, 0xFFFE or '\ufffe' are treated as "undefined
668    mapping" and cause an error.
669 
670    Encoding mappings must map Unicode ordinal integers to bytes objects,
671    integers in the range from 0 to 255 or None.  Unmapped character
672    ordinals (ones which cause a LookupError) as well as mapped to
673    None are treated as "undefined mapping" and cause an error.
674 
675 */
676 
677 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
678     const char *string,         /* Encoded string */
679     Py_ssize_t length,          /* size of string */
680     PyObject *mapping,          /* decoding mapping */
681     const char *errors          /* error handling */
682     );
683 
684 PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
685     PyObject *unicode,          /* Unicode object */
686     PyObject *mapping           /* encoding mapping */
687     );
688 
689 /* --- MBCS codecs for Windows -------------------------------------------- */
690 
691 #ifdef MS_WINDOWS
692 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
693     const char *string,         /* MBCS encoded string */
694     Py_ssize_t length,          /* size of string */
695     const char *errors          /* error handling */
696     );
697 
698 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
699     const char *string,         /* MBCS encoded string */
700     Py_ssize_t length,          /* size of string */
701     const char *errors,         /* error handling */
702     Py_ssize_t *consumed        /* bytes consumed */
703     );
704 
705 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
706 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
707     int code_page,              /* code page number */
708     const char *string,         /* encoded string */
709     Py_ssize_t length,          /* size of string */
710     const char *errors,         /* error handling */
711     Py_ssize_t *consumed        /* bytes consumed */
712     );
713 #endif
714 
715 PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
716     PyObject *unicode           /* Unicode object */
717     );
718 
719 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
720 PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
721     int code_page,              /* code page number */
722     PyObject *unicode,          /* Unicode object */
723     const char *errors          /* error handling */
724     );
725 #endif
726 
727 #endif /* MS_WINDOWS */
728 
729 /* --- Locale encoding --------------------------------------------------- */
730 
731 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
732 /* Decode a string from the current locale encoding. The decoder is strict if
733    *surrogateescape* is equal to zero, otherwise it uses the 'surrogateescape'
734    error handler (PEP 383) to escape undecodable bytes. If a byte sequence can
735    be decoded as a surrogate character and *surrogateescape* is not equal to
736    zero, the byte sequence is escaped using the 'surrogateescape' error handler
737    instead of being decoded. *str* must end with a null character but cannot
738    contain embedded null characters. */
739 
740 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
741     const char *str,
742     Py_ssize_t len,
743     const char *errors);
744 
745 /* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string
746    length using strlen(). */
747 
748 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
749     const char *str,
750     const char *errors);
751 
752 /* Encode a Unicode object to the current locale encoding. The encoder is
753    strict if *surrogateescape* is equal to zero, otherwise the
754    "surrogateescape" error handler is used. Return a bytes object. The string
755    cannot contain embedded null characters. */
756 
757 PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
758     PyObject *unicode,
759     const char *errors
760     );
761 #endif
762 
763 /* --- File system encoding ---------------------------------------------- */
764 
765 /* ParseTuple converter: encode str objects to bytes using
766    PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
767 
768 PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
769 
770 /* ParseTuple converter: decode bytes objects to unicode using
771    PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
772 
773 PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
774 
775 /* Decode a null-terminated string using Py_FileSystemDefaultEncoding
776    and the "surrogateescape" error handler.
777 
778    If Py_FileSystemDefaultEncoding is not set, fall back to the locale
779    encoding.
780 
781    Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
782 */
783 
784 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
785     const char *s               /* encoded string */
786     );
787 
788 /* Decode a string using Py_FileSystemDefaultEncoding
789    and the "surrogateescape" error handler.
790 
791    If Py_FileSystemDefaultEncoding is not set, fall back to the locale
792    encoding.
793 */
794 
795 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
796     const char *s,               /* encoded string */
797     Py_ssize_t size              /* size */
798     );
799 
800 /* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
801    "surrogateescape" error handler, and return bytes.
802 
803    If Py_FileSystemDefaultEncoding is not set, fall back to the locale
804    encoding.
805 */
806 
807 PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
808     PyObject *unicode
809     );
810 
811 /* --- Methods & Slots ----------------------------------------------------
812 
813    These are capable of handling Unicode objects and strings on input
814    (we refer to them as strings in the descriptions) and return
815    Unicode objects or integers as appropriate. */
816 
817 /* Concat two strings giving a new Unicode string. */
818 
819 PyAPI_FUNC(PyObject*) PyUnicode_Concat(
820     PyObject *left,             /* Left string */
821     PyObject *right             /* Right string */
822     );
823 
824 /* Concat two strings and put the result in *pleft
825    (sets *pleft to NULL on error) */
826 
827 PyAPI_FUNC(void) PyUnicode_Append(
828     PyObject **pleft,           /* Pointer to left string */
829     PyObject *right             /* Right string */
830     );
831 
832 /* Concat two strings, put the result in *pleft and drop the right object
833    (sets *pleft to NULL on error) */
834 
835 PyAPI_FUNC(void) PyUnicode_AppendAndDel(
836     PyObject **pleft,           /* Pointer to left string */
837     PyObject *right             /* Right string */
838     );
839 
840 /* Split a string giving a list of Unicode strings.
841 
842    If sep is NULL, splitting will be done at all whitespace
843    substrings. Otherwise, splits occur at the given separator.
844 
845    At most maxsplit splits will be done. If negative, no limit is set.
846 
847    Separators are not included in the resulting list.
848 
849 */
850 
851 PyAPI_FUNC(PyObject*) PyUnicode_Split(
852     PyObject *s,                /* String to split */
853     PyObject *sep,              /* String separator */
854     Py_ssize_t maxsplit         /* Maxsplit count */
855     );
856 
857 /* Ditto, but split at line breaks.
858 
859    CRLF is considered to be one line break. Line break characters are
860    included in the resulting strings only if keepends is nonzero. */
861 
862 PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
863     PyObject *s,                /* String to split */
864     int keepends                /* If true, line end markers are included */
865     );
866 
867 /* Partition a string using a given separator. */
868 
869 PyAPI_FUNC(PyObject*) PyUnicode_Partition(
870     PyObject *s,                /* String to partition */
871     PyObject *sep               /* String separator */
872     );
873 
874 /* Partition a string using a given separator, searching from the end of the
875    string. */
876 
877 PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
878     PyObject *s,                /* String to partition */
879     PyObject *sep               /* String separator */
880     );
881 
882 /* Split a string giving a list of Unicode strings.
883 
884    If sep is NULL, splitting will be done at all whitespace
885    substrings. Otherwise, splits occur at the given separator.
886 
887    At most maxsplit splits will be done; if maxsplit is negative, no
888    limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
889    the end of the string.
890 
891    Separators are not included in the resulting list.
892 
893 */
894 
895 PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
896     PyObject *s,                /* String to split */
897     PyObject *sep,              /* String separator */
898     Py_ssize_t maxsplit         /* Maxsplit count */
899     );
900 
901 /* Translate a string by applying a character mapping table to it and
902    return the resulting Unicode object.
903 
904    The mapping table must map Unicode ordinal integers to Unicode strings,
905    Unicode ordinal integers or None (causing deletion of the character).
906 
907    Mapping tables may be dictionaries or sequences. Unmapped character
908    ordinals (ones which cause a LookupError) are left untouched and
909    are copied as-is.
910 
911 */
912 
913 PyAPI_FUNC(PyObject *) PyUnicode_Translate(
914     PyObject *str,              /* String */
915     PyObject *table,            /* Translate table */
916     const char *errors          /* error handling */
917     );
918 
919 /* Join a sequence of strings using the given separator and return
920    the resulting Unicode string. */
921 
922 PyAPI_FUNC(PyObject*) PyUnicode_Join(
923     PyObject *separator,        /* Separator string */
924     PyObject *seq               /* Sequence object */
925     );
926 
927 /* Return 1 if substr matches str[start:end] at the given tail end, 0
928    otherwise. Return -1 if an error occurred. */
929 
930 PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
931     PyObject *str,              /* String */
932     PyObject *substr,           /* Prefix or Suffix string */
933     Py_ssize_t start,           /* Start index */
934     Py_ssize_t end,             /* Stop index */
935     int direction               /* Tail end: -1 prefix, +1 suffix */
936     );
937 
938 /* Return the first position of substr in str[start:end] using the
939    given search direction or -1 if not found. -2 is returned in case
940    an error occurred and an exception is set. */
941 
942 PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
943     PyObject *str,              /* String */
944     PyObject *substr,           /* Substring to find */
945     Py_ssize_t start,           /* Start index */
946     Py_ssize_t end,             /* Stop index */
947     int direction               /* Find direction: +1 forward, -1 backward */
948     );
949 
950 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
951 /* Like PyUnicode_Find, but search for single character only. */
952 PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
953     PyObject *str,              /* String */
954     Py_UCS4 ch,                 /* Character to find */
955     Py_ssize_t start,           /* Start index */
956     Py_ssize_t end,             /* Stop index */
957     int direction               /* Find direction: +1 forward, -1 backward */
958     );
959 #endif
960 
961 /* Count the number of occurrences of substr in str[start:end]. */
962 
963 PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
964     PyObject *str,              /* String */
965     PyObject *substr,           /* Substring to count */
966     Py_ssize_t start,           /* Start index */
967     Py_ssize_t end              /* Stop index */
968     );
969 
970 /* Replace at most maxcount occurrences of substr in str with replstr
971    and return the resulting Unicode object. */
972 
973 PyAPI_FUNC(PyObject *) PyUnicode_Replace(
974     PyObject *str,              /* String */
975     PyObject *substr,           /* Substring to find */
976     PyObject *replstr,          /* Substring to replace */
977     Py_ssize_t maxcount         /* Max. number of replacements to apply;
978                                    -1 = all */
979     );
980 
981 /* Compare two strings and return -1, 0, 1 for less than, equal,
982    greater than resp.
983    Raise an exception and return -1 on error. */
984 
985 PyAPI_FUNC(int) PyUnicode_Compare(
986     PyObject *left,             /* Left string */
987     PyObject *right             /* Right string */
988     );
989 
990 /* Compare a Unicode object with C string and return -1, 0, 1 for less than,
991    equal, and greater than, respectively.  It is best to pass only
992    ASCII-encoded strings, but the function interprets the input string as
993    ISO-8859-1 if it contains non-ASCII characters.
994    This function does not raise exceptions. */
995 
996 PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
997     PyObject *left,
998     const char *right           /* ASCII-encoded string */
999     );
1000 
1001 /* Rich compare two strings and return one of the following:
1002 
1003    - NULL in case an exception was raised
1004    - Py_True or Py_False for successful comparisons
1005    - Py_NotImplemented in case the type combination is unknown
1006 
1007    Possible values for op:
1008 
1009      Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1010 
1011 */
1012 
1013 PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
1014     PyObject *left,             /* Left string */
1015     PyObject *right,            /* Right string */
1016     int op                      /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
1017     );
1018 
1019 /* Apply an argument tuple or dictionary to a format string and return
1020    the resulting Unicode string. */
1021 
1022 PyAPI_FUNC(PyObject *) PyUnicode_Format(
1023     PyObject *format,           /* Format string */
1024     PyObject *args              /* Argument tuple or dictionary */
1025     );
1026 
1027 /* Checks whether element is contained in container and return 1/0
1028    accordingly.
1029 
1030    element has to coerce to a one element Unicode string. -1 is
1031    returned in case of an error. */
1032 
1033 PyAPI_FUNC(int) PyUnicode_Contains(
1034     PyObject *container,        /* Container string */
1035     PyObject *element           /* Element string */
1036     );
1037 
1038 /* Checks whether argument is a valid identifier. */
1039 
1040 PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1041 
1042 /* === Characters Type APIs =============================================== */
1043 
1044 #ifndef Py_LIMITED_API
1045 #  define Py_CPYTHON_UNICODEOBJECT_H
1046 #  include  "cpython/unicodeobject.h"
1047 #  undef Py_CPYTHON_UNICODEOBJECT_H
1048 #endif
1049 
1050 #ifdef __cplusplus
1051 }
1052 #endif
1053 #endif /* !Py_UNICODEOBJECT_H */
1054