• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #ifndef Py_UNICODEOBJECT_H
2 #define Py_UNICODEOBJECT_H
3 
4 #include <stdarg.h>
5 
6 /*
7 
8 Unicode implementation based on original code by Fredrik Lundh,
9 modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
10 Unicode Integration Proposal. (See
11 http://www.egenix.com/files/python/unicode-proposal.txt).
12 
13 Copyright (c) Corporation for National Research Initiatives.
14 
15 
16  Original header:
17  --------------------------------------------------------------------
18 
19  * Yet another Unicode string type for Python.  This type supports the
20  * 16-bit Basic Multilingual Plane (BMP) only.
21  *
22  * Written by Fredrik Lundh, January 1999.
23  *
24  * Copyright (c) 1999 by Secret Labs AB.
25  * Copyright (c) 1999 by Fredrik Lundh.
26  *
27  * fredrik@pythonware.com
28  * http://www.pythonware.com
29  *
30  * --------------------------------------------------------------------
31  * This Unicode String Type is
32  *
33  * Copyright (c) 1999 by Secret Labs AB
34  * Copyright (c) 1999 by Fredrik Lundh
35  *
36  * By obtaining, using, and/or copying this software and/or its
37  * associated documentation, you agree that you have read, understood,
38  * and will comply with the following terms and conditions:
39  *
40  * Permission to use, copy, modify, and distribute this software and its
41  * associated documentation for any purpose and without fee is hereby
42  * granted, provided that the above copyright notice appears in all
43  * copies, and that both that copyright notice and this permission notice
44  * appear in supporting documentation, and that the name of Secret Labs
45  * AB or the author not be used in advertising or publicity pertaining to
46  * distribution of the software without specific, written prior
47  * permission.
48  *
49  * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50  * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51  * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52  * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55  * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56  * -------------------------------------------------------------------- */
57 
58 #include <ctype.h>
59 
60 /* === Internal API ======================================================= */
61 
62 /* --- Internal Unicode Format -------------------------------------------- */
63 
64 /* Python 3.x requires unicode */
65 #define Py_USING_UNICODE
66 
67 #ifndef SIZEOF_WCHAR_T
68 #error Must define SIZEOF_WCHAR_T
69 #endif
70 
71 #define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72 
73 /* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74    Otherwise, Unicode strings are stored as UCS-2 (with limited support
75    for UTF-16) */
76 
77 #if Py_UNICODE_SIZE >= 4
78 #define Py_UNICODE_WIDE
79 #endif
80 
81 /* Set these flags if the platform has "wchar.h" and the
82    wchar_t type is a 16-bit unsigned type */
83 /* #define HAVE_WCHAR_H */
84 /* #define HAVE_USABLE_WCHAR_T */
85 
86 /* If the compiler provides a wchar_t type we try to support it
87    through the interface functions PyUnicode_FromWideChar(),
88    PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
89 
90 #ifdef HAVE_USABLE_WCHAR_T
91 # ifndef HAVE_WCHAR_H
92 #  define HAVE_WCHAR_H
93 # endif
94 #endif
95 
96 #ifdef HAVE_WCHAR_H
97 #  include <wchar.h>
98 #endif
99 
100 /* Py_UCS4, Py_UCS2 and Py_UCS1 are typedefs for the respective
101    unicode representations (32-, 16- and 8-bit unsigned integers). */
102 typedef uint32_t Py_UCS4;
103 typedef uint16_t Py_UCS2;
104 typedef uint8_t Py_UCS1;
105 
106 #ifdef __cplusplus
107 extern "C" {
108 #endif
109 
110 
111 PyAPI_DATA(PyTypeObject) PyUnicode_Type;
112 PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
113 
114 #define PyUnicode_Check(op) \
115                  PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS) /* type-flag test */
116 #define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type) /* exact type match only */
117 
118 /* --- Constants ---------------------------------------------------------- */
119 
120 /* This Unicode character will be used as replacement character during
121    decoding if the errors argument is set to "replace". Note: the
122    Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
123    Unicode 3.0. */
124 
125 #define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
126 
127 /* === Public API ========================================================= */
128 
129 /* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
130 PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
131     const char *u,             /* UTF-8 encoded string */
132     Py_ssize_t size            /* size of buffer */
133     );
134 
135 /* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
136    UTF-8 encoded bytes.  The size is determined with strlen(). */
137 PyAPI_FUNC(PyObject*) PyUnicode_FromString(
138     const char *u              /* UTF-8 encoded string */
139     );
140 
141 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
142 PyAPI_FUNC(PyObject*) PyUnicode_Substring(
143     PyObject *str,
144     Py_ssize_t start,
145     Py_ssize_t end);
146 #endif
147 
148 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
149 /* Copy the string into a UCS4 buffer including the null character if copy_null
150    is set. Return NULL and raise an exception on error. Raise a SystemError if
151    the buffer is smaller than the string. Return buffer on success.
152 
153    buflen is the length of the buffer in (Py_UCS4) characters. */
154 PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
155     PyObject *unicode,
156     Py_UCS4* buffer,
157     Py_ssize_t buflen,
158     int copy_null);
159 
160 /* Copy the string into a UCS4 buffer. A new buffer is allocated using
161    PyMem_Malloc; if this fails, NULL is returned with a memory error
162    exception set. */
163 PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
164 #endif
165 
166 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
167 /* Get the length of the Unicode object. */
168 
169 PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
170     PyObject *unicode
171 );
172 #endif
173 
174 /* Get the number of Py_UNICODE units in the
175    string representation. Deprecated since Python 3.3. */
176 
177 Py_DEPRECATED(3.3) PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
178     PyObject *unicode           /* Unicode object */
179     );
180 
181 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
182 /* Read a character from the string. */
183 
184 PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
185     PyObject *unicode,
186     Py_ssize_t index
187     );
188 
189 /* Write a character to the string. The string must have been created through
190    PyUnicode_New, must not be shared, and must not have been hashed yet.
191 
192    Return 0 on success, -1 on error. */
193 
194 PyAPI_FUNC(int) PyUnicode_WriteChar(
195     PyObject *unicode,
196     Py_ssize_t index,
197     Py_UCS4 character
198     );
199 #endif
200 
201 /* Resize a Unicode object. The length is the number of characters, except
202    if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length
203    is the number of Py_UNICODE characters.
204 
205    *unicode is modified to point to the new (resized) object and 0
206    returned on success.
207 
208    Try to resize the string in place (which is usually faster than allocating
209    a new string and copy characters), or create a new string.
210 
211    Error handling is implemented as follows: an exception is set, -1
212    is returned and *unicode left untouched.
213 
214    WARNING: The function doesn't check string content, the result may not be a
215             string in canonical representation. */
216 
217 PyAPI_FUNC(int) PyUnicode_Resize(
218     PyObject **unicode,         /* Pointer to the Unicode object */
219     Py_ssize_t length           /* New length */
220     );
221 
222 /* Decode obj to a Unicode object.
223 
224    bytes, bytearray and other bytes-like objects are decoded according to the
225    given encoding and error handler. The encoding and error handler can be
226    NULL to have the interface use UTF-8 and "strict".
227 
228    All other objects (including Unicode objects) raise an exception.
229 
230    The API returns NULL in case of an error. The caller is responsible
231    for decref'ing the returned objects.
232 
233 */
234 
235 PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
236     PyObject *obj,              /* Object */
237     const char *encoding,       /* encoding */
238     const char *errors          /* error handling */
239     );
240 
241 /* Copy an instance of a Unicode subtype to a new true Unicode object if
242    necessary. If obj is already a true Unicode object (not a subtype), return
243    the reference with *incremented* refcount.
244 
245    The API returns NULL in case of an error. The caller is responsible
246    for decref'ing the returned objects.
247 
248 */
249 
250 PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
251     PyObject *obj      /* Object */
252     );
253 
254 PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
255     const char *format,   /* ASCII-encoded string  */
256     va_list vargs
257     );
258 PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
259     const char *format,   /* ASCII-encoded string  */
260     ...
261     );
262 
263 PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
264 PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
265 PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
266     const char *u              /* UTF-8 encoded string */
267     );
268 
269 /* Read the interned state of op. No type check: op is cast as-is, so use only if you know it's a string */
270 #define PyUnicode_CHECK_INTERNED(op) \
271     (((PyASCIIObject *)(op))->state.interned)
272 
273 /* --- wchar_t support for platforms which support it --------------------- */
274 
275 #ifdef HAVE_WCHAR_H
276 
277 /* Create a Unicode Object from the wchar_t buffer w of the given
278    size.
279 
280    The buffer is copied into the new object. */
281 
282 PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
283     const wchar_t *w,           /* wchar_t buffer */
284     Py_ssize_t size             /* size of buffer */
285     );
286 
287 /* Copies the Unicode Object contents into the wchar_t buffer w.  At
288    most size wchar_t characters are copied.
289 
290    Note that the resulting wchar_t string may or may not be
291    0-terminated.  It is the responsibility of the caller to make sure
292    that the wchar_t string is 0-terminated in case this is required by
293    the application.
294 
295    Returns the number of wchar_t characters copied (excluding a
296    possibly trailing 0-termination character) or -1 in case of an
297    error. */
298 
299 PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
300     PyObject *unicode,          /* Unicode object */
301     wchar_t *w,                 /* wchar_t buffer */
302     Py_ssize_t size             /* size of buffer */
303     );
304 
305 /* Convert the Unicode object to a wide character string. The output string
306    always ends with a nul character. If size is not NULL, write the number of
307    wide characters (excluding the null character) into *size.
308 
309    Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
310    on success. On error, returns NULL, *size is undefined and raises a
311    MemoryError. */
312 
313 PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
314     PyObject *unicode,          /* Unicode object */
315     Py_ssize_t *size            /* number of characters of the result */
316     );
317 
318 #endif
319 
320 /* --- Unicode ordinals --------------------------------------------------- */
321 
322 /* Create a Unicode Object from the given Unicode code point ordinal.
323 
324    The ordinal must be in range(0x110000). A ValueError is
325    raised in case it is not.
326 
327 */
328 
329 PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
330 
331 /* --- Free-list management ----------------------------------------------- */
332 
333 /* Clear the free list used by the Unicode implementation.
334 
335    This can be used to release memory used for objects on the free
336    list back to the Python memory allocator.
337 
338 */
339 
340 PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
341 
342 /* === Builtin Codecs =====================================================
343 
344    Many of these APIs take two arguments encoding and errors. These
345    parameters encoding and errors have the same semantics as the ones
346    of the builtin str() API.
347 
348    Setting encoding to NULL causes the default encoding (UTF-8) to be used.
349 
350    Error handling is set by errors which may also be set to NULL
351    meaning to use the default handling defined for the codec. Default
352    error handling for all builtin codecs is "strict" (ValueErrors are
353    raised).
354 
355    The codecs all use a similar interface. Only deviation from the
356    generic ones are documented.
357 
358 */
359 
360 /* --- Manage the default encoding ---------------------------------------- */
361 
362 /* Returns "utf-8".  */
363 PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
364 
365 /* --- Generic Codecs ----------------------------------------------------- */
366 
367 /* Create a Unicode object by decoding the encoded string s of the
368    given size. */
369 
370 PyAPI_FUNC(PyObject*) PyUnicode_Decode(
371     const char *s,              /* encoded string */
372     Py_ssize_t size,            /* size of buffer */
373     const char *encoding,       /* encoding */
374     const char *errors          /* error handling */
375     );
376 
377 /* Decode a Unicode object unicode and return the result as Python
378    object.
379 
380    This API is DEPRECATED. The only supported standard encoding is rot13.
381    Use PyCodec_Decode() to decode with rot13 and non-standard codecs
382    that decode from str. */
383 
384 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
385     PyObject *unicode,          /* Unicode object */
386     const char *encoding,       /* encoding */
387     const char *errors          /* error handling */
388     );
389 
390 /* Decode a Unicode object unicode and return the result as Unicode
391    object.
392 
393    This API is DEPRECATED. The only supported standard encoding is rot13.
394    Use PyCodec_Decode() to decode with rot13 and non-standard codecs
395    that decode from str to str. */
396 
397 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
398     PyObject *unicode,          /* Unicode object */
399     const char *encoding,       /* encoding */
400     const char *errors          /* error handling */
401     );
402 
403 /* Encodes a Unicode object and returns the result as Python
404    object.
405 
406    This API is DEPRECATED.  It is superseded by PyUnicode_AsEncodedString()
407    since all standard encodings (except rot13) encode str to bytes.
408    Use PyCodec_Encode() for encoding with rot13 and non-standard codecs
409    that encode from str to non-bytes. */
410 
411 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
412     PyObject *unicode,          /* Unicode object */
413     const char *encoding,       /* encoding */
414     const char *errors          /* error handling */
415     );
416 
417 /* Encodes a Unicode object and returns the result as Python bytes
418    object. */
419 
420 PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
421     PyObject *unicode,          /* Unicode object */
422     const char *encoding,       /* encoding */
423     const char *errors          /* error handling */
424     );
425 
426 /* Encodes a Unicode object and returns the result as Unicode
427    object.
428 
429    This API is DEPRECATED.  The only supported standard encoding is rot13.
430    Use PyCodec_Encode() to encode with rot13 and non-standard codecs
431    that encode from str to str. */
432 
433 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
434     PyObject *unicode,          /* Unicode object */
435     const char *encoding,       /* encoding */
436     const char *errors          /* error handling */
437     );
438 
439 /* Build an encoding map. */
440 
441 PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
442     PyObject* string            /* 256 character map */
443    );
444 
445 /* --- UTF-7 Codecs ------------------------------------------------------- */
446 
447 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
448     const char *string,         /* UTF-7 encoded string */
449     Py_ssize_t length,          /* size of string */
450     const char *errors          /* error handling */
451     );
452 
453 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
454     const char *string,         /* UTF-7 encoded string */
455     Py_ssize_t length,          /* size of string */
456     const char *errors,         /* error handling */
457     Py_ssize_t *consumed        /* bytes consumed */
458     );
459 
460 /* --- UTF-8 Codecs ------------------------------------------------------- */
461 
462 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
463     const char *string,         /* UTF-8 encoded string */
464     Py_ssize_t length,          /* size of string */
465     const char *errors          /* error handling */
466     );
467 
468 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
469     const char *string,         /* UTF-8 encoded string */
470     Py_ssize_t length,          /* size of string */
471     const char *errors,         /* error handling */
472     Py_ssize_t *consumed        /* bytes consumed */
473     );
474 
475 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
476     PyObject *unicode           /* Unicode object */
477     );
478 
479 /* --- UTF-32 Codecs ------------------------------------------------------ */
480 
481 /* Decodes length bytes from a UTF-32 encoded buffer string and returns
482    the corresponding Unicode object.
483 
484    errors (if non-NULL) defines the error handling. It defaults
485    to "strict".
486 
487    If byteorder is non-NULL, the decoder starts decoding using the
488    given byte order:
489 
490     *byteorder == -1: little endian
491     *byteorder == 0:  native order
492     *byteorder == 1:  big endian
493 
494    In native mode, the first four bytes of the stream are checked for a
495    BOM mark. If found, the BOM mark is analysed, the byte order
496    adjusted and the BOM skipped.  In the other modes, no BOM mark
497    interpretation is done. After completion, *byteorder is set to the
498    current byte order at the end of input data.
499 
500    If byteorder is NULL, the codec starts in native order mode.
501 
502 */
503 
504 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
505     const char *string,         /* UTF-32 encoded string */
506     Py_ssize_t length,          /* size of string */
507     const char *errors,         /* error handling */
508     int *byteorder              /* pointer to byteorder to use
509                                    0=native;-1=LE,1=BE; updated on
510                                    exit */
511     );
512 
513 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
514     const char *string,         /* UTF-32 encoded string */
515     Py_ssize_t length,          /* size of string */
516     const char *errors,         /* error handling */
517     int *byteorder,             /* pointer to byteorder to use
518                                    0=native;-1=LE,1=BE; updated on
519                                    exit */
520     Py_ssize_t *consumed        /* bytes consumed */
521     );
522 
523 /* Returns a Python bytes object using the UTF-32 encoding in native byte
524    order. The data always starts with a BOM mark.  */
525 
526 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
527     PyObject *unicode           /* Unicode object */
528     );
529 
530 /* Returns a Python string object holding the UTF-32 encoded value of
531    the Unicode data.
532 
533    If byteorder is not 0, output is written according to the following
534    byte order:
535 
536    byteorder == -1: little endian
537    byteorder == 0:  native byte order (writes a BOM mark)
538    byteorder == 1:  big endian
539 
540    If byteorder is 0, the output string will always start with the
541    Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
542    prepended.
543 
544 */
545 
546 /* --- UTF-16 Codecs ------------------------------------------------------ */
547 
548 /* Decodes length bytes from a UTF-16 encoded buffer string and returns
549    the corresponding Unicode object.
550 
551    errors (if non-NULL) defines the error handling. It defaults
552    to "strict".
553 
554    If byteorder is non-NULL, the decoder starts decoding using the
555    given byte order:
556 
557     *byteorder == -1: little endian
558     *byteorder == 0:  native order
559     *byteorder == 1:  big endian
560 
561    In native mode, the first two bytes of the stream are checked for a
562    BOM mark. If found, the BOM mark is analysed, the byte order
563    adjusted and the BOM skipped.  In the other modes, no BOM mark
564    interpretation is done. After completion, *byteorder is set to the
565    current byte order at the end of input data.
566 
567    If byteorder is NULL, the codec starts in native order mode.
568 
569 */
570 
571 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
572     const char *string,         /* UTF-16 encoded string */
573     Py_ssize_t length,          /* size of string */
574     const char *errors,         /* error handling */
575     int *byteorder              /* pointer to byteorder to use
576                                    0=native;-1=LE,1=BE; updated on
577                                    exit */
578     );
579 
580 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
581     const char *string,         /* UTF-16 encoded string */
582     Py_ssize_t length,          /* size of string */
583     const char *errors,         /* error handling */
584     int *byteorder,             /* pointer to byteorder to use
585                                    0=native;-1=LE,1=BE; updated on
586                                    exit */
587     Py_ssize_t *consumed        /* bytes consumed */
588     );
589 
590 /* Returns a Python bytes object using the UTF-16 encoding in native byte
591    order. The data always starts with a BOM mark.  */
592 
593 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
594     PyObject *unicode           /* Unicode object */
595     );
596 
597 /* --- Unicode-Escape Codecs ---------------------------------------------- */
598 
599 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
600     const char *string,         /* Unicode-Escape encoded string */
601     Py_ssize_t length,          /* size of string */
602     const char *errors          /* error handling */
603     );
604 
605 PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
606     PyObject *unicode           /* Unicode object */
607     );
608 
609 /* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
610 
611 PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
612     const char *string,         /* Raw-Unicode-Escape encoded string */
613     Py_ssize_t length,          /* size of string */
614     const char *errors          /* error handling */
615     );
616 
617 PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
618     PyObject *unicode           /* Unicode object */
619     );
620 
621 /* --- Latin-1 Codecs -----------------------------------------------------
622 
623    Note: Latin-1 corresponds to the first 256 Unicode ordinals. */
624 
625 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
626     const char *string,         /* Latin-1 encoded string */
627     Py_ssize_t length,          /* size of string */
628     const char *errors          /* error handling */
629     );
630 
631 PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
632     PyObject *unicode           /* Unicode object */
633     );
634 
635 /* --- ASCII Codecs -------------------------------------------------------
636 
637    Only 7-bit ASCII data is accepted. All other codes generate errors.
638 
639 */
640 
641 PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
642     const char *string,         /* ASCII encoded string */
643     Py_ssize_t length,          /* size of string */
644     const char *errors          /* error handling */
645     );
646 
647 PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
648     PyObject *unicode           /* Unicode object */
649     );
650 
651 /* --- Character Map Codecs -----------------------------------------------
652 
653    This codec uses mappings to encode and decode characters.
654 
655    Decoding mappings must map byte ordinals (integers in the range from 0 to
656    255) to Unicode strings, integers (which are then interpreted as Unicode
657    ordinals) or None.  Unmapped data bytes (ones which cause a LookupError)
658    as well as mapped to None, 0xFFFE or '\ufffe' are treated as "undefined
659    mapping" and cause an error.
660 
661    Encoding mappings must map Unicode ordinal integers to bytes objects,
662    integers in the range from 0 to 255 or None.  Unmapped character
663    ordinals (ones which cause a LookupError) as well as mapped to
664    None are treated as "undefined mapping" and cause an error.
665 
666 */
667 
668 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
669     const char *string,         /* Encoded string */
670     Py_ssize_t length,          /* size of string */
671     PyObject *mapping,          /* decoding mapping */
672     const char *errors          /* error handling */
673     );
674 
675 PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
676     PyObject *unicode,          /* Unicode object */
677     PyObject *mapping           /* encoding mapping */
678     );
679 
680 /* --- MBCS codecs for Windows -------------------------------------------- */
681 
682 #ifdef MS_WINDOWS
683 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
684     const char *string,         /* MBCS encoded string */
685     Py_ssize_t length,          /* size of string */
686     const char *errors          /* error handling */
687     );
688 
689 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
690     const char *string,         /* MBCS encoded string */
691     Py_ssize_t length,          /* size of string */
692     const char *errors,         /* error handling */
693     Py_ssize_t *consumed        /* bytes consumed */
694     );
695 
696 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
697 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
698     int code_page,              /* code page number */
699     const char *string,         /* encoded string */
700     Py_ssize_t length,          /* size of string */
701     const char *errors,         /* error handling */
702     Py_ssize_t *consumed        /* bytes consumed */
703     );
704 #endif
705 
706 PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
707     PyObject *unicode           /* Unicode object */
708     );
709 
710 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
711 PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
712     int code_page,              /* code page number */
713     PyObject *unicode,          /* Unicode object */
714     const char *errors          /* error handling */
715     );
716 #endif
717 
718 #endif /* MS_WINDOWS */
719 
720 /* --- Locale encoding --------------------------------------------------- */
721 
722 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
723 /* Decode a string from the current locale encoding. The error handler is
724    selected by name through *errors*: with "strict" the decoder is strict,
725    with "surrogateescape" (PEP 383) undecodable bytes are escaped. If a byte
726    sequence can be decoded as a surrogate character and the error handler is
727    "surrogateescape", the byte sequence is escaped using that error handler
728    instead of being decoded. *str* must end with a null character but cannot
729    contain embedded null characters. */
730 
731 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
732     const char *str,            /* locale-encoded string, null-terminated */
733     Py_ssize_t len,             /* length of str */
734     const char *errors);        /* error handling */
735 
736 /* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string
737    length using strlen(). */
738 
739 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
740     const char *str,            /* locale-encoded string, null-terminated */
741     const char *errors);        /* error handling */
742 
743 /* Encode a Unicode object to the current locale encoding. The error handler
744    is selected by name through *errors*: "strict" makes the encoder strict,
745    "surrogateescape" uses that error handler. Return a bytes object. The string
746    cannot contain embedded null characters. */
747 
748 PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
749     PyObject *unicode,          /* Unicode object */
750     const char *errors          /* error handling */
751     );
752 #endif
753 
754 /* --- File system encoding ---------------------------------------------- */
755 
756 /* ParseTuple converter: encode str objects to bytes using
757    PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
758 
759 PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
760 
761 /* ParseTuple converter: decode bytes objects to unicode using
762    PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
763 
764 PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
765 
766 /* Decode a null-terminated string using Py_FileSystemDefaultEncoding
767    and the "surrogateescape" error handler.
768 
769    If Py_FileSystemDefaultEncoding is not set, fall back to the locale
770    encoding.
771 
772    Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
773 */
774 
775 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
776     const char *s               /* encoded string */
777     );
778 
779 /* Decode a string using Py_FileSystemDefaultEncoding
780    and the "surrogateescape" error handler.
781 
782    If Py_FileSystemDefaultEncoding is not set, fall back to the locale
783    encoding.
784 */
785 
786 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
787     const char *s,               /* encoded string */
788     Py_ssize_t size              /* size */
789     );
790 
791 /* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
792    "surrogateescape" error handler, and return bytes.
793 
794    If Py_FileSystemDefaultEncoding is not set, fall back to the locale
795    encoding.
796 */
797 
798 PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
799     PyObject *unicode
800     );
801 
802 /* --- Methods & Slots ----------------------------------------------------
803 
804    These are capable of handling Unicode objects and strings on input
805    (we refer to them as strings in the descriptions) and return
806    Unicode objects or integers as appropriate. */
807 
808 /* Concat two strings giving a new Unicode string. */
809 
810 PyAPI_FUNC(PyObject*) PyUnicode_Concat(
811     PyObject *left,             /* Left string */
812     PyObject *right             /* Right string */
813     );
814 
815 /* Concat two strings and put the result in *pleft
816    (sets *pleft to NULL on error) */
817 
818 PyAPI_FUNC(void) PyUnicode_Append(
819     PyObject **pleft,           /* Pointer to left string */
820     PyObject *right             /* Right string */
821     );
822 
823 /* Concat two strings, put the result in *pleft and drop the right object
824    (sets *pleft to NULL on error) */
825 
826 PyAPI_FUNC(void) PyUnicode_AppendAndDel(
827     PyObject **pleft,           /* Pointer to left string */
828     PyObject *right             /* Right string */
829     );
830 
831 /* Split a string, giving a list of Unicode strings.
832 
833    If sep is NULL, splitting will be done at all whitespace
834    substrings. Otherwise, splits occur at the given separator.
835 
836    At most maxsplit splits are done; a negative maxsplit sets no limit.
837 
838    Separators are not included in the resulting list.
839 
840 */
841 
842 PyAPI_FUNC(PyObject*) PyUnicode_Split(
843     PyObject *s,                /* String to split */
844     PyObject *sep,              /* Separator string, or NULL for whitespace */
845     Py_ssize_t maxsplit         /* Max. number of splits; negative = no limit */
846     );
847 
848 /* Split a string at line breaks, giving a list of Unicode strings.
849 
850    CRLF is considered to be one line break. Line breaks are not
851    included in the resulting list unless keepends is true. */
852 
853 PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
854     PyObject *s,                /* String to split */
855     int keepends                /* If true, line end markers are included */
856     );
857 
858 /* Partition the string s using the given separator string sep. */
859 
860 PyAPI_FUNC(PyObject*) PyUnicode_Partition(
861     PyObject *s,                /* String to partition */
862     PyObject *sep               /* Separator string to partition at */
863     );
864 
865 /* Partition the string s using the given separator string sep, searching
866    from the end of the string. */
867 
868 PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
869     PyObject *s,                /* String to partition */
870     PyObject *sep               /* Separator string, searched for from the end of s */
871     );
872 
873 /* Split a string, giving a list of Unicode strings.
874 
875    If sep is NULL, splitting will be done at all whitespace
876    substrings. Otherwise, splits occur at the given separator.
877 
878    At most maxsplit splits will be done. Unlike PyUnicode_Split,
879    PyUnicode_RSplit splits from the end of the string. If maxsplit is
880    negative, no limit is set.
881 
882    Separators are not included in the resulting list.
883 
884 */
885 
886 PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
887     PyObject *s,                /* String to split */
888     PyObject *sep,              /* Separator string, or NULL for whitespace */
889     Py_ssize_t maxsplit         /* Max. number of splits; negative = no limit */
890     );
891 
892 /* Translate a string by applying a character mapping table to it and
893    return the resulting Unicode object.
894 
895    The mapping table must map Unicode ordinal integers to Unicode strings,
896    Unicode ordinal integers or None (causing deletion of the character).
897 
898    Mapping tables may be dictionaries or sequences. Unmapped character
899    ordinals (ones which cause a LookupError) are left untouched and
900    are copied as-is.
901 
902 */
903 
904 PyAPI_FUNC(PyObject *) PyUnicode_Translate(
905     PyObject *str,              /* String to translate */
906     PyObject *table,            /* Mapping table (dictionary or sequence) */
907     const char *errors          /* error handling */
908     );
909 
910 /* Join a sequence of strings using the given separator and return
911    the resulting Unicode string. */
912 
913 PyAPI_FUNC(PyObject*) PyUnicode_Join(
914     PyObject *separator,        /* Separator string */
915     PyObject *seq               /* Sequence of strings to join */
916     );
917 
918 /* Return 1 if substr matches str[start:end] at the given tail end, 0
919    otherwise. NOTE(review): the error return is not stated -- confirm. */
920 
921 PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
922     PyObject *str,              /* String to match against */
923     PyObject *substr,           /* Prefix or suffix string to look for */
924     Py_ssize_t start,           /* Start index */
925     Py_ssize_t end,             /* Stop index */
926     int direction               /* Tail end to match: -1 prefix, +1 suffix */
927     );
928 
929 /* Return the first position of substr in str[start:end] using the
930    given search direction, or -1 if not found. -2 is returned in case
931    an error occurred and an exception is set. */
932 
933 PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
934     PyObject *str,              /* String to search in */
935     PyObject *substr,           /* Substring to find */
936     Py_ssize_t start,           /* Start index */
937     Py_ssize_t end,             /* Stop index */
938     int direction               /* Find direction: +1 forward, -1 backward */
939     );
940 
941 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
942 /* Like PyUnicode_Find, but search for a single character only. */
943 PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
944     PyObject *str,              /* String to search in */
945     Py_UCS4 ch,                 /* Character to find */
946     Py_ssize_t start,           /* Start index */
947     Py_ssize_t end,             /* Stop index */
948     int direction               /* Find direction, as for PyUnicode_Find */
949     );
950 #endif /* !Py_LIMITED_API || Py_LIMITED_API >= 0x03030000 */
951 
952 /* Count the number of occurrences of substr in str[start:end]. */
953 
954 PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
955     PyObject *str,              /* String to search in */
956     PyObject *substr,           /* Substring to count */
957     Py_ssize_t start,           /* Start index */
958     Py_ssize_t end              /* Stop index */
959     );
960 
961 /* Replace at most maxcount occurrences of substr in str with replstr
962    and return the resulting Unicode object. */
963 
964 PyAPI_FUNC(PyObject *) PyUnicode_Replace(
965     PyObject *str,              /* String */
966     PyObject *substr,           /* Substring to find */
967     PyObject *replstr,          /* Replacement string */
968     Py_ssize_t maxcount         /* Max. number of replacements to apply;
969                                    -1 = all */
970     );
971 
972 /* Compare two strings and return -1, 0, 1 for less than, equal,
973    greater than, respectively.
974    Raise an exception and return -1 on error (the same value as "less than"). */
975 
976 PyAPI_FUNC(int) PyUnicode_Compare(
977     PyObject *left,             /* Left string */
978     PyObject *right             /* Right string */
979     );
980 
981 /* Compare a Unicode object with a C string and return -1, 0, 1 for less than,
982    equal, and greater than, respectively.  It is best to pass only
983    ASCII-encoded strings, but the function interprets the input string as
984    ISO-8859-1 if it contains non-ASCII characters.
985    This function does not raise exceptions. */
986 
987 PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
988     PyObject *left,             /* Unicode object */
989     const char *right           /* ASCII-encoded string */
990     );
991 
992 /* Rich-compare two strings and return one of the following:
993 
994    - NULL in case an exception was raised
995    - Py_True or Py_False for successful comparisons
996    - Py_NotImplemented in case the type combination is unknown
997 
998    Possible values for op:
999 
1000      Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1001 
1002 */
1003 
1004 PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
1005     PyObject *left,             /* Left string */
1006     PyObject *right,            /* Right string */
1007     int op                      /* Comparison to perform: one of the op values above */
1008     );
1009 
1010 /* Apply an argument tuple or dictionary to a format string and return
1011    the resulting Unicode string. */
1012 
1013 PyAPI_FUNC(PyObject *) PyUnicode_Format(
1014     PyObject *format,           /* Format string */
1015     PyObject *args              /* Arguments to apply: tuple or dictionary */
1016     );
1017 
1018 /* Check whether element is contained in container and return 1 or 0
1019    accordingly.
1020 
1021    element has to coerce to a one element Unicode string. -1 is
1022    returned in case of an error. */
1023 
1024 PyAPI_FUNC(int) PyUnicode_Contains(
1025     PyObject *container,        /* Container string */
1026     PyObject *element           /* Element string */
1027     );
1028 
1029 /* Check whether s is a valid identifier (return convention not documented here). */
1030 
1031 PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1032 
1033 /* === Characters Type APIs =============================================== */
1034 
1035 #ifndef Py_LIMITED_API  /* cpython/ header is included only outside the limited API */
1036 #  define Py_CPYTHON_UNICODEOBJECT_H
1037 #  include  "cpython/unicodeobject.h"
1038 #  undef Py_CPYTHON_UNICODEOBJECT_H
1039 #endif /* !Py_LIMITED_API */
1040 
1041 #ifdef __cplusplus
1042 }
1043 #endif
1044 #endif /* !Py_UNICODEOBJECT_H */
1045