1 #ifndef Py_UNICODEOBJECT_H 2 #define Py_UNICODEOBJECT_H 3 4 /* 5 6 Unicode implementation based on original code by Fredrik Lundh, 7 modified by Marc-Andre Lemburg (mal@lemburg.com) according to the 8 Unicode Integration Proposal. (See 9 http://www.egenix.com/files/python/unicode-proposal.txt). 10 11 Copyright (c) Corporation for National Research Initiatives. 12 13 14 Original header: 15 -------------------------------------------------------------------- 16 17 * Yet another Unicode string type for Python. This type supports the 18 * 16-bit Basic Multilingual Plane (BMP) only. 19 * 20 * Written by Fredrik Lundh, January 1999. 21 * 22 * Copyright (c) 1999 by Secret Labs AB. 23 * Copyright (c) 1999 by Fredrik Lundh. 24 * 25 * fredrik@pythonware.com 26 * http://www.pythonware.com 27 * 28 * -------------------------------------------------------------------- 29 * This Unicode String Type is 30 * 31 * Copyright (c) 1999 by Secret Labs AB 32 * Copyright (c) 1999 by Fredrik Lundh 33 * 34 * By obtaining, using, and/or copying this software and/or its 35 * associated documentation, you agree that you have read, understood, 36 * and will comply with the following terms and conditions: 37 * 38 * Permission to use, copy, modify, and distribute this software and its 39 * associated documentation for any purpose and without fee is hereby 40 * granted, provided that the above copyright notice appears in all 41 * copies, and that both that copyright notice and this permission notice 42 * appear in supporting documentation, and that the name of Secret Labs 43 * AB or the author not be used in advertising or publicity pertaining to 44 * distribution of the software without specific, written prior 45 * permission. 46 * 47 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 48 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 49 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 50 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 51 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 52 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 53 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 54 * -------------------------------------------------------------------- */ 55 56 /* === Internal API ======================================================= */ 57 58 /* --- Internal Unicode Format -------------------------------------------- */ 59 60 /* Python 3.x requires unicode */ 61 #define Py_USING_UNICODE 62 63 #ifndef SIZEOF_WCHAR_T 64 #error Must define SIZEOF_WCHAR_T 65 #endif 66 67 #define Py_UNICODE_SIZE SIZEOF_WCHAR_T 68 69 /* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE. 70 Otherwise, Unicode strings are stored as UCS-2 (with limited support 71 for UTF-16) */ 72 73 #if Py_UNICODE_SIZE >= 4 74 #define Py_UNICODE_WIDE 75 #endif 76 77 /* Set these flags if the platform has "wchar.h" and the 78 wchar_t type is a 16-bit unsigned type */ 79 /* #define HAVE_WCHAR_H */ 80 /* #define HAVE_USABLE_WCHAR_T */ 81 82 /* If the compiler provides a wchar_t type we try to support it 83 through the interface functions PyUnicode_FromWideChar(), 84 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */ 85 86 #ifdef HAVE_USABLE_WCHAR_T 87 # ifndef HAVE_WCHAR_H 88 # define HAVE_WCHAR_H 89 # endif 90 #endif 91 92 /* Py_UCS4 and Py_UCS2 are typedefs for the respective 93 unicode representations. */ 94 typedef uint32_t Py_UCS4; 95 typedef uint16_t Py_UCS2; 96 typedef uint8_t Py_UCS1; 97 98 #ifdef __cplusplus 99 extern "C" { 100 #endif 101 102 103 PyAPI_DATA(PyTypeObject) PyUnicode_Type; 104 PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; 105 106 #define PyUnicode_Check(op) \ 107 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS) 108 #define PyUnicode_CheckExact(op) Py_IS_TYPE((op), &PyUnicode_Type) 109 110 /* --- Constants ---------------------------------------------------------- */ 111 112 /* This Unicode character will be used as replacement character during 113 decoding if the errors argument is set to "replace". Note: the 114 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in 115 Unicode 3.0. */ 116 117 #define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD) 118 119 /* === Public API ========================================================= */ 120 121 /* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */ 122 PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize( 123 const char *u, /* UTF-8 encoded string */ 124 Py_ssize_t size /* size of buffer */ 125 ); 126 127 /* Similar to PyUnicode_FromUnicode(), but u points to null-terminated 128 UTF-8 encoded bytes. The size is determined with strlen(). */ 129 PyAPI_FUNC(PyObject*) PyUnicode_FromString( 130 const char *u /* UTF-8 encoded string */ 131 ); 132 133 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 134 PyAPI_FUNC(PyObject*) PyUnicode_Substring( 135 PyObject *str, 136 Py_ssize_t start, 137 Py_ssize_t end); 138 #endif 139 140 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 141 /* Copy the string into a UCS4 buffer including the null character if copy_null 142 is set. Return NULL and raise an exception on error. Raise a SystemError if 143 the buffer is smaller than the string. Return buffer on success. 144 145 buflen is the length of the buffer in (Py_UCS4) characters. */ 146 PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4( 147 PyObject *unicode, 148 Py_UCS4* buffer, 149 Py_ssize_t buflen, 150 int copy_null); 151 152 /* Copy the string into a UCS4 buffer. A new buffer is allocated using 153 * PyMem_Malloc; if this fails, NULL is returned with a memory error 154 exception set. */ 155 PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode); 156 #endif 157 158 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 159 /* Get the length of the Unicode object. */ 160 161 PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength( 162 PyObject *unicode 163 ); 164 #endif 165 166 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 167 /* Read a character from the string. */ 168 169 PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar( 170 PyObject *unicode, 171 Py_ssize_t index 172 ); 173 174 /* Write a character to the string. The string must have been created through 175 PyUnicode_New, must not be shared, and must not have been hashed yet. 176 177 Return 0 on success, -1 on error. */ 178 179 PyAPI_FUNC(int) PyUnicode_WriteChar( 180 PyObject *unicode, 181 Py_ssize_t index, 182 Py_UCS4 character 183 ); 184 #endif 185 186 /* Resize a Unicode object. The length is the number of codepoints. 187 188 *unicode is modified to point to the new (resized) object and 0 189 returned on success. 190 191 Try to resize the string in place (which is usually faster than allocating 192 a new string and copy characters), or create a new string. 193 194 Error handling is implemented as follows: an exception is set, -1 195 is returned and *unicode left untouched. 196 197 WARNING: The function doesn't check string content, the result may not be a 198 string in canonical representation. */ 199 200 PyAPI_FUNC(int) PyUnicode_Resize( 201 PyObject **unicode, /* Pointer to the Unicode object */ 202 Py_ssize_t length /* New length */ 203 ); 204 205 /* Decode obj to a Unicode object. 206 207 bytes, bytearray and other bytes-like objects are decoded according to the 208 given encoding and error handler. The encoding and error handler can be 209 NULL to have the interface use UTF-8 and "strict". 210 211 All other objects (including Unicode objects) raise an exception. 212 213 The API returns NULL in case of an error. The caller is responsible 214 for decref'ing the returned objects. 215 216 */ 217 218 PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject( 219 PyObject *obj, /* Object */ 220 const char *encoding, /* encoding */ 221 const char *errors /* error handling */ 222 ); 223 224 /* Copy an instance of a Unicode subtype to a new true Unicode object if 225 necessary. If obj is already a true Unicode object (not a subtype), return 226 the reference with *incremented* refcount. 227 228 The API returns NULL in case of an error. The caller is responsible 229 for decref'ing the returned objects. 230 231 */ 232 233 PyAPI_FUNC(PyObject*) PyUnicode_FromObject( 234 PyObject *obj /* Object */ 235 ); 236 237 PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV( 238 const char *format, /* ASCII-encoded string */ 239 va_list vargs 240 ); 241 PyAPI_FUNC(PyObject *) PyUnicode_FromFormat( 242 const char *format, /* ASCII-encoded string */ 243 ... 244 ); 245 246 PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **); 247 PyAPI_FUNC(PyObject *) PyUnicode_InternFromString( 248 const char *u /* UTF-8 encoded string */ 249 ); 250 251 /* --- wchar_t support for platforms which support it --------------------- */ 252 253 #ifdef HAVE_WCHAR_H 254 255 /* Create a Unicode Object from the wchar_t buffer w of the given 256 size. 257 258 The buffer is copied into the new object. */ 259 260 PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar( 261 const wchar_t *w, /* wchar_t buffer */ 262 Py_ssize_t size /* size of buffer */ 263 ); 264 265 /* Copies the Unicode Object contents into the wchar_t buffer w. At 266 most size wchar_t characters are copied. 267 268 Note that the resulting wchar_t string may or may not be 269 0-terminated. It is the responsibility of the caller to make sure 270 that the wchar_t string is 0-terminated in case this is required by 271 the application. 272 273 Returns the number of wchar_t characters copied (excluding a 274 possibly trailing 0-termination character) or -1 in case of an 275 error. */ 276 277 PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar( 278 PyObject *unicode, /* Unicode object */ 279 wchar_t *w, /* wchar_t buffer */ 280 Py_ssize_t size /* size of buffer */ 281 ); 282 283 /* Convert the Unicode object to a wide character string. The output string 284 always ends with a nul character. If size is not NULL, write the number of 285 wide characters (excluding the null character) into *size. 286 287 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it) 288 on success. On error, returns NULL, *size is undefined and raises a 289 MemoryError. */ 290 291 PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString( 292 PyObject *unicode, /* Unicode object */ 293 Py_ssize_t *size /* number of characters of the result */ 294 ); 295 296 #endif 297 298 /* --- Unicode ordinals --------------------------------------------------- */ 299 300 /* Create a Unicode Object from the given Unicode code point ordinal. 301 302 The ordinal must be in range(0x110000). A ValueError is 303 raised in case it is not. 304 305 */ 306 307 PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal); 308 309 /* === Builtin Codecs ===================================================== 310 311 Many of these APIs take two arguments encoding and errors. These 312 parameters encoding and errors have the same semantics as the ones 313 of the builtin str() API. 314 315 Setting encoding to NULL causes the default encoding (UTF-8) to be used. 316 317 Error handling is set by errors which may also be set to NULL 318 meaning to use the default handling defined for the codec. Default 319 error handling for all builtin codecs is "strict" (ValueErrors are 320 raised). 321 322 The codecs all use a similar interface. Only deviation from the 323 generic ones are documented. 324 325 */ 326 327 /* --- Manage the default encoding ---------------------------------------- */ 328 329 /* Returns "utf-8". */ 330 PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void); 331 332 /* --- Generic Codecs ----------------------------------------------------- */ 333 334 /* Create a Unicode object by decoding the encoded string s of the 335 given size. */ 336 337 PyAPI_FUNC(PyObject*) PyUnicode_Decode( 338 const char *s, /* encoded string */ 339 Py_ssize_t size, /* size of buffer */ 340 const char *encoding, /* encoding */ 341 const char *errors /* error handling */ 342 ); 343 344 /* Decode a Unicode object unicode and return the result as Python 345 object. 346 347 This API is DEPRECATED. The only supported standard encoding is rot13. 348 Use PyCodec_Decode() to decode with rot13 and non-standard codecs 349 that decode from str. */ 350 351 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject( 352 PyObject *unicode, /* Unicode object */ 353 const char *encoding, /* encoding */ 354 const char *errors /* error handling */ 355 ); 356 357 /* Decode a Unicode object unicode and return the result as Unicode 358 object. 359 360 This API is DEPRECATED. The only supported standard encoding is rot13. 361 Use PyCodec_Decode() to decode with rot13 and non-standard codecs 362 that decode from str to str. */ 363 364 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode( 365 PyObject *unicode, /* Unicode object */ 366 const char *encoding, /* encoding */ 367 const char *errors /* error handling */ 368 ); 369 370 /* Encodes a Unicode object and returns the result as Python 371 object. 372 373 This API is DEPRECATED. It is superseded by PyUnicode_AsEncodedString() 374 since all standard encodings (except rot13) encode str to bytes. 375 Use PyCodec_Encode() for encoding with rot13 and non-standard codecs 376 that encode form str to non-bytes. */ 377 378 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject( 379 PyObject *unicode, /* Unicode object */ 380 const char *encoding, /* encoding */ 381 const char *errors /* error handling */ 382 ); 383 384 /* Encodes a Unicode object and returns the result as Python string 385 object. */ 386 387 PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString( 388 PyObject *unicode, /* Unicode object */ 389 const char *encoding, /* encoding */ 390 const char *errors /* error handling */ 391 ); 392 393 /* Encodes a Unicode object and returns the result as Unicode 394 object. 395 396 This API is DEPRECATED. The only supported standard encodings is rot13. 397 Use PyCodec_Encode() to encode with rot13 and non-standard codecs 398 that encode from str to str. */ 399 400 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode( 401 PyObject *unicode, /* Unicode object */ 402 const char *encoding, /* encoding */ 403 const char *errors /* error handling */ 404 ); 405 406 /* Build an encoding map. */ 407 408 PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap( 409 PyObject* string /* 256 character map */ 410 ); 411 412 /* --- UTF-7 Codecs ------------------------------------------------------- */ 413 414 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7( 415 const char *string, /* UTF-7 encoded string */ 416 Py_ssize_t length, /* size of string */ 417 const char *errors /* error handling */ 418 ); 419 420 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful( 421 const char *string, /* UTF-7 encoded string */ 422 Py_ssize_t length, /* size of string */ 423 const char *errors, /* error handling */ 424 Py_ssize_t *consumed /* bytes consumed */ 425 ); 426 427 /* --- UTF-8 Codecs ------------------------------------------------------- */ 428 429 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8( 430 const char *string, /* UTF-8 encoded string */ 431 Py_ssize_t length, /* size of string */ 432 const char *errors /* error handling */ 433 ); 434 435 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful( 436 const char *string, /* UTF-8 encoded string */ 437 Py_ssize_t length, /* size of string */ 438 const char *errors, /* error handling */ 439 Py_ssize_t *consumed /* bytes consumed */ 440 ); 441 442 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String( 443 PyObject *unicode /* Unicode object */ 444 ); 445 446 /* Returns a pointer to the default encoding (UTF-8) of the 447 Unicode object unicode and the size of the encoded representation 448 in bytes stored in *size. 449 450 In case of an error, no *size is set. 451 452 This function caches the UTF-8 encoded string in the unicodeobject 453 and subsequent calls will return the same string. The memory is released 454 when the unicodeobject is deallocated. 455 */ 456 457 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030A0000 458 PyAPI_FUNC(const char *) PyUnicode_AsUTF8AndSize( 459 PyObject *unicode, 460 Py_ssize_t *size); 461 #endif 462 463 /* --- UTF-32 Codecs ------------------------------------------------------ */ 464 465 /* Decodes length bytes from a UTF-32 encoded buffer string and returns 466 the corresponding Unicode object. 467 468 errors (if non-NULL) defines the error handling. It defaults 469 to "strict". 470 471 If byteorder is non-NULL, the decoder starts decoding using the 472 given byte order: 473 474 *byteorder == -1: little endian 475 *byteorder == 0: native order 476 *byteorder == 1: big endian 477 478 In native mode, the first four bytes of the stream are checked for a 479 BOM mark. If found, the BOM mark is analysed, the byte order 480 adjusted and the BOM skipped. In the other modes, no BOM mark 481 interpretation is done. After completion, *byteorder is set to the 482 current byte order at the end of input data. 483 484 If byteorder is NULL, the codec starts in native order mode. 485 486 */ 487 488 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32( 489 const char *string, /* UTF-32 encoded string */ 490 Py_ssize_t length, /* size of string */ 491 const char *errors, /* error handling */ 492 int *byteorder /* pointer to byteorder to use 493 0=native;-1=LE,1=BE; updated on 494 exit */ 495 ); 496 497 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful( 498 const char *string, /* UTF-32 encoded string */ 499 Py_ssize_t length, /* size of string */ 500 const char *errors, /* error handling */ 501 int *byteorder, /* pointer to byteorder to use 502 0=native;-1=LE,1=BE; updated on 503 exit */ 504 Py_ssize_t *consumed /* bytes consumed */ 505 ); 506 507 /* Returns a Python string using the UTF-32 encoding in native byte 508 order. The string always starts with a BOM mark. */ 509 510 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String( 511 PyObject *unicode /* Unicode object */ 512 ); 513 514 /* Returns a Python string object holding the UTF-32 encoded value of 515 the Unicode data. 516 517 If byteorder is not 0, output is written according to the following 518 byte order: 519 520 byteorder == -1: little endian 521 byteorder == 0: native byte order (writes a BOM mark) 522 byteorder == 1: big endian 523 524 If byteorder is 0, the output string will always start with the 525 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 526 prepended. 527 528 */ 529 530 /* --- UTF-16 Codecs ------------------------------------------------------ */ 531 532 /* Decodes length bytes from a UTF-16 encoded buffer string and returns 533 the corresponding Unicode object. 534 535 errors (if non-NULL) defines the error handling. It defaults 536 to "strict". 537 538 If byteorder is non-NULL, the decoder starts decoding using the 539 given byte order: 540 541 *byteorder == -1: little endian 542 *byteorder == 0: native order 543 *byteorder == 1: big endian 544 545 In native mode, the first two bytes of the stream are checked for a 546 BOM mark. If found, the BOM mark is analysed, the byte order 547 adjusted and the BOM skipped. In the other modes, no BOM mark 548 interpretation is done. After completion, *byteorder is set to the 549 current byte order at the end of input data. 550 551 If byteorder is NULL, the codec starts in native order mode. 552 553 */ 554 555 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16( 556 const char *string, /* UTF-16 encoded string */ 557 Py_ssize_t length, /* size of string */ 558 const char *errors, /* error handling */ 559 int *byteorder /* pointer to byteorder to use 560 0=native;-1=LE,1=BE; updated on 561 exit */ 562 ); 563 564 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful( 565 const char *string, /* UTF-16 encoded string */ 566 Py_ssize_t length, /* size of string */ 567 const char *errors, /* error handling */ 568 int *byteorder, /* pointer to byteorder to use 569 0=native;-1=LE,1=BE; updated on 570 exit */ 571 Py_ssize_t *consumed /* bytes consumed */ 572 ); 573 574 /* Returns a Python string using the UTF-16 encoding in native byte 575 order. The string always starts with a BOM mark. */ 576 577 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String( 578 PyObject *unicode /* Unicode object */ 579 ); 580 581 /* --- Unicode-Escape Codecs ---------------------------------------------- */ 582 583 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape( 584 const char *string, /* Unicode-Escape encoded string */ 585 Py_ssize_t length, /* size of string */ 586 const char *errors /* error handling */ 587 ); 588 589 PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString( 590 PyObject *unicode /* Unicode object */ 591 ); 592 593 /* --- Raw-Unicode-Escape Codecs ------------------------------------------ */ 594 595 PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape( 596 const char *string, /* Raw-Unicode-Escape encoded string */ 597 Py_ssize_t length, /* size of string */ 598 const char *errors /* error handling */ 599 ); 600 601 PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString( 602 PyObject *unicode /* Unicode object */ 603 ); 604 605 /* --- Latin-1 Codecs ----------------------------------------------------- 606 607 Note: Latin-1 corresponds to the first 256 Unicode ordinals. */ 608 609 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1( 610 const char *string, /* Latin-1 encoded string */ 611 Py_ssize_t length, /* size of string */ 612 const char *errors /* error handling */ 613 ); 614 615 PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String( 616 PyObject *unicode /* Unicode object */ 617 ); 618 619 /* --- ASCII Codecs ------------------------------------------------------- 620 621 Only 7-bit ASCII data is expected. All other codes generate errors. 622 623 */ 624 625 PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII( 626 const char *string, /* ASCII encoded string */ 627 Py_ssize_t length, /* size of string */ 628 const char *errors /* error handling */ 629 ); 630 631 PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString( 632 PyObject *unicode /* Unicode object */ 633 ); 634 635 /* --- Character Map Codecs ----------------------------------------------- 636 637 This codec uses mappings to encode and decode characters. 638 639 Decoding mappings must map byte ordinals (integers in the range from 0 to 640 255) to Unicode strings, integers (which are then interpreted as Unicode 641 ordinals) or None. Unmapped data bytes (ones which cause a LookupError) 642 as well as mapped to None, 0xFFFE or '\ufffe' are treated as "undefined 643 mapping" and cause an error. 644 645 Encoding mappings must map Unicode ordinal integers to bytes objects, 646 integers in the range from 0 to 255 or None. Unmapped character 647 ordinals (ones which cause a LookupError) as well as mapped to 648 None are treated as "undefined mapping" and cause an error. 649 650 */ 651 652 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap( 653 const char *string, /* Encoded string */ 654 Py_ssize_t length, /* size of string */ 655 PyObject *mapping, /* decoding mapping */ 656 const char *errors /* error handling */ 657 ); 658 659 PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString( 660 PyObject *unicode, /* Unicode object */ 661 PyObject *mapping /* encoding mapping */ 662 ); 663 664 /* --- MBCS codecs for Windows -------------------------------------------- */ 665 666 #ifdef MS_WINDOWS 667 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS( 668 const char *string, /* MBCS encoded string */ 669 Py_ssize_t length, /* size of string */ 670 const char *errors /* error handling */ 671 ); 672 673 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful( 674 const char *string, /* MBCS encoded string */ 675 Py_ssize_t length, /* size of string */ 676 const char *errors, /* error handling */ 677 Py_ssize_t *consumed /* bytes consumed */ 678 ); 679 680 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 681 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful( 682 int code_page, /* code page number */ 683 const char *string, /* encoded string */ 684 Py_ssize_t length, /* size of string */ 685 const char *errors, /* error handling */ 686 Py_ssize_t *consumed /* bytes consumed */ 687 ); 688 #endif 689 690 PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( 691 PyObject *unicode /* Unicode object */ 692 ); 693 694 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 695 PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage( 696 int code_page, /* code page number */ 697 PyObject *unicode, /* Unicode object */ 698 const char *errors /* error handling */ 699 ); 700 #endif 701 702 #endif /* MS_WINDOWS */ 703 704 /* --- Locale encoding --------------------------------------------------- */ 705 706 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 707 /* Decode a string from the current locale encoding. The decoder is strict if 708 *surrogateescape* is equal to zero, otherwise it uses the 'surrogateescape' 709 error handler (PEP 383) to escape undecodable bytes. If a byte sequence can 710 be decoded as a surrogate character and *surrogateescape* is not equal to 711 zero, the byte sequence is escaped using the 'surrogateescape' error handler 712 instead of being decoded. *str* must end with a null character but cannot 713 contain embedded null characters. */ 714 715 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize( 716 const char *str, 717 Py_ssize_t len, 718 const char *errors); 719 720 /* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string 721 length using strlen(). */ 722 723 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale( 724 const char *str, 725 const char *errors); 726 727 /* Encode a Unicode object to the current locale encoding. The encoder is 728 strict is *surrogateescape* is equal to zero, otherwise the 729 "surrogateescape" error handler is used. Return a bytes object. The string 730 cannot contain embedded null characters. */ 731 732 PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale( 733 PyObject *unicode, 734 const char *errors 735 ); 736 #endif 737 738 /* --- File system encoding ---------------------------------------------- */ 739 740 /* ParseTuple converter: encode str objects to bytes using 741 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */ 742 743 PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*); 744 745 /* ParseTuple converter: decode bytes objects to unicode using 746 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */ 747 748 PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*); 749 750 /* Decode a null-terminated string from the Python filesystem encoding 751 and error handler. 752 753 If the string length is known, use PyUnicode_DecodeFSDefaultAndSize(). */ 754 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault( 755 const char *s /* encoded string */ 756 ); 757 758 /* Decode a string from the Python filesystem encoding and error handler. */ 759 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize( 760 const char *s, /* encoded string */ 761 Py_ssize_t size /* size */ 762 ); 763 764 /* Encode a Unicode object to the Python filesystem encoding and error handler. 765 Return bytes. */ 766 PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault( 767 PyObject *unicode 768 ); 769 770 /* --- Methods & Slots ---------------------------------------------------- 771 772 These are capable of handling Unicode objects and strings on input 773 (we refer to them as strings in the descriptions) and return 774 Unicode objects or integers as appropriate. */ 775 776 /* Concat two strings giving a new Unicode string. */ 777 778 PyAPI_FUNC(PyObject*) PyUnicode_Concat( 779 PyObject *left, /* Left string */ 780 PyObject *right /* Right string */ 781 ); 782 783 /* Concat two strings and put the result in *pleft 784 (sets *pleft to NULL on error) */ 785 786 PyAPI_FUNC(void) PyUnicode_Append( 787 PyObject **pleft, /* Pointer to left string */ 788 PyObject *right /* Right string */ 789 ); 790 791 /* Concat two strings, put the result in *pleft and drop the right object 792 (sets *pleft to NULL on error) */ 793 794 PyAPI_FUNC(void) PyUnicode_AppendAndDel( 795 PyObject **pleft, /* Pointer to left string */ 796 PyObject *right /* Right string */ 797 ); 798 799 /* Split a string giving a list of Unicode strings. 800 801 If sep is NULL, splitting will be done at all whitespace 802 substrings. Otherwise, splits occur at the given separator. 803 804 At most maxsplit splits will be done. If negative, no limit is set. 805 806 Separators are not included in the resulting list. 807 808 */ 809 810 PyAPI_FUNC(PyObject*) PyUnicode_Split( 811 PyObject *s, /* String to split */ 812 PyObject *sep, /* String separator */ 813 Py_ssize_t maxsplit /* Maxsplit count */ 814 ); 815 816 /* Dito, but split at line breaks. 817 818 CRLF is considered to be one line break. Line breaks are not 819 included in the resulting list. */ 820 821 PyAPI_FUNC(PyObject*) PyUnicode_Splitlines( 822 PyObject *s, /* String to split */ 823 int keepends /* If true, line end markers are included */ 824 ); 825 826 /* Partition a string using a given separator. */ 827 828 PyAPI_FUNC(PyObject*) PyUnicode_Partition( 829 PyObject *s, /* String to partition */ 830 PyObject *sep /* String separator */ 831 ); 832 833 /* Partition a string using a given separator, searching from the end of the 834 string. */ 835 836 PyAPI_FUNC(PyObject*) PyUnicode_RPartition( 837 PyObject *s, /* String to partition */ 838 PyObject *sep /* String separator */ 839 ); 840 841 /* Split a string giving a list of Unicode strings. 842 843 If sep is NULL, splitting will be done at all whitespace 844 substrings. Otherwise, splits occur at the given separator. 845 846 At most maxsplit splits will be done. But unlike PyUnicode_Split 847 PyUnicode_RSplit splits from the end of the string. If negative, 848 no limit is set. 849 850 Separators are not included in the resulting list. 851 852 */ 853 854 PyAPI_FUNC(PyObject*) PyUnicode_RSplit( 855 PyObject *s, /* String to split */ 856 PyObject *sep, /* String separator */ 857 Py_ssize_t maxsplit /* Maxsplit count */ 858 ); 859 860 /* Translate a string by applying a character mapping table to it and 861 return the resulting Unicode object. 862 863 The mapping table must map Unicode ordinal integers to Unicode strings, 864 Unicode ordinal integers or None (causing deletion of the character). 865 866 Mapping tables may be dictionaries or sequences. Unmapped character 867 ordinals (ones which cause a LookupError) are left untouched and 868 are copied as-is. 869 870 */ 871 872 PyAPI_FUNC(PyObject *) PyUnicode_Translate( 873 PyObject *str, /* String */ 874 PyObject *table, /* Translate table */ 875 const char *errors /* error handling */ 876 ); 877 878 /* Join a sequence of strings using the given separator and return 879 the resulting Unicode string. */ 880 881 PyAPI_FUNC(PyObject*) PyUnicode_Join( 882 PyObject *separator, /* Separator string */ 883 PyObject *seq /* Sequence object */ 884 ); 885 886 /* Return 1 if substr matches str[start:end] at the given tail end, 0 887 otherwise. */ 888 889 PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch( 890 PyObject *str, /* String */ 891 PyObject *substr, /* Prefix or Suffix string */ 892 Py_ssize_t start, /* Start index */ 893 Py_ssize_t end, /* Stop index */ 894 int direction /* Tail end: -1 prefix, +1 suffix */ 895 ); 896 897 /* Return the first position of substr in str[start:end] using the 898 given search direction or -1 if not found. -2 is returned in case 899 an error occurred and an exception is set. */ 900 901 PyAPI_FUNC(Py_ssize_t) PyUnicode_Find( 902 PyObject *str, /* String */ 903 PyObject *substr, /* Substring to find */ 904 Py_ssize_t start, /* Start index */ 905 Py_ssize_t end, /* Stop index */ 906 int direction /* Find direction: +1 forward, -1 backward */ 907 ); 908 909 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 910 /* Like PyUnicode_Find, but search for single character only. */ 911 PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar( 912 PyObject *str, 913 Py_UCS4 ch, 914 Py_ssize_t start, 915 Py_ssize_t end, 916 int direction 917 ); 918 #endif 919 920 /* Count the number of occurrences of substr in str[start:end]. */ 921 922 PyAPI_FUNC(Py_ssize_t) PyUnicode_Count( 923 PyObject *str, /* String */ 924 PyObject *substr, /* Substring to count */ 925 Py_ssize_t start, /* Start index */ 926 Py_ssize_t end /* Stop index */ 927 ); 928 929 /* Replace at most maxcount occurrences of substr in str with replstr 930 and return the resulting Unicode object. */ 931 932 PyAPI_FUNC(PyObject *) PyUnicode_Replace( 933 PyObject *str, /* String */ 934 PyObject *substr, /* Substring to find */ 935 PyObject *replstr, /* Substring to replace */ 936 Py_ssize_t maxcount /* Max. number of replacements to apply; 937 -1 = all */ 938 ); 939 940 /* Compare two strings and return -1, 0, 1 for less than, equal, 941 greater than resp. 942 Raise an exception and return -1 on error. */ 943 944 PyAPI_FUNC(int) PyUnicode_Compare( 945 PyObject *left, /* Left string */ 946 PyObject *right /* Right string */ 947 ); 948 949 /* Compare a Unicode object with C string and return -1, 0, 1 for less than, 950 equal, and greater than, respectively. It is best to pass only 951 ASCII-encoded strings, but the function interprets the input string as 952 ISO-8859-1 if it contains non-ASCII characters. 953 This function does not raise exceptions. */ 954 955 PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString( 956 PyObject *left, 957 const char *right /* ASCII-encoded string */ 958 ); 959 960 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030D0000 961 /* Compare a Unicode object with UTF-8 encoded C string. 962 Return 1 if they are equal, or 0 otherwise. 963 This function does not raise exceptions. */ 964 965 PyAPI_FUNC(int) PyUnicode_EqualToUTF8(PyObject *, const char *); 966 PyAPI_FUNC(int) PyUnicode_EqualToUTF8AndSize(PyObject *, const char *, Py_ssize_t); 967 #endif 968 969 /* Rich compare two strings and return one of the following: 970 971 - NULL in case an exception was raised 972 - Py_True or Py_False for successful comparisons 973 - Py_NotImplemented in case the type combination is unknown 974 975 Possible values for op: 976 977 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE 978 979 */ 980 981 PyAPI_FUNC(PyObject *) PyUnicode_RichCompare( 982 PyObject *left, /* Left string */ 983 PyObject *right, /* Right string */ 984 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */ 985 ); 986 987 /* Apply an argument tuple or dictionary to a format string and return 988 the resulting Unicode string. */ 989 990 PyAPI_FUNC(PyObject *) PyUnicode_Format( 991 PyObject *format, /* Format string */ 992 PyObject *args /* Argument tuple or dictionary */ 993 ); 994 995 /* Checks whether element is contained in container and return 1/0 996 accordingly. 997 998 element has to coerce to a one element Unicode string. -1 is 999 returned in case of an error. */ 1000 1001 PyAPI_FUNC(int) PyUnicode_Contains( 1002 PyObject *container, /* Container string */ 1003 PyObject *element /* Element string */ 1004 ); 1005 1006 /* Checks whether argument is a valid identifier. */ 1007 1008 PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s); 1009 1010 /* === Characters Type APIs =============================================== */ 1011 1012 #ifndef Py_LIMITED_API 1013 # define Py_CPYTHON_UNICODEOBJECT_H 1014 # include "cpython/unicodeobject.h" 1015 # undef Py_CPYTHON_UNICODEOBJECT_H 1016 #endif 1017 1018 #ifdef __cplusplus 1019 } 1020 #endif 1021 #endif /* !Py_UNICODEOBJECT_H */ 1022