1 #ifndef Py_UNICODEOBJECT_H 2 #define Py_UNICODEOBJECT_H 3 4 #include <stdarg.h> 5 6 /* 7 8 Unicode implementation based on original code by Fredrik Lundh, 9 modified by Marc-Andre Lemburg (mal@lemburg.com) according to the 10 Unicode Integration Proposal. (See 11 http://www.egenix.com/files/python/unicode-proposal.txt). 12 13 Copyright (c) Corporation for National Research Initiatives. 14 15 16 Original header: 17 -------------------------------------------------------------------- 18 19 * Yet another Unicode string type for Python. This type supports the 20 * 16-bit Basic Multilingual Plane (BMP) only. 21 * 22 * Written by Fredrik Lundh, January 1999. 23 * 24 * Copyright (c) 1999 by Secret Labs AB. 25 * Copyright (c) 1999 by Fredrik Lundh. 26 * 27 * fredrik@pythonware.com 28 * http://www.pythonware.com 29 * 30 * -------------------------------------------------------------------- 31 * This Unicode String Type is 32 * 33 * Copyright (c) 1999 by Secret Labs AB 34 * Copyright (c) 1999 by Fredrik Lundh 35 * 36 * By obtaining, using, and/or copying this software and/or its 37 * associated documentation, you agree that you have read, understood, 38 * and will comply with the following terms and conditions: 39 * 40 * Permission to use, copy, modify, and distribute this software and its 41 * associated documentation for any purpose and without fee is hereby 42 * granted, provided that the above copyright notice appears in all 43 * copies, and that both that copyright notice and this permission notice 44 * appear in supporting documentation, and that the name of Secret Labs 45 * AB or the author not be used in advertising or publicity pertaining to 46 * distribution of the software without specific, written prior 47 * permission. 48 * 49 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 56 * -------------------------------------------------------------------- */ 57 58 #include <ctype.h> 59 60 /* === Internal API ======================================================= */ 61 62 /* --- Internal Unicode Format -------------------------------------------- */ 63 64 /* Python 3.x requires unicode */ 65 #define Py_USING_UNICODE 66 67 #ifndef SIZEOF_WCHAR_T 68 #error Must define SIZEOF_WCHAR_T 69 #endif 70 71 #define Py_UNICODE_SIZE SIZEOF_WCHAR_T 72 73 /* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE. 74 Otherwise, Unicode strings are stored as UCS-2 (with limited support 75 for UTF-16) */ 76 77 #if Py_UNICODE_SIZE >= 4 78 #define Py_UNICODE_WIDE 79 #endif 80 81 /* Set these flags if the platform has "wchar.h" and the 82 wchar_t type is a 16-bit unsigned type */ 83 /* #define HAVE_WCHAR_H */ 84 /* #define HAVE_USABLE_WCHAR_T */ 85 86 /* If the compiler provides a wchar_t type we try to support it 87 through the interface functions PyUnicode_FromWideChar(), 88 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */ 89 90 #ifdef HAVE_USABLE_WCHAR_T 91 # ifndef HAVE_WCHAR_H 92 # define HAVE_WCHAR_H 93 # endif 94 #endif 95 96 #ifdef HAVE_WCHAR_H 97 # include <wchar.h> 98 #endif 99 100 /* Py_UCS4 and Py_UCS2 are typedefs for the respective 101 unicode representations. */ 102 typedef uint32_t Py_UCS4; 103 typedef uint16_t Py_UCS2; 104 typedef uint8_t Py_UCS1; 105 106 #ifdef __cplusplus 107 extern "C" { 108 #endif 109 110 111 PyAPI_DATA(PyTypeObject) PyUnicode_Type; 112 PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; 113 114 #define PyUnicode_Check(op) \ 115 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS) 116 #define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type) 117 118 /* --- Constants ---------------------------------------------------------- */ 119 120 /* This Unicode character will be used as replacement character during 121 decoding if the errors argument is set to "replace". Note: the 122 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in 123 Unicode 3.0. */ 124 125 #define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD) 126 127 /* === Public API ========================================================= */ 128 129 /* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */ 130 PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize( 131 const char *u, /* UTF-8 encoded string */ 132 Py_ssize_t size /* size of buffer */ 133 ); 134 135 /* Similar to PyUnicode_FromUnicode(), but u points to null-terminated 136 UTF-8 encoded bytes. The size is determined with strlen(). */ 137 PyAPI_FUNC(PyObject*) PyUnicode_FromString( 138 const char *u /* UTF-8 encoded string */ 139 ); 140 141 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 142 PyAPI_FUNC(PyObject*) PyUnicode_Substring( 143 PyObject *str, 144 Py_ssize_t start, 145 Py_ssize_t end); 146 #endif 147 148 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 149 /* Copy the string into a UCS4 buffer including the null character if copy_null 150 is set. Return NULL and raise an exception on error. Raise a SystemError if 151 the buffer is smaller than the string. Return buffer on success. 152 153 buflen is the length of the buffer in (Py_UCS4) characters. */ 154 PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4( 155 PyObject *unicode, 156 Py_UCS4* buffer, 157 Py_ssize_t buflen, 158 int copy_null); 159 160 /* Copy the string into a UCS4 buffer. A new buffer is allocated using 161 * PyMem_Malloc; if this fails, NULL is returned with a memory error 162 exception set. */ 163 PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode); 164 #endif 165 166 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 167 /* Get the length of the Unicode object. */ 168 169 PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength( 170 PyObject *unicode 171 ); 172 #endif 173 174 /* Get the number of Py_UNICODE units in the 175 string representation. */ 176 177 Py_DEPRECATED(3.3) PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize( 178 PyObject *unicode /* Unicode object */ 179 ); 180 181 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 182 /* Read a character from the string. */ 183 184 PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar( 185 PyObject *unicode, 186 Py_ssize_t index 187 ); 188 189 /* Write a character to the string. The string must have been created through 190 PyUnicode_New, must not be shared, and must not have been hashed yet. 191 192 Return 0 on success, -1 on error. */ 193 194 PyAPI_FUNC(int) PyUnicode_WriteChar( 195 PyObject *unicode, 196 Py_ssize_t index, 197 Py_UCS4 character 198 ); 199 #endif 200 201 /* Resize a Unicode object. The length is the number of characters, except 202 if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length 203 is the number of Py_UNICODE characters. 204 205 *unicode is modified to point to the new (resized) object and 0 206 returned on success. 207 208 Try to resize the string in place (which is usually faster than allocating 209 a new string and copy characters), or create a new string. 210 211 Error handling is implemented as follows: an exception is set, -1 212 is returned and *unicode left untouched. 213 214 WARNING: The function doesn't check string content, the result may not be a 215 string in canonical representation. */ 216 217 PyAPI_FUNC(int) PyUnicode_Resize( 218 PyObject **unicode, /* Pointer to the Unicode object */ 219 Py_ssize_t length /* New length */ 220 ); 221 222 /* Decode obj to a Unicode object. 223 224 bytes, bytearray and other bytes-like objects are decoded according to the 225 given encoding and error handler. The encoding and error handler can be 226 NULL to have the interface use UTF-8 and "strict". 227 228 All other objects (including Unicode objects) raise an exception. 229 230 The API returns NULL in case of an error. The caller is responsible 231 for decref'ing the returned objects. 232 233 */ 234 235 PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject( 236 PyObject *obj, /* Object */ 237 const char *encoding, /* encoding */ 238 const char *errors /* error handling */ 239 ); 240 241 /* Copy an instance of a Unicode subtype to a new true Unicode object if 242 necessary. If obj is already a true Unicode object (not a subtype), return 243 the reference with *incremented* refcount. 244 245 The API returns NULL in case of an error. The caller is responsible 246 for decref'ing the returned objects. 247 248 */ 249 250 PyAPI_FUNC(PyObject*) PyUnicode_FromObject( 251 PyObject *obj /* Object */ 252 ); 253 254 PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV( 255 const char *format, /* ASCII-encoded string */ 256 va_list vargs 257 ); 258 PyAPI_FUNC(PyObject *) PyUnicode_FromFormat( 259 const char *format, /* ASCII-encoded string */ 260 ... 261 ); 262 263 PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **); 264 PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **); 265 PyAPI_FUNC(PyObject *) PyUnicode_InternFromString( 266 const char *u /* UTF-8 encoded string */ 267 ); 268 269 /* Use only if you know it's a string */ 270 #define PyUnicode_CHECK_INTERNED(op) \ 271 (((PyASCIIObject *)(op))->state.interned) 272 273 /* --- wchar_t support for platforms which support it --------------------- */ 274 275 #ifdef HAVE_WCHAR_H 276 277 /* Create a Unicode Object from the wchar_t buffer w of the given 278 size. 279 280 The buffer is copied into the new object. */ 281 282 PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar( 283 const wchar_t *w, /* wchar_t buffer */ 284 Py_ssize_t size /* size of buffer */ 285 ); 286 287 /* Copies the Unicode Object contents into the wchar_t buffer w. At 288 most size wchar_t characters are copied. 289 290 Note that the resulting wchar_t string may or may not be 291 0-terminated. It is the responsibility of the caller to make sure 292 that the wchar_t string is 0-terminated in case this is required by 293 the application. 294 295 Returns the number of wchar_t characters copied (excluding a 296 possibly trailing 0-termination character) or -1 in case of an 297 error. */ 298 299 PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar( 300 PyObject *unicode, /* Unicode object */ 301 wchar_t *w, /* wchar_t buffer */ 302 Py_ssize_t size /* size of buffer */ 303 ); 304 305 /* Convert the Unicode object to a wide character string. The output string 306 always ends with a nul character. If size is not NULL, write the number of 307 wide characters (excluding the null character) into *size. 308 309 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it) 310 on success. On error, returns NULL, *size is undefined and raises a 311 MemoryError. */ 312 313 PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString( 314 PyObject *unicode, /* Unicode object */ 315 Py_ssize_t *size /* number of characters of the result */ 316 ); 317 318 #endif 319 320 /* --- Unicode ordinals --------------------------------------------------- */ 321 322 /* Create a Unicode Object from the given Unicode code point ordinal. 323 324 The ordinal must be in range(0x110000). A ValueError is 325 raised in case it is not. 326 327 */ 328 329 PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal); 330 331 /* --- Free-list management ----------------------------------------------- */ 332 333 /* Clear the free list used by the Unicode implementation. 334 335 This can be used to release memory used for objects on the free 336 list back to the Python memory allocator. 337 338 */ 339 340 PyAPI_FUNC(int) PyUnicode_ClearFreeList(void); 341 342 /* === Builtin Codecs ===================================================== 343 344 Many of these APIs take two arguments encoding and errors. These 345 parameters encoding and errors have the same semantics as the ones 346 of the builtin str() API. 347 348 Setting encoding to NULL causes the default encoding (UTF-8) to be used. 349 350 Error handling is set by errors which may also be set to NULL 351 meaning to use the default handling defined for the codec. Default 352 error handling for all builtin codecs is "strict" (ValueErrors are 353 raised). 354 355 The codecs all use a similar interface. Only deviation from the 356 generic ones are documented. 357 358 */ 359 360 /* --- Manage the default encoding ---------------------------------------- */ 361 362 /* Returns "utf-8". */ 363 PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void); 364 365 /* --- Generic Codecs ----------------------------------------------------- */ 366 367 /* Create a Unicode object by decoding the encoded string s of the 368 given size. */ 369 370 PyAPI_FUNC(PyObject*) PyUnicode_Decode( 371 const char *s, /* encoded string */ 372 Py_ssize_t size, /* size of buffer */ 373 const char *encoding, /* encoding */ 374 const char *errors /* error handling */ 375 ); 376 377 /* Decode a Unicode object unicode and return the result as Python 378 object. 379 380 This API is DEPRECATED. The only supported standard encoding is rot13. 381 Use PyCodec_Decode() to decode with rot13 and non-standard codecs 382 that decode from str. */ 383 384 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject( 385 PyObject *unicode, /* Unicode object */ 386 const char *encoding, /* encoding */ 387 const char *errors /* error handling */ 388 ); 389 390 /* Decode a Unicode object unicode and return the result as Unicode 391 object. 392 393 This API is DEPRECATED. The only supported standard encoding is rot13. 394 Use PyCodec_Decode() to decode with rot13 and non-standard codecs 395 that decode from str to str. */ 396 397 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode( 398 PyObject *unicode, /* Unicode object */ 399 const char *encoding, /* encoding */ 400 const char *errors /* error handling */ 401 ); 402 403 /* Encodes a Unicode object and returns the result as Python 404 object. 405 406 This API is DEPRECATED. It is superseded by PyUnicode_AsEncodedString() 407 since all standard encodings (except rot13) encode str to bytes. 408 Use PyCodec_Encode() for encoding with rot13 and non-standard codecs 409 that encode form str to non-bytes. */ 410 411 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject( 412 PyObject *unicode, /* Unicode object */ 413 const char *encoding, /* encoding */ 414 const char *errors /* error handling */ 415 ); 416 417 /* Encodes a Unicode object and returns the result as Python string 418 object. */ 419 420 PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString( 421 PyObject *unicode, /* Unicode object */ 422 const char *encoding, /* encoding */ 423 const char *errors /* error handling */ 424 ); 425 426 /* Encodes a Unicode object and returns the result as Unicode 427 object. 428 429 This API is DEPRECATED. The only supported standard encodings is rot13. 430 Use PyCodec_Encode() to encode with rot13 and non-standard codecs 431 that encode from str to str. */ 432 433 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode( 434 PyObject *unicode, /* Unicode object */ 435 const char *encoding, /* encoding */ 436 const char *errors /* error handling */ 437 ); 438 439 /* Build an encoding map. */ 440 441 PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap( 442 PyObject* string /* 256 character map */ 443 ); 444 445 /* --- UTF-7 Codecs ------------------------------------------------------- */ 446 447 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7( 448 const char *string, /* UTF-7 encoded string */ 449 Py_ssize_t length, /* size of string */ 450 const char *errors /* error handling */ 451 ); 452 453 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful( 454 const char *string, /* UTF-7 encoded string */ 455 Py_ssize_t length, /* size of string */ 456 const char *errors, /* error handling */ 457 Py_ssize_t *consumed /* bytes consumed */ 458 ); 459 460 /* --- UTF-8 Codecs ------------------------------------------------------- */ 461 462 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8( 463 const char *string, /* UTF-8 encoded string */ 464 Py_ssize_t length, /* size of string */ 465 const char *errors /* error handling */ 466 ); 467 468 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful( 469 const char *string, /* UTF-8 encoded string */ 470 Py_ssize_t length, /* size of string */ 471 const char *errors, /* error handling */ 472 Py_ssize_t *consumed /* bytes consumed */ 473 ); 474 475 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String( 476 PyObject *unicode /* Unicode object */ 477 ); 478 479 /* --- UTF-32 Codecs ------------------------------------------------------ */ 480 481 /* Decodes length bytes from a UTF-32 encoded buffer string and returns 482 the corresponding Unicode object. 483 484 errors (if non-NULL) defines the error handling. It defaults 485 to "strict". 486 487 If byteorder is non-NULL, the decoder starts decoding using the 488 given byte order: 489 490 *byteorder == -1: little endian 491 *byteorder == 0: native order 492 *byteorder == 1: big endian 493 494 In native mode, the first four bytes of the stream are checked for a 495 BOM mark. If found, the BOM mark is analysed, the byte order 496 adjusted and the BOM skipped. In the other modes, no BOM mark 497 interpretation is done. After completion, *byteorder is set to the 498 current byte order at the end of input data. 499 500 If byteorder is NULL, the codec starts in native order mode. 501 502 */ 503 504 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32( 505 const char *string, /* UTF-32 encoded string */ 506 Py_ssize_t length, /* size of string */ 507 const char *errors, /* error handling */ 508 int *byteorder /* pointer to byteorder to use 509 0=native;-1=LE,1=BE; updated on 510 exit */ 511 ); 512 513 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful( 514 const char *string, /* UTF-32 encoded string */ 515 Py_ssize_t length, /* size of string */ 516 const char *errors, /* error handling */ 517 int *byteorder, /* pointer to byteorder to use 518 0=native;-1=LE,1=BE; updated on 519 exit */ 520 Py_ssize_t *consumed /* bytes consumed */ 521 ); 522 523 /* Returns a Python string using the UTF-32 encoding in native byte 524 order. The string always starts with a BOM mark. */ 525 526 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String( 527 PyObject *unicode /* Unicode object */ 528 ); 529 530 /* Returns a Python string object holding the UTF-32 encoded value of 531 the Unicode data. 532 533 If byteorder is not 0, output is written according to the following 534 byte order: 535 536 byteorder == -1: little endian 537 byteorder == 0: native byte order (writes a BOM mark) 538 byteorder == 1: big endian 539 540 If byteorder is 0, the output string will always start with the 541 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 542 prepended. 543 544 */ 545 546 /* --- UTF-16 Codecs ------------------------------------------------------ */ 547 548 /* Decodes length bytes from a UTF-16 encoded buffer string and returns 549 the corresponding Unicode object. 550 551 errors (if non-NULL) defines the error handling. It defaults 552 to "strict". 553 554 If byteorder is non-NULL, the decoder starts decoding using the 555 given byte order: 556 557 *byteorder == -1: little endian 558 *byteorder == 0: native order 559 *byteorder == 1: big endian 560 561 In native mode, the first two bytes of the stream are checked for a 562 BOM mark. If found, the BOM mark is analysed, the byte order 563 adjusted and the BOM skipped. In the other modes, no BOM mark 564 interpretation is done. After completion, *byteorder is set to the 565 current byte order at the end of input data. 566 567 If byteorder is NULL, the codec starts in native order mode. 568 569 */ 570 571 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16( 572 const char *string, /* UTF-16 encoded string */ 573 Py_ssize_t length, /* size of string */ 574 const char *errors, /* error handling */ 575 int *byteorder /* pointer to byteorder to use 576 0=native;-1=LE,1=BE; updated on 577 exit */ 578 ); 579 580 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful( 581 const char *string, /* UTF-16 encoded string */ 582 Py_ssize_t length, /* size of string */ 583 const char *errors, /* error handling */ 584 int *byteorder, /* pointer to byteorder to use 585 0=native;-1=LE,1=BE; updated on 586 exit */ 587 Py_ssize_t *consumed /* bytes consumed */ 588 ); 589 590 /* Returns a Python string using the UTF-16 encoding in native byte 591 order. The string always starts with a BOM mark. */ 592 593 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String( 594 PyObject *unicode /* Unicode object */ 595 ); 596 597 /* --- Unicode-Escape Codecs ---------------------------------------------- */ 598 599 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape( 600 const char *string, /* Unicode-Escape encoded string */ 601 Py_ssize_t length, /* size of string */ 602 const char *errors /* error handling */ 603 ); 604 605 PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString( 606 PyObject *unicode /* Unicode object */ 607 ); 608 609 /* --- Raw-Unicode-Escape Codecs ------------------------------------------ */ 610 611 PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape( 612 const char *string, /* Raw-Unicode-Escape encoded string */ 613 Py_ssize_t length, /* size of string */ 614 const char *errors /* error handling */ 615 ); 616 617 PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString( 618 PyObject *unicode /* Unicode object */ 619 ); 620 621 /* --- Latin-1 Codecs ----------------------------------------------------- 622 623 Note: Latin-1 corresponds to the first 256 Unicode ordinals. */ 624 625 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1( 626 const char *string, /* Latin-1 encoded string */ 627 Py_ssize_t length, /* size of string */ 628 const char *errors /* error handling */ 629 ); 630 631 PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String( 632 PyObject *unicode /* Unicode object */ 633 ); 634 635 /* --- ASCII Codecs ------------------------------------------------------- 636 637 Only 7-bit ASCII data is excepted. All other codes generate errors. 638 639 */ 640 641 PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII( 642 const char *string, /* ASCII encoded string */ 643 Py_ssize_t length, /* size of string */ 644 const char *errors /* error handling */ 645 ); 646 647 PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString( 648 PyObject *unicode /* Unicode object */ 649 ); 650 651 /* --- Character Map Codecs ----------------------------------------------- 652 653 This codec uses mappings to encode and decode characters. 654 655 Decoding mappings must map byte ordinals (integers in the range from 0 to 656 255) to Unicode strings, integers (which are then interpreted as Unicode 657 ordinals) or None. Unmapped data bytes (ones which cause a LookupError) 658 as well as mapped to None, 0xFFFE or '\ufffe' are treated as "undefined 659 mapping" and cause an error. 660 661 Encoding mappings must map Unicode ordinal integers to bytes objects, 662 integers in the range from 0 to 255 or None. Unmapped character 663 ordinals (ones which cause a LookupError) as well as mapped to 664 None are treated as "undefined mapping" and cause an error. 665 666 */ 667 668 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap( 669 const char *string, /* Encoded string */ 670 Py_ssize_t length, /* size of string */ 671 PyObject *mapping, /* decoding mapping */ 672 const char *errors /* error handling */ 673 ); 674 675 PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString( 676 PyObject *unicode, /* Unicode object */ 677 PyObject *mapping /* encoding mapping */ 678 ); 679 680 /* --- MBCS codecs for Windows -------------------------------------------- */ 681 682 #ifdef MS_WINDOWS 683 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS( 684 const char *string, /* MBCS encoded string */ 685 Py_ssize_t length, /* size of string */ 686 const char *errors /* error handling */ 687 ); 688 689 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful( 690 const char *string, /* MBCS encoded string */ 691 Py_ssize_t length, /* size of string */ 692 const char *errors, /* error handling */ 693 Py_ssize_t *consumed /* bytes consumed */ 694 ); 695 696 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 697 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful( 698 int code_page, /* code page number */ 699 const char *string, /* encoded string */ 700 Py_ssize_t length, /* size of string */ 701 const char *errors, /* error handling */ 702 Py_ssize_t *consumed /* bytes consumed */ 703 ); 704 #endif 705 706 PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( 707 PyObject *unicode /* Unicode object */ 708 ); 709 710 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 711 PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage( 712 int code_page, /* code page number */ 713 PyObject *unicode, /* Unicode object */ 714 const char *errors /* error handling */ 715 ); 716 #endif 717 718 #endif /* MS_WINDOWS */ 719 720 /* --- Locale encoding --------------------------------------------------- */ 721 722 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 723 /* Decode a string from the current locale encoding. The decoder is strict if 724 *surrogateescape* is equal to zero, otherwise it uses the 'surrogateescape' 725 error handler (PEP 383) to escape undecodable bytes. If a byte sequence can 726 be decoded as a surrogate character and *surrogateescape* is not equal to 727 zero, the byte sequence is escaped using the 'surrogateescape' error handler 728 instead of being decoded. *str* must end with a null character but cannot 729 contain embedded null characters. */ 730 731 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize( 732 const char *str, 733 Py_ssize_t len, 734 const char *errors); 735 736 /* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string 737 length using strlen(). */ 738 739 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale( 740 const char *str, 741 const char *errors); 742 743 /* Encode a Unicode object to the current locale encoding. The encoder is 744 strict is *surrogateescape* is equal to zero, otherwise the 745 "surrogateescape" error handler is used. Return a bytes object. The string 746 cannot contain embedded null characters. */ 747 748 PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale( 749 PyObject *unicode, 750 const char *errors 751 ); 752 #endif 753 754 /* --- File system encoding ---------------------------------------------- */ 755 756 /* ParseTuple converter: encode str objects to bytes using 757 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */ 758 759 PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*); 760 761 /* ParseTuple converter: decode bytes objects to unicode using 762 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */ 763 764 PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*); 765 766 /* Decode a null-terminated string using Py_FileSystemDefaultEncoding 767 and the "surrogateescape" error handler. 768 769 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 770 encoding. 771 772 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known. 773 */ 774 775 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault( 776 const char *s /* encoded string */ 777 ); 778 779 /* Decode a string using Py_FileSystemDefaultEncoding 780 and the "surrogateescape" error handler. 781 782 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 783 encoding. 784 */ 785 786 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize( 787 const char *s, /* encoded string */ 788 Py_ssize_t size /* size */ 789 ); 790 791 /* Encode a Unicode object to Py_FileSystemDefaultEncoding with the 792 "surrogateescape" error handler, and return bytes. 793 794 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 795 encoding. 796 */ 797 798 PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault( 799 PyObject *unicode 800 ); 801 802 /* --- Methods & Slots ---------------------------------------------------- 803 804 These are capable of handling Unicode objects and strings on input 805 (we refer to them as strings in the descriptions) and return 806 Unicode objects or integers as appropriate. */ 807 808 /* Concat two strings giving a new Unicode string. */ 809 810 PyAPI_FUNC(PyObject*) PyUnicode_Concat( 811 PyObject *left, /* Left string */ 812 PyObject *right /* Right string */ 813 ); 814 815 /* Concat two strings and put the result in *pleft 816 (sets *pleft to NULL on error) */ 817 818 PyAPI_FUNC(void) PyUnicode_Append( 819 PyObject **pleft, /* Pointer to left string */ 820 PyObject *right /* Right string */ 821 ); 822 823 /* Concat two strings, put the result in *pleft and drop the right object 824 (sets *pleft to NULL on error) */ 825 826 PyAPI_FUNC(void) PyUnicode_AppendAndDel( 827 PyObject **pleft, /* Pointer to left string */ 828 PyObject *right /* Right string */ 829 ); 830 831 /* Split a string giving a list of Unicode strings. 832 833 If sep is NULL, splitting will be done at all whitespace 834 substrings. Otherwise, splits occur at the given separator. 835 836 At most maxsplit splits will be done. If negative, no limit is set. 837 838 Separators are not included in the resulting list. 839 840 */ 841 842 PyAPI_FUNC(PyObject*) PyUnicode_Split( 843 PyObject *s, /* String to split */ 844 PyObject *sep, /* String separator */ 845 Py_ssize_t maxsplit /* Maxsplit count */ 846 ); 847 848 /* Dito, but split at line breaks. 849 850 CRLF is considered to be one line break. Line breaks are not 851 included in the resulting list. */ 852 853 PyAPI_FUNC(PyObject*) PyUnicode_Splitlines( 854 PyObject *s, /* String to split */ 855 int keepends /* If true, line end markers are included */ 856 ); 857 858 /* Partition a string using a given separator. */ 859 860 PyAPI_FUNC(PyObject*) PyUnicode_Partition( 861 PyObject *s, /* String to partition */ 862 PyObject *sep /* String separator */ 863 ); 864 865 /* Partition a string using a given separator, searching from the end of the 866 string. */ 867 868 PyAPI_FUNC(PyObject*) PyUnicode_RPartition( 869 PyObject *s, /* String to partition */ 870 PyObject *sep /* String separator */ 871 ); 872 873 /* Split a string giving a list of Unicode strings. 874 875 If sep is NULL, splitting will be done at all whitespace 876 substrings. Otherwise, splits occur at the given separator. 877 878 At most maxsplit splits will be done. But unlike PyUnicode_Split 879 PyUnicode_RSplit splits from the end of the string. If negative, 880 no limit is set. 881 882 Separators are not included in the resulting list. 883 884 */ 885 886 PyAPI_FUNC(PyObject*) PyUnicode_RSplit( 887 PyObject *s, /* String to split */ 888 PyObject *sep, /* String separator */ 889 Py_ssize_t maxsplit /* Maxsplit count */ 890 ); 891 892 /* Translate a string by applying a character mapping table to it and 893 return the resulting Unicode object. 894 895 The mapping table must map Unicode ordinal integers to Unicode strings, 896 Unicode ordinal integers or None (causing deletion of the character). 897 898 Mapping tables may be dictionaries or sequences. Unmapped character 899 ordinals (ones which cause a LookupError) are left untouched and 900 are copied as-is. 901 902 */ 903 904 PyAPI_FUNC(PyObject *) PyUnicode_Translate( 905 PyObject *str, /* String */ 906 PyObject *table, /* Translate table */ 907 const char *errors /* error handling */ 908 ); 909 910 /* Join a sequence of strings using the given separator and return 911 the resulting Unicode string. */ 912 913 PyAPI_FUNC(PyObject*) PyUnicode_Join( 914 PyObject *separator, /* Separator string */ 915 PyObject *seq /* Sequence object */ 916 ); 917 918 /* Return 1 if substr matches str[start:end] at the given tail end, 0 919 otherwise. */ 920 921 PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch( 922 PyObject *str, /* String */ 923 PyObject *substr, /* Prefix or Suffix string */ 924 Py_ssize_t start, /* Start index */ 925 Py_ssize_t end, /* Stop index */ 926 int direction /* Tail end: -1 prefix, +1 suffix */ 927 ); 928 929 /* Return the first position of substr in str[start:end] using the 930 given search direction or -1 if not found. -2 is returned in case 931 an error occurred and an exception is set. */ 932 933 PyAPI_FUNC(Py_ssize_t) PyUnicode_Find( 934 PyObject *str, /* String */ 935 PyObject *substr, /* Substring to find */ 936 Py_ssize_t start, /* Start index */ 937 Py_ssize_t end, /* Stop index */ 938 int direction /* Find direction: +1 forward, -1 backward */ 939 ); 940 941 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 942 /* Like PyUnicode_Find, but search for single character only. */ 943 PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar( 944 PyObject *str, 945 Py_UCS4 ch, 946 Py_ssize_t start, 947 Py_ssize_t end, 948 int direction 949 ); 950 #endif 951 952 /* Count the number of occurrences of substr in str[start:end]. */ 953 954 PyAPI_FUNC(Py_ssize_t) PyUnicode_Count( 955 PyObject *str, /* String */ 956 PyObject *substr, /* Substring to count */ 957 Py_ssize_t start, /* Start index */ 958 Py_ssize_t end /* Stop index */ 959 ); 960 961 /* Replace at most maxcount occurrences of substr in str with replstr 962 and return the resulting Unicode object. */ 963 964 PyAPI_FUNC(PyObject *) PyUnicode_Replace( 965 PyObject *str, /* String */ 966 PyObject *substr, /* Substring to find */ 967 PyObject *replstr, /* Substring to replace */ 968 Py_ssize_t maxcount /* Max. number of replacements to apply; 969 -1 = all */ 970 ); 971 972 /* Compare two strings and return -1, 0, 1 for less than, equal, 973 greater than resp. 974 Raise an exception and return -1 on error. */ 975 976 PyAPI_FUNC(int) PyUnicode_Compare( 977 PyObject *left, /* Left string */ 978 PyObject *right /* Right string */ 979 ); 980 981 /* Compare a Unicode object with C string and return -1, 0, 1 for less than, 982 equal, and greater than, respectively. It is best to pass only 983 ASCII-encoded strings, but the function interprets the input string as 984 ISO-8859-1 if it contains non-ASCII characters. 985 This function does not raise exceptions. */ 986 987 PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString( 988 PyObject *left, 989 const char *right /* ASCII-encoded string */ 990 ); 991 992 /* Rich compare two strings and return one of the following: 993 994 - NULL in case an exception was raised 995 - Py_True or Py_False for successful comparisons 996 - Py_NotImplemented in case the type combination is unknown 997 998 Possible values for op: 999 1000 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE 1001 1002 */ 1003 1004 PyAPI_FUNC(PyObject *) PyUnicode_RichCompare( 1005 PyObject *left, /* Left string */ 1006 PyObject *right, /* Right string */ 1007 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */ 1008 ); 1009 1010 /* Apply an argument tuple or dictionary to a format string and return 1011 the resulting Unicode string. */ 1012 1013 PyAPI_FUNC(PyObject *) PyUnicode_Format( 1014 PyObject *format, /* Format string */ 1015 PyObject *args /* Argument tuple or dictionary */ 1016 ); 1017 1018 /* Checks whether element is contained in container and return 1/0 1019 accordingly. 1020 1021 element has to coerce to a one element Unicode string. -1 is 1022 returned in case of an error. */ 1023 1024 PyAPI_FUNC(int) PyUnicode_Contains( 1025 PyObject *container, /* Container string */ 1026 PyObject *element /* Element string */ 1027 ); 1028 1029 /* Checks whether argument is a valid identifier. */ 1030 1031 PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s); 1032 1033 /* === Characters Type APIs =============================================== */ 1034 1035 #ifndef Py_LIMITED_API 1036 # define Py_CPYTHON_UNICODEOBJECT_H 1037 # include "cpython/unicodeobject.h" 1038 # undef Py_CPYTHON_UNICODEOBJECT_H 1039 #endif 1040 1041 #ifdef __cplusplus 1042 } 1043 #endif 1044 #endif /* !Py_UNICODEOBJECT_H */ 1045