1 #ifndef Py_UNICODEOBJECT_H 2 #define Py_UNICODEOBJECT_H 3 4 #include <stdarg.h> 5 6 /* 7 8 Unicode implementation based on original code by Fredrik Lundh, 9 modified by Marc-Andre Lemburg (mal@lemburg.com) according to the 10 Unicode Integration Proposal. (See 11 http://www.egenix.com/files/python/unicode-proposal.txt). 12 13 Copyright (c) Corporation for National Research Initiatives. 14 15 16 Original header: 17 -------------------------------------------------------------------- 18 19 * Yet another Unicode string type for Python. This type supports the 20 * 16-bit Basic Multilingual Plane (BMP) only. 21 * 22 * Written by Fredrik Lundh, January 1999. 23 * 24 * Copyright (c) 1999 by Secret Labs AB. 25 * Copyright (c) 1999 by Fredrik Lundh. 26 * 27 * fredrik@pythonware.com 28 * http://www.pythonware.com 29 * 30 * -------------------------------------------------------------------- 31 * This Unicode String Type is 32 * 33 * Copyright (c) 1999 by Secret Labs AB 34 * Copyright (c) 1999 by Fredrik Lundh 35 * 36 * By obtaining, using, and/or copying this software and/or its 37 * associated documentation, you agree that you have read, understood, 38 * and will comply with the following terms and conditions: 39 * 40 * Permission to use, copy, modify, and distribute this software and its 41 * associated documentation for any purpose and without fee is hereby 42 * granted, provided that the above copyright notice appears in all 43 * copies, and that both that copyright notice and this permission notice 44 * appear in supporting documentation, and that the name of Secret Labs 45 * AB or the author not be used in advertising or publicity pertaining to 46 * distribution of the software without specific, written prior 47 * permission. 48 * 49 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 51 * FITNESS. 
IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 56 * -------------------------------------------------------------------- */ 57 58 #include <ctype.h> 59 60 /* === Internal API ======================================================= */ 61 62 /* --- Internal Unicode Format -------------------------------------------- */ 63 64 /* Python 3.x requires unicode */ 65 #define Py_USING_UNICODE 66 67 #ifndef SIZEOF_WCHAR_T 68 #error Must define SIZEOF_WCHAR_T 69 #endif 70 71 #define Py_UNICODE_SIZE SIZEOF_WCHAR_T 72 73 /* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE. 74 Otherwise, Unicode strings are stored as UCS-2 (with limited support 75 for UTF-16) */ 76 77 #if Py_UNICODE_SIZE >= 4 78 #define Py_UNICODE_WIDE 79 #endif 80 81 /* Set these flags if the platform has "wchar.h" and the 82 wchar_t type is a 16-bit unsigned type */ 83 /* #define HAVE_WCHAR_H */ 84 /* #define HAVE_USABLE_WCHAR_T */ 85 86 /* If the compiler provides a wchar_t type we try to support it 87 through the interface functions PyUnicode_FromWideChar(), 88 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */ 89 90 #ifdef HAVE_USABLE_WCHAR_T 91 # ifndef HAVE_WCHAR_H 92 # define HAVE_WCHAR_H 93 # endif 94 #endif 95 96 #ifdef HAVE_WCHAR_H 97 # include <wchar.h> 98 #endif 99 100 /* Py_UCS4 and Py_UCS2 are typedefs for the respective 101 unicode representations. 
*/ 102 typedef uint32_t Py_UCS4; 103 typedef uint16_t Py_UCS2; 104 typedef uint8_t Py_UCS1; 105 106 #ifdef __cplusplus 107 extern "C" { 108 #endif 109 110 111 PyAPI_DATA(PyTypeObject) PyUnicode_Type; 112 PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; 113 114 #define PyUnicode_Check(op) \ 115 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS) 116 #define PyUnicode_CheckExact(op) Py_IS_TYPE(op, &PyUnicode_Type) 117 118 /* --- Constants ---------------------------------------------------------- */ 119 120 /* This Unicode character will be used as replacement character during 121 decoding if the errors argument is set to "replace". Note: the 122 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in 123 Unicode 3.0. */ 124 125 #define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD) 126 127 /* === Public API ========================================================= */ 128 129 /* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */ 130 PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize( 131 const char *u, /* UTF-8 encoded string */ 132 Py_ssize_t size /* size of buffer */ 133 ); 134 135 /* Similar to PyUnicode_FromUnicode(), but u points to null-terminated 136 UTF-8 encoded bytes. The size is determined with strlen(). */ 137 PyAPI_FUNC(PyObject*) PyUnicode_FromString( 138 const char *u /* UTF-8 encoded string */ 139 ); 140 141 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 142 PyAPI_FUNC(PyObject*) PyUnicode_Substring( 143 PyObject *str, 144 Py_ssize_t start, 145 Py_ssize_t end); 146 #endif 147 148 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 149 /* Copy the string into a UCS4 buffer including the null character if copy_null 150 is set. Return NULL and raise an exception on error. Raise a SystemError if 151 the buffer is smaller than the string. Return buffer on success. 152 153 buflen is the length of the buffer in (Py_UCS4) characters. 
*/ 154 PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4( 155 PyObject *unicode, 156 Py_UCS4* buffer, 157 Py_ssize_t buflen, 158 int copy_null); 159 160 /* Copy the string into a UCS4 buffer. A new buffer is allocated using 161 * PyMem_Malloc; if this fails, NULL is returned with a memory error 162 exception set. */ 163 PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode); 164 #endif 165 166 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 167 /* Get the length of the Unicode object. */ 168 169 PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength( 170 PyObject *unicode 171 ); 172 #endif 173 174 /* Get the number of Py_UNICODE units in the 175 string representation. */ 176 177 Py_DEPRECATED(3.3) PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize( 178 PyObject *unicode /* Unicode object */ 179 ); 180 181 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 182 /* Read a character from the string. */ 183 184 PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar( 185 PyObject *unicode, 186 Py_ssize_t index 187 ); 188 189 /* Write a character to the string. The string must have been created through 190 PyUnicode_New, must not be shared, and must not have been hashed yet. 191 192 Return 0 on success, -1 on error. */ 193 194 PyAPI_FUNC(int) PyUnicode_WriteChar( 195 PyObject *unicode, 196 Py_ssize_t index, 197 Py_UCS4 character 198 ); 199 #endif 200 201 /* Resize a Unicode object. The length is the number of characters, except 202 if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length 203 is the number of Py_UNICODE characters. 204 205 *unicode is modified to point to the new (resized) object and 0 206 returned on success. 207 208 Try to resize the string in place (which is usually faster than allocating 209 a new string and copy characters), or create a new string. 210 211 Error handling is implemented as follows: an exception is set, -1 212 is returned and *unicode left untouched. 
213 214 WARNING: The function doesn't check string content, the result may not be a 215 string in canonical representation. */ 216 217 PyAPI_FUNC(int) PyUnicode_Resize( 218 PyObject **unicode, /* Pointer to the Unicode object */ 219 Py_ssize_t length /* New length */ 220 ); 221 222 /* Decode obj to a Unicode object. 223 224 bytes, bytearray and other bytes-like objects are decoded according to the 225 given encoding and error handler. The encoding and error handler can be 226 NULL to have the interface use UTF-8 and "strict". 227 228 All other objects (including Unicode objects) raise an exception. 229 230 The API returns NULL in case of an error. The caller is responsible 231 for decref'ing the returned objects. 232 233 */ 234 235 PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject( 236 PyObject *obj, /* Object */ 237 const char *encoding, /* encoding */ 238 const char *errors /* error handling */ 239 ); 240 241 /* Copy an instance of a Unicode subtype to a new true Unicode object if 242 necessary. If obj is already a true Unicode object (not a subtype), return 243 the reference with *incremented* refcount. 244 245 The API returns NULL in case of an error. The caller is responsible 246 for decref'ing the returned objects. 247 248 */ 249 250 PyAPI_FUNC(PyObject*) PyUnicode_FromObject( 251 PyObject *obj /* Object */ 252 ); 253 254 PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV( 255 const char *format, /* ASCII-encoded string */ 256 va_list vargs 257 ); 258 PyAPI_FUNC(PyObject *) PyUnicode_FromFormat( 259 const char *format, /* ASCII-encoded string */ 260 ... 261 ); 262 263 PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **); 264 PyAPI_FUNC(PyObject *) PyUnicode_InternFromString( 265 const char *u /* UTF-8 encoded string */ 266 ); 267 268 // PyUnicode_InternImmortal() is deprecated since Python 3.10 269 // and will be removed in Python 3.12. Use PyUnicode_InternInPlace() instead. 
270 Py_DEPRECATED(3.10) PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **); 271 272 /* Use only if you know it's a string */ 273 #define PyUnicode_CHECK_INTERNED(op) \ 274 (((PyASCIIObject *)(op))->state.interned) 275 276 /* --- wchar_t support for platforms which support it --------------------- */ 277 278 #ifdef HAVE_WCHAR_H 279 280 /* Create a Unicode Object from the wchar_t buffer w of the given 281 size. 282 283 The buffer is copied into the new object. */ 284 285 PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar( 286 const wchar_t *w, /* wchar_t buffer */ 287 Py_ssize_t size /* size of buffer */ 288 ); 289 290 /* Copies the Unicode Object contents into the wchar_t buffer w. At 291 most size wchar_t characters are copied. 292 293 Note that the resulting wchar_t string may or may not be 294 0-terminated. It is the responsibility of the caller to make sure 295 that the wchar_t string is 0-terminated in case this is required by 296 the application. 297 298 Returns the number of wchar_t characters copied (excluding a 299 possibly trailing 0-termination character) or -1 in case of an 300 error. */ 301 302 PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar( 303 PyObject *unicode, /* Unicode object */ 304 wchar_t *w, /* wchar_t buffer */ 305 Py_ssize_t size /* size of buffer */ 306 ); 307 308 /* Convert the Unicode object to a wide character string. The output string 309 always ends with a nul character. If size is not NULL, write the number of 310 wide characters (excluding the null character) into *size. 311 312 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it) 313 on success. On error, returns NULL, *size is undefined and raises a 314 MemoryError. 
*/ 315 316 PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString( 317 PyObject *unicode, /* Unicode object */ 318 Py_ssize_t *size /* number of characters of the result */ 319 ); 320 321 #endif 322 323 /* --- Unicode ordinals --------------------------------------------------- */ 324 325 /* Create a Unicode Object from the given Unicode code point ordinal. 326 327 The ordinal must be in range(0x110000). A ValueError is 328 raised in case it is not. 329 330 */ 331 332 PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal); 333 334 /* === Builtin Codecs ===================================================== 335 336 Many of these APIs take two arguments encoding and errors. These 337 parameters encoding and errors have the same semantics as the ones 338 of the builtin str() API. 339 340 Setting encoding to NULL causes the default encoding (UTF-8) to be used. 341 342 Error handling is set by errors which may also be set to NULL 343 meaning to use the default handling defined for the codec. Default 344 error handling for all builtin codecs is "strict" (ValueErrors are 345 raised). 346 347 The codecs all use a similar interface. Only deviation from the 348 generic ones are documented. 349 350 */ 351 352 /* --- Manage the default encoding ---------------------------------------- */ 353 354 /* Returns "utf-8". */ 355 PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void); 356 357 /* --- Generic Codecs ----------------------------------------------------- */ 358 359 /* Create a Unicode object by decoding the encoded string s of the 360 given size. */ 361 362 PyAPI_FUNC(PyObject*) PyUnicode_Decode( 363 const char *s, /* encoded string */ 364 Py_ssize_t size, /* size of buffer */ 365 const char *encoding, /* encoding */ 366 const char *errors /* error handling */ 367 ); 368 369 /* Decode a Unicode object unicode and return the result as Python 370 object. 371 372 This API is DEPRECATED. The only supported standard encoding is rot13. 
373 Use PyCodec_Decode() to decode with rot13 and non-standard codecs 374 that decode from str. */ 375 376 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject( 377 PyObject *unicode, /* Unicode object */ 378 const char *encoding, /* encoding */ 379 const char *errors /* error handling */ 380 ); 381 382 /* Decode a Unicode object unicode and return the result as Unicode 383 object. 384 385 This API is DEPRECATED. The only supported standard encoding is rot13. 386 Use PyCodec_Decode() to decode with rot13 and non-standard codecs 387 that decode from str to str. */ 388 389 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode( 390 PyObject *unicode, /* Unicode object */ 391 const char *encoding, /* encoding */ 392 const char *errors /* error handling */ 393 ); 394 395 /* Encodes a Unicode object and returns the result as Python 396 object. 397 398 This API is DEPRECATED. It is superseded by PyUnicode_AsEncodedString() 399 since all standard encodings (except rot13) encode str to bytes. 400 Use PyCodec_Encode() for encoding with rot13 and non-standard codecs 401 that encode form str to non-bytes. */ 402 403 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject( 404 PyObject *unicode, /* Unicode object */ 405 const char *encoding, /* encoding */ 406 const char *errors /* error handling */ 407 ); 408 409 /* Encodes a Unicode object and returns the result as Python string 410 object. */ 411 412 PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString( 413 PyObject *unicode, /* Unicode object */ 414 const char *encoding, /* encoding */ 415 const char *errors /* error handling */ 416 ); 417 418 /* Encodes a Unicode object and returns the result as Unicode 419 object. 420 421 This API is DEPRECATED. The only supported standard encodings is rot13. 422 Use PyCodec_Encode() to encode with rot13 and non-standard codecs 423 that encode from str to str. 
*/ 424 425 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode( 426 PyObject *unicode, /* Unicode object */ 427 const char *encoding, /* encoding */ 428 const char *errors /* error handling */ 429 ); 430 431 /* Build an encoding map. */ 432 433 PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap( 434 PyObject* string /* 256 character map */ 435 ); 436 437 /* --- UTF-7 Codecs ------------------------------------------------------- */ 438 439 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7( 440 const char *string, /* UTF-7 encoded string */ 441 Py_ssize_t length, /* size of string */ 442 const char *errors /* error handling */ 443 ); 444 445 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful( 446 const char *string, /* UTF-7 encoded string */ 447 Py_ssize_t length, /* size of string */ 448 const char *errors, /* error handling */ 449 Py_ssize_t *consumed /* bytes consumed */ 450 ); 451 452 /* --- UTF-8 Codecs ------------------------------------------------------- */ 453 454 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8( 455 const char *string, /* UTF-8 encoded string */ 456 Py_ssize_t length, /* size of string */ 457 const char *errors /* error handling */ 458 ); 459 460 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful( 461 const char *string, /* UTF-8 encoded string */ 462 Py_ssize_t length, /* size of string */ 463 const char *errors, /* error handling */ 464 Py_ssize_t *consumed /* bytes consumed */ 465 ); 466 467 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String( 468 PyObject *unicode /* Unicode object */ 469 ); 470 471 /* Returns a pointer to the default encoding (UTF-8) of the 472 Unicode object unicode and the size of the encoded representation 473 in bytes stored in *size. 474 475 In case of an error, no *size is set. 476 477 This function caches the UTF-8 encoded string in the unicodeobject 478 and subsequent calls will return the same string. The memory is released 479 when the unicodeobject is deallocated. 
480 */ 481 482 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030A0000 483 PyAPI_FUNC(const char *) PyUnicode_AsUTF8AndSize( 484 PyObject *unicode, 485 Py_ssize_t *size); 486 #endif 487 488 /* --- UTF-32 Codecs ------------------------------------------------------ */ 489 490 /* Decodes length bytes from a UTF-32 encoded buffer string and returns 491 the corresponding Unicode object. 492 493 errors (if non-NULL) defines the error handling. It defaults 494 to "strict". 495 496 If byteorder is non-NULL, the decoder starts decoding using the 497 given byte order: 498 499 *byteorder == -1: little endian 500 *byteorder == 0: native order 501 *byteorder == 1: big endian 502 503 In native mode, the first four bytes of the stream are checked for a 504 BOM mark. If found, the BOM mark is analysed, the byte order 505 adjusted and the BOM skipped. In the other modes, no BOM mark 506 interpretation is done. After completion, *byteorder is set to the 507 current byte order at the end of input data. 508 509 If byteorder is NULL, the codec starts in native order mode. 510 511 */ 512 513 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32( 514 const char *string, /* UTF-32 encoded string */ 515 Py_ssize_t length, /* size of string */ 516 const char *errors, /* error handling */ 517 int *byteorder /* pointer to byteorder to use 518 0=native;-1=LE,1=BE; updated on 519 exit */ 520 ); 521 522 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful( 523 const char *string, /* UTF-32 encoded string */ 524 Py_ssize_t length, /* size of string */ 525 const char *errors, /* error handling */ 526 int *byteorder, /* pointer to byteorder to use 527 0=native;-1=LE,1=BE; updated on 528 exit */ 529 Py_ssize_t *consumed /* bytes consumed */ 530 ); 531 532 /* Returns a Python string using the UTF-32 encoding in native byte 533 order. The string always starts with a BOM mark. 
*/

PyAPI_FUNC(PyObject *) PyUnicode_AsUTF32String(
    PyObject *unicode);     /* Unicode object */

/* Returns a Python string object holding the UTF-32 encoded value of
   the Unicode data.

   If byteorder is not 0, output is written according to the following
   byte order:

      byteorder == -1: little endian
      byteorder == 0:  native byte order (writes a BOM mark)
      byteorder == 1:  big endian

   If byteorder is 0, the output string will always start with the
   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
   prepended.

   NOTE(review): no prototype follows this description in this header;
   it appears to describe an encoder that is declared elsewhere or has
   been removed -- confirm.

*/

/* --- UTF-16 Codecs ------------------------------------------------------ */

/* Decodes length bytes from a UTF-16 encoded buffer string and returns
   the corresponding Unicode object.

   errors (if non-NULL) defines the error handling. It defaults
   to "strict".

   If byteorder is non-NULL, the decoder starts decoding using the
   given byte order:

      *byteorder == -1: little endian
      *byteorder == 0:  native order
      *byteorder == 1:  big endian

   In native mode, the first two bytes of the stream are checked for a
   BOM mark. If found, the BOM mark is analysed, the byte order
   adjusted and the BOM skipped. In the other modes, no BOM mark
   interpretation is done. After completion, *byteorder is set to the
   current byte order at the end of input data.

   If byteorder is NULL, the codec starts in native order mode.
577 578 */ 579 580 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16( 581 const char *string, /* UTF-16 encoded string */ 582 Py_ssize_t length, /* size of string */ 583 const char *errors, /* error handling */ 584 int *byteorder /* pointer to byteorder to use 585 0=native;-1=LE,1=BE; updated on 586 exit */ 587 ); 588 589 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful( 590 const char *string, /* UTF-16 encoded string */ 591 Py_ssize_t length, /* size of string */ 592 const char *errors, /* error handling */ 593 int *byteorder, /* pointer to byteorder to use 594 0=native;-1=LE,1=BE; updated on 595 exit */ 596 Py_ssize_t *consumed /* bytes consumed */ 597 ); 598 599 /* Returns a Python string using the UTF-16 encoding in native byte 600 order. The string always starts with a BOM mark. */ 601 602 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String( 603 PyObject *unicode /* Unicode object */ 604 ); 605 606 /* --- Unicode-Escape Codecs ---------------------------------------------- */ 607 608 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape( 609 const char *string, /* Unicode-Escape encoded string */ 610 Py_ssize_t length, /* size of string */ 611 const char *errors /* error handling */ 612 ); 613 614 PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString( 615 PyObject *unicode /* Unicode object */ 616 ); 617 618 /* --- Raw-Unicode-Escape Codecs ------------------------------------------ */ 619 620 PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape( 621 const char *string, /* Raw-Unicode-Escape encoded string */ 622 Py_ssize_t length, /* size of string */ 623 const char *errors /* error handling */ 624 ); 625 626 PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString( 627 PyObject *unicode /* Unicode object */ 628 ); 629 630 /* --- Latin-1 Codecs ----------------------------------------------------- 631 632 Note: Latin-1 corresponds to the first 256 Unicode ordinals. 
*/ 633 634 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1( 635 const char *string, /* Latin-1 encoded string */ 636 Py_ssize_t length, /* size of string */ 637 const char *errors /* error handling */ 638 ); 639 640 PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String( 641 PyObject *unicode /* Unicode object */ 642 ); 643 644 /* --- ASCII Codecs ------------------------------------------------------- 645 646 Only 7-bit ASCII data is excepted. All other codes generate errors. 647 648 */ 649 650 PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII( 651 const char *string, /* ASCII encoded string */ 652 Py_ssize_t length, /* size of string */ 653 const char *errors /* error handling */ 654 ); 655 656 PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString( 657 PyObject *unicode /* Unicode object */ 658 ); 659 660 /* --- Character Map Codecs ----------------------------------------------- 661 662 This codec uses mappings to encode and decode characters. 663 664 Decoding mappings must map byte ordinals (integers in the range from 0 to 665 255) to Unicode strings, integers (which are then interpreted as Unicode 666 ordinals) or None. Unmapped data bytes (ones which cause a LookupError) 667 as well as mapped to None, 0xFFFE or '\ufffe' are treated as "undefined 668 mapping" and cause an error. 669 670 Encoding mappings must map Unicode ordinal integers to bytes objects, 671 integers in the range from 0 to 255 or None. Unmapped character 672 ordinals (ones which cause a LookupError) as well as mapped to 673 None are treated as "undefined mapping" and cause an error. 
674 675 */ 676 677 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap( 678 const char *string, /* Encoded string */ 679 Py_ssize_t length, /* size of string */ 680 PyObject *mapping, /* decoding mapping */ 681 const char *errors /* error handling */ 682 ); 683 684 PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString( 685 PyObject *unicode, /* Unicode object */ 686 PyObject *mapping /* encoding mapping */ 687 ); 688 689 /* --- MBCS codecs for Windows -------------------------------------------- */ 690 691 #ifdef MS_WINDOWS 692 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS( 693 const char *string, /* MBCS encoded string */ 694 Py_ssize_t length, /* size of string */ 695 const char *errors /* error handling */ 696 ); 697 698 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful( 699 const char *string, /* MBCS encoded string */ 700 Py_ssize_t length, /* size of string */ 701 const char *errors, /* error handling */ 702 Py_ssize_t *consumed /* bytes consumed */ 703 ); 704 705 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 706 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful( 707 int code_page, /* code page number */ 708 const char *string, /* encoded string */ 709 Py_ssize_t length, /* size of string */ 710 const char *errors, /* error handling */ 711 Py_ssize_t *consumed /* bytes consumed */ 712 ); 713 #endif 714 715 PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( 716 PyObject *unicode /* Unicode object */ 717 ); 718 719 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 720 PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage( 721 int code_page, /* code page number */ 722 PyObject *unicode, /* Unicode object */ 723 const char *errors /* error handling */ 724 ); 725 #endif 726 727 #endif /* MS_WINDOWS */ 728 729 /* --- Locale encoding --------------------------------------------------- */ 730 731 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 732 /* Decode a string from the current locale encoding. 
The decoder is strict if 733 *surrogateescape* is equal to zero, otherwise it uses the 'surrogateescape' 734 error handler (PEP 383) to escape undecodable bytes. If a byte sequence can 735 be decoded as a surrogate character and *surrogateescape* is not equal to 736 zero, the byte sequence is escaped using the 'surrogateescape' error handler 737 instead of being decoded. *str* must end with a null character but cannot 738 contain embedded null characters. */ 739 740 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize( 741 const char *str, 742 Py_ssize_t len, 743 const char *errors); 744 745 /* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string 746 length using strlen(). */ 747 748 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale( 749 const char *str, 750 const char *errors); 751 752 /* Encode a Unicode object to the current locale encoding. The encoder is 753 strict is *surrogateescape* is equal to zero, otherwise the 754 "surrogateescape" error handler is used. Return a bytes object. The string 755 cannot contain embedded null characters. */ 756 757 PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale( 758 PyObject *unicode, 759 const char *errors 760 ); 761 #endif 762 763 /* --- File system encoding ---------------------------------------------- */ 764 765 /* ParseTuple converter: encode str objects to bytes using 766 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */ 767 768 PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*); 769 770 /* ParseTuple converter: decode bytes objects to unicode using 771 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */ 772 773 PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*); 774 775 /* Decode a null-terminated string using Py_FileSystemDefaultEncoding 776 and the "surrogateescape" error handler. 777 778 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 779 encoding. 780 781 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known. 
782 */ 783 784 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault( 785 const char *s /* encoded string */ 786 ); 787 788 /* Decode a string using Py_FileSystemDefaultEncoding 789 and the "surrogateescape" error handler. 790 791 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 792 encoding. 793 */ 794 795 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize( 796 const char *s, /* encoded string */ 797 Py_ssize_t size /* size */ 798 ); 799 800 /* Encode a Unicode object to Py_FileSystemDefaultEncoding with the 801 "surrogateescape" error handler, and return bytes. 802 803 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 804 encoding. 805 */ 806 807 PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault( 808 PyObject *unicode 809 ); 810 811 /* --- Methods & Slots ---------------------------------------------------- 812 813 These are capable of handling Unicode objects and strings on input 814 (we refer to them as strings in the descriptions) and return 815 Unicode objects or integers as appropriate. */ 816 817 /* Concat two strings giving a new Unicode string. */ 818 819 PyAPI_FUNC(PyObject*) PyUnicode_Concat( 820 PyObject *left, /* Left string */ 821 PyObject *right /* Right string */ 822 ); 823 824 /* Concat two strings and put the result in *pleft 825 (sets *pleft to NULL on error) */ 826 827 PyAPI_FUNC(void) PyUnicode_Append( 828 PyObject **pleft, /* Pointer to left string */ 829 PyObject *right /* Right string */ 830 ); 831 832 /* Concat two strings, put the result in *pleft and drop the right object 833 (sets *pleft to NULL on error) */ 834 835 PyAPI_FUNC(void) PyUnicode_AppendAndDel( 836 PyObject **pleft, /* Pointer to left string */ 837 PyObject *right /* Right string */ 838 ); 839 840 /* Split a string giving a list of Unicode strings. 841 842 If sep is NULL, splitting will be done at all whitespace 843 substrings. Otherwise, splits occur at the given separator. 844 845 At most maxsplit splits will be done. 
If negative, no limit is set. 846 847 Separators are not included in the resulting list. 848 849 */ 850 851 PyAPI_FUNC(PyObject*) PyUnicode_Split( 852 PyObject *s, /* String to split */ 853 PyObject *sep, /* String separator */ 854 Py_ssize_t maxsplit /* Maxsplit count */ 855 ); 856 857 /* Dito, but split at line breaks. 858 859 CRLF is considered to be one line break. Line breaks are not 860 included in the resulting list. */ 861 862 PyAPI_FUNC(PyObject*) PyUnicode_Splitlines( 863 PyObject *s, /* String to split */ 864 int keepends /* If true, line end markers are included */ 865 ); 866 867 /* Partition a string using a given separator. */ 868 869 PyAPI_FUNC(PyObject*) PyUnicode_Partition( 870 PyObject *s, /* String to partition */ 871 PyObject *sep /* String separator */ 872 ); 873 874 /* Partition a string using a given separator, searching from the end of the 875 string. */ 876 877 PyAPI_FUNC(PyObject*) PyUnicode_RPartition( 878 PyObject *s, /* String to partition */ 879 PyObject *sep /* String separator */ 880 ); 881 882 /* Split a string giving a list of Unicode strings. 883 884 If sep is NULL, splitting will be done at all whitespace 885 substrings. Otherwise, splits occur at the given separator. 886 887 At most maxsplit splits will be done. But unlike PyUnicode_Split 888 PyUnicode_RSplit splits from the end of the string. If negative, 889 no limit is set. 890 891 Separators are not included in the resulting list. 892 893 */ 894 895 PyAPI_FUNC(PyObject*) PyUnicode_RSplit( 896 PyObject *s, /* String to split */ 897 PyObject *sep, /* String separator */ 898 Py_ssize_t maxsplit /* Maxsplit count */ 899 ); 900 901 /* Translate a string by applying a character mapping table to it and 902 return the resulting Unicode object. 903 904 The mapping table must map Unicode ordinal integers to Unicode strings, 905 Unicode ordinal integers or None (causing deletion of the character). 906 907 Mapping tables may be dictionaries or sequences. 
Unmapped character 908 ordinals (ones which cause a LookupError) are left untouched and 909 are copied as-is. 910 911 */ 912 913 PyAPI_FUNC(PyObject *) PyUnicode_Translate( 914 PyObject *str, /* String */ 915 PyObject *table, /* Translate table */ 916 const char *errors /* error handling */ 917 ); 918 919 /* Join a sequence of strings using the given separator and return 920 the resulting Unicode string. */ 921 922 PyAPI_FUNC(PyObject*) PyUnicode_Join( 923 PyObject *separator, /* Separator string */ 924 PyObject *seq /* Sequence object */ 925 ); 926 927 /* Return 1 if substr matches str[start:end] at the given tail end, 0 928 otherwise. */ 929 930 PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch( 931 PyObject *str, /* String */ 932 PyObject *substr, /* Prefix or Suffix string */ 933 Py_ssize_t start, /* Start index */ 934 Py_ssize_t end, /* Stop index */ 935 int direction /* Tail end: -1 prefix, +1 suffix */ 936 ); 937 938 /* Return the first position of substr in str[start:end] using the 939 given search direction or -1 if not found. -2 is returned in case 940 an error occurred and an exception is set. */ 941 942 PyAPI_FUNC(Py_ssize_t) PyUnicode_Find( 943 PyObject *str, /* String */ 944 PyObject *substr, /* Substring to find */ 945 Py_ssize_t start, /* Start index */ 946 Py_ssize_t end, /* Stop index */ 947 int direction /* Find direction: +1 forward, -1 backward */ 948 ); 949 950 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 951 /* Like PyUnicode_Find, but search for single character only. */ 952 PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar( 953 PyObject *str, 954 Py_UCS4 ch, 955 Py_ssize_t start, 956 Py_ssize_t end, 957 int direction 958 ); 959 #endif 960 961 /* Count the number of occurrences of substr in str[start:end]. 
*/ 962 963 PyAPI_FUNC(Py_ssize_t) PyUnicode_Count( 964 PyObject *str, /* String */ 965 PyObject *substr, /* Substring to count */ 966 Py_ssize_t start, /* Start index */ 967 Py_ssize_t end /* Stop index */ 968 ); 969 970 /* Replace at most maxcount occurrences of substr in str with replstr 971 and return the resulting Unicode object. A maxcount of -1 replaces all occurrences. */ 972 973 PyAPI_FUNC(PyObject *) PyUnicode_Replace( 974 PyObject *str, /* String */ 975 PyObject *substr, /* Substring to find */ 976 PyObject *replstr, /* Replacement string */ 977 Py_ssize_t maxcount /* Max. number of replacements to apply; 978 -1 = all */ 979 ); 980 981 /* Compare two strings and return -1, 0, 1 for less than, equal, 982 greater than, respectively. 983 Raise an exception and return -1 on error. Since -1 is also the "less than" result, an error can only be told apart by checking whether an exception is set. */ 984 985 PyAPI_FUNC(int) PyUnicode_Compare( 986 PyObject *left, /* Left string */ 987 PyObject *right /* Right string */ 988 ); 989 990 /* Compare a Unicode object with a C string and return -1, 0, 1 for less than, 991 equal, and greater than, respectively. It is best to pass only 992 ASCII-encoded strings, but the function interprets the input string as 993 ISO-8859-1 if it contains non-ASCII characters. 994 This function does not raise exceptions. */ 995 996 PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString( 997 PyObject *left, /* Unicode object */ 998 const char *right /* ASCII-encoded string */ 999 ); 1000 1001 /* Rich compare two strings and return one of the following: 1002 1003 - NULL in case an exception was raised 1004 - Py_True or Py_False for successful comparisons 1005 - Py_NotImplemented in case the type combination is unknown 1006 1007 Possible values for op: 1008 1009 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE 1010 1011 */ 1012 1013 PyAPI_FUNC(PyObject *) PyUnicode_RichCompare( 1014 PyObject *left, /* Left string */ 1015 PyObject *right, /* Right string */ 1016 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */ 1017 ); 1018 1019 /* Apply an argument tuple or dictionary to a format string and return 1020 the resulting Unicode string. 
*/ 1021 1022 PyAPI_FUNC(PyObject *) PyUnicode_Format( 1023 PyObject *format, /* Format string */ 1024 PyObject *args /* Argument tuple or dictionary */ 1025 ); 1026 1027 /* Check whether element is contained in container and return 1 or 0 1028 accordingly. 1029 1030 element has to coerce to a one element Unicode string. -1 is 1031 returned in case of an error. */ 1032 1033 PyAPI_FUNC(int) PyUnicode_Contains( 1034 PyObject *container, /* Container string */ 1035 PyObject *element /* Element string */ 1036 ); 1037 1038 /* Check whether the argument s is a valid identifier. */ 1039 1040 PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s); 1041 1042 /* === Characters Type APIs =============================================== */ 1043 /* Unless the limited API is requested, also pull in cpython/unicodeobject.h. Py_CPYTHON_UNICODEOBJECT_H is defined only for the duration of that include (presumably so the included header can detect being included directly -- confirm). */ 1044 #ifndef Py_LIMITED_API 1045 # define Py_CPYTHON_UNICODEOBJECT_H 1046 # include "cpython/unicodeobject.h" 1047 # undef Py_CPYTHON_UNICODEOBJECT_H 1048 #endif 1049 1050 #ifdef __cplusplus 1051 } 1052 #endif 1053 #endif /* !Py_UNICODEOBJECT_H */ 1054