1 #ifndef Py_UNICODEOBJECT_H 2 #define Py_UNICODEOBJECT_H 3 4 #include <stdarg.h> 5 6 /* 7 8 Unicode implementation based on original code by Fredrik Lundh, 9 modified by Marc-Andre Lemburg (mal@lemburg.com) according to the 10 Unicode Integration Proposal. (See 11 http://www.egenix.com/files/python/unicode-proposal.txt). 12 13 Copyright (c) Corporation for National Research Initiatives. 14 15 16 Original header: 17 -------------------------------------------------------------------- 18 19 * Yet another Unicode string type for Python. This type supports the 20 * 16-bit Basic Multilingual Plane (BMP) only. 21 * 22 * Written by Fredrik Lundh, January 1999. 23 * 24 * Copyright (c) 1999 by Secret Labs AB. 25 * Copyright (c) 1999 by Fredrik Lundh. 26 * 27 * fredrik@pythonware.com 28 * http://www.pythonware.com 29 * 30 * -------------------------------------------------------------------- 31 * This Unicode String Type is 32 * 33 * Copyright (c) 1999 by Secret Labs AB 34 * Copyright (c) 1999 by Fredrik Lundh 35 * 36 * By obtaining, using, and/or copying this software and/or its 37 * associated documentation, you agree that you have read, understood, 38 * and will comply with the following terms and conditions: 39 * 40 * Permission to use, copy, modify, and distribute this software and its 41 * associated documentation for any purpose and without fee is hereby 42 * granted, provided that the above copyright notice appears in all 43 * copies, and that both that copyright notice and this permission notice 44 * appear in supporting documentation, and that the name of Secret Labs 45 * AB or the author not be used in advertising or publicity pertaining to 46 * distribution of the software without specific, written prior 47 * permission. 48 * 49 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 56 * -------------------------------------------------------------------- */ 57 58 #include <ctype.h> 59 60 /* === Internal API ======================================================= */ 61 62 /* --- Internal Unicode Format -------------------------------------------- */ 63 64 /* Python 3.x requires unicode */ 65 #define Py_USING_UNICODE 66 67 #ifndef SIZEOF_WCHAR_T 68 #error Must define SIZEOF_WCHAR_T 69 #endif 70 71 #define Py_UNICODE_SIZE SIZEOF_WCHAR_T 72 73 /* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE. 74 Otherwise, Unicode strings are stored as UCS-2 (with limited support 75 for UTF-16) */ 76 77 #if Py_UNICODE_SIZE >= 4 78 #define Py_UNICODE_WIDE 79 #endif 80 81 /* Set these flags if the platform has "wchar.h" and the 82 wchar_t type is a 16-bit unsigned type */ 83 /* #define HAVE_WCHAR_H */ 84 /* #define HAVE_USABLE_WCHAR_T */ 85 86 /* If the compiler provides a wchar_t type we try to support it 87 through the interface functions PyUnicode_FromWideChar(), 88 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */ 89 90 #ifdef HAVE_USABLE_WCHAR_T 91 # ifndef HAVE_WCHAR_H 92 # define HAVE_WCHAR_H 93 # endif 94 #endif 95 96 #ifdef HAVE_WCHAR_H 97 # include <wchar.h> 98 #endif 99 100 /* Py_UCS4 and Py_UCS2 are typedefs for the respective 101 unicode representations. */ 102 typedef uint32_t Py_UCS4; 103 typedef uint16_t Py_UCS2; 104 typedef uint8_t Py_UCS1; 105 106 #ifdef __cplusplus 107 extern "C" { 108 #endif 109 110 111 PyAPI_DATA(PyTypeObject) PyUnicode_Type; 112 PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; 113 114 #define PyUnicode_Check(op) \ 115 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS) 116 #define PyUnicode_CheckExact(op) Py_IS_TYPE(op, &PyUnicode_Type) 117 118 /* --- Constants ---------------------------------------------------------- */ 119 120 /* This Unicode character will be used as replacement character during 121 decoding if the errors argument is set to "replace". Note: the 122 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in 123 Unicode 3.0. */ 124 125 #define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD) 126 127 /* === Public API ========================================================= */ 128 129 /* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */ 130 PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize( 131 const char *u, /* UTF-8 encoded string */ 132 Py_ssize_t size /* size of buffer */ 133 ); 134 135 /* Similar to PyUnicode_FromUnicode(), but u points to null-terminated 136 UTF-8 encoded bytes. The size is determined with strlen(). */ 137 PyAPI_FUNC(PyObject*) PyUnicode_FromString( 138 const char *u /* UTF-8 encoded string */ 139 ); 140 141 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 142 PyAPI_FUNC(PyObject*) PyUnicode_Substring( 143 PyObject *str, 144 Py_ssize_t start, 145 Py_ssize_t end); 146 #endif 147 148 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 149 /* Copy the string into a UCS4 buffer including the null character if copy_null 150 is set. Return NULL and raise an exception on error. Raise a SystemError if 151 the buffer is smaller than the string. Return buffer on success. 152 153 buflen is the length of the buffer in (Py_UCS4) characters. */ 154 PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4( 155 PyObject *unicode, 156 Py_UCS4* buffer, 157 Py_ssize_t buflen, 158 int copy_null); 159 160 /* Copy the string into a UCS4 buffer. A new buffer is allocated using 161 * PyMem_Malloc; if this fails, NULL is returned with a memory error 162 exception set. */ 163 PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode); 164 #endif 165 166 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 167 /* Get the length of the Unicode object. */ 168 169 PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength( 170 PyObject *unicode 171 ); 172 #endif 173 174 /* Get the number of Py_UNICODE units in the 175 string representation. */ 176 177 Py_DEPRECATED(3.3) PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize( 178 PyObject *unicode /* Unicode object */ 179 ); 180 181 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 182 /* Read a character from the string. */ 183 184 PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar( 185 PyObject *unicode, 186 Py_ssize_t index 187 ); 188 189 /* Write a character to the string. The string must have been created through 190 PyUnicode_New, must not be shared, and must not have been hashed yet. 191 192 Return 0 on success, -1 on error. */ 193 194 PyAPI_FUNC(int) PyUnicode_WriteChar( 195 PyObject *unicode, 196 Py_ssize_t index, 197 Py_UCS4 character 198 ); 199 #endif 200 201 /* Resize a Unicode object. The length is the number of characters, except 202 if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length 203 is the number of Py_UNICODE characters. 204 205 *unicode is modified to point to the new (resized) object and 0 206 returned on success. 207 208 Try to resize the string in place (which is usually faster than allocating 209 a new string and copy characters), or create a new string. 210 211 Error handling is implemented as follows: an exception is set, -1 212 is returned and *unicode left untouched. 213 214 WARNING: The function doesn't check string content, the result may not be a 215 string in canonical representation. */ 216 217 PyAPI_FUNC(int) PyUnicode_Resize( 218 PyObject **unicode, /* Pointer to the Unicode object */ 219 Py_ssize_t length /* New length */ 220 ); 221 222 /* Decode obj to a Unicode object. 223 224 bytes, bytearray and other bytes-like objects are decoded according to the 225 given encoding and error handler. The encoding and error handler can be 226 NULL to have the interface use UTF-8 and "strict". 227 228 All other objects (including Unicode objects) raise an exception. 229 230 The API returns NULL in case of an error. The caller is responsible 231 for decref'ing the returned objects. 232 233 */ 234 235 PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject( 236 PyObject *obj, /* Object */ 237 const char *encoding, /* encoding */ 238 const char *errors /* error handling */ 239 ); 240 241 /* Copy an instance of a Unicode subtype to a new true Unicode object if 242 necessary. If obj is already a true Unicode object (not a subtype), return 243 the reference with *incremented* refcount. 244 245 The API returns NULL in case of an error. The caller is responsible 246 for decref'ing the returned objects. 247 248 */ 249 250 PyAPI_FUNC(PyObject*) PyUnicode_FromObject( 251 PyObject *obj /* Object */ 252 ); 253 254 PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV( 255 const char *format, /* ASCII-encoded string */ 256 va_list vargs 257 ); 258 PyAPI_FUNC(PyObject *) PyUnicode_FromFormat( 259 const char *format, /* ASCII-encoded string */ 260 ... 261 ); 262 263 PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **); 264 PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **); 265 PyAPI_FUNC(PyObject *) PyUnicode_InternFromString( 266 const char *u /* UTF-8 encoded string */ 267 ); 268 269 /* Use only if you know it's a string */ 270 #define PyUnicode_CHECK_INTERNED(op) \ 271 (((PyASCIIObject *)(op))->state.interned) 272 273 /* --- wchar_t support for platforms which support it --------------------- */ 274 275 #ifdef HAVE_WCHAR_H 276 277 /* Create a Unicode Object from the wchar_t buffer w of the given 278 size. 279 280 The buffer is copied into the new object. */ 281 282 PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar( 283 const wchar_t *w, /* wchar_t buffer */ 284 Py_ssize_t size /* size of buffer */ 285 ); 286 287 /* Copies the Unicode Object contents into the wchar_t buffer w. At 288 most size wchar_t characters are copied. 289 290 Note that the resulting wchar_t string may or may not be 291 0-terminated. It is the responsibility of the caller to make sure 292 that the wchar_t string is 0-terminated in case this is required by 293 the application. 294 295 Returns the number of wchar_t characters copied (excluding a 296 possibly trailing 0-termination character) or -1 in case of an 297 error. */ 298 299 PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar( 300 PyObject *unicode, /* Unicode object */ 301 wchar_t *w, /* wchar_t buffer */ 302 Py_ssize_t size /* size of buffer */ 303 ); 304 305 /* Convert the Unicode object to a wide character string. The output string 306 always ends with a nul character. If size is not NULL, write the number of 307 wide characters (excluding the null character) into *size. 308 309 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it) 310 on success. On error, returns NULL, *size is undefined and raises a 311 MemoryError. */ 312 313 PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString( 314 PyObject *unicode, /* Unicode object */ 315 Py_ssize_t *size /* number of characters of the result */ 316 ); 317 318 #endif 319 320 /* --- Unicode ordinals --------------------------------------------------- */ 321 322 /* Create a Unicode Object from the given Unicode code point ordinal. 323 324 The ordinal must be in range(0x110000). A ValueError is 325 raised in case it is not. 326 327 */ 328 329 PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal); 330 331 /* === Builtin Codecs ===================================================== 332 333 Many of these APIs take two arguments encoding and errors. These 334 parameters encoding and errors have the same semantics as the ones 335 of the builtin str() API. 336 337 Setting encoding to NULL causes the default encoding (UTF-8) to be used. 338 339 Error handling is set by errors which may also be set to NULL 340 meaning to use the default handling defined for the codec. Default 341 error handling for all builtin codecs is "strict" (ValueErrors are 342 raised). 343 344 The codecs all use a similar interface. Only deviation from the 345 generic ones are documented. 346 347 */ 348 349 /* --- Manage the default encoding ---------------------------------------- */ 350 351 /* Returns "utf-8". */ 352 PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void); 353 354 /* --- Generic Codecs ----------------------------------------------------- */ 355 356 /* Create a Unicode object by decoding the encoded string s of the 357 given size. */ 358 359 PyAPI_FUNC(PyObject*) PyUnicode_Decode( 360 const char *s, /* encoded string */ 361 Py_ssize_t size, /* size of buffer */ 362 const char *encoding, /* encoding */ 363 const char *errors /* error handling */ 364 ); 365 366 /* Decode a Unicode object unicode and return the result as Python 367 object. 368 369 This API is DEPRECATED. The only supported standard encoding is rot13. 370 Use PyCodec_Decode() to decode with rot13 and non-standard codecs 371 that decode from str. */ 372 373 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject( 374 PyObject *unicode, /* Unicode object */ 375 const char *encoding, /* encoding */ 376 const char *errors /* error handling */ 377 ); 378 379 /* Decode a Unicode object unicode and return the result as Unicode 380 object. 381 382 This API is DEPRECATED. The only supported standard encoding is rot13. 383 Use PyCodec_Decode() to decode with rot13 and non-standard codecs 384 that decode from str to str. */ 385 386 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode( 387 PyObject *unicode, /* Unicode object */ 388 const char *encoding, /* encoding */ 389 const char *errors /* error handling */ 390 ); 391 392 /* Encodes a Unicode object and returns the result as Python 393 object. 394 395 This API is DEPRECATED. It is superseded by PyUnicode_AsEncodedString() 396 since all standard encodings (except rot13) encode str to bytes. 397 Use PyCodec_Encode() for encoding with rot13 and non-standard codecs 398 that encode form str to non-bytes. */ 399 400 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject( 401 PyObject *unicode, /* Unicode object */ 402 const char *encoding, /* encoding */ 403 const char *errors /* error handling */ 404 ); 405 406 /* Encodes a Unicode object and returns the result as Python string 407 object. */ 408 409 PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString( 410 PyObject *unicode, /* Unicode object */ 411 const char *encoding, /* encoding */ 412 const char *errors /* error handling */ 413 ); 414 415 /* Encodes a Unicode object and returns the result as Unicode 416 object. 417 418 This API is DEPRECATED. The only supported standard encodings is rot13. 419 Use PyCodec_Encode() to encode with rot13 and non-standard codecs 420 that encode from str to str. */ 421 422 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode( 423 PyObject *unicode, /* Unicode object */ 424 const char *encoding, /* encoding */ 425 const char *errors /* error handling */ 426 ); 427 428 /* Build an encoding map. */ 429 430 PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap( 431 PyObject* string /* 256 character map */ 432 ); 433 434 /* --- UTF-7 Codecs ------------------------------------------------------- */ 435 436 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7( 437 const char *string, /* UTF-7 encoded string */ 438 Py_ssize_t length, /* size of string */ 439 const char *errors /* error handling */ 440 ); 441 442 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful( 443 const char *string, /* UTF-7 encoded string */ 444 Py_ssize_t length, /* size of string */ 445 const char *errors, /* error handling */ 446 Py_ssize_t *consumed /* bytes consumed */ 447 ); 448 449 /* --- UTF-8 Codecs ------------------------------------------------------- */ 450 451 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8( 452 const char *string, /* UTF-8 encoded string */ 453 Py_ssize_t length, /* size of string */ 454 const char *errors /* error handling */ 455 ); 456 457 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful( 458 const char *string, /* UTF-8 encoded string */ 459 Py_ssize_t length, /* size of string */ 460 const char *errors, /* error handling */ 461 Py_ssize_t *consumed /* bytes consumed */ 462 ); 463 464 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String( 465 PyObject *unicode /* Unicode object */ 466 ); 467 468 /* --- UTF-32 Codecs ------------------------------------------------------ */ 469 470 /* Decodes length bytes from a UTF-32 encoded buffer string and returns 471 the corresponding Unicode object. 472 473 errors (if non-NULL) defines the error handling. It defaults 474 to "strict". 475 476 If byteorder is non-NULL, the decoder starts decoding using the 477 given byte order: 478 479 *byteorder == -1: little endian 480 *byteorder == 0: native order 481 *byteorder == 1: big endian 482 483 In native mode, the first four bytes of the stream are checked for a 484 BOM mark. If found, the BOM mark is analysed, the byte order 485 adjusted and the BOM skipped. In the other modes, no BOM mark 486 interpretation is done. After completion, *byteorder is set to the 487 current byte order at the end of input data. 488 489 If byteorder is NULL, the codec starts in native order mode. 490 491 */ 492 493 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32( 494 const char *string, /* UTF-32 encoded string */ 495 Py_ssize_t length, /* size of string */ 496 const char *errors, /* error handling */ 497 int *byteorder /* pointer to byteorder to use 498 0=native;-1=LE,1=BE; updated on 499 exit */ 500 ); 501 502 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful( 503 const char *string, /* UTF-32 encoded string */ 504 Py_ssize_t length, /* size of string */ 505 const char *errors, /* error handling */ 506 int *byteorder, /* pointer to byteorder to use 507 0=native;-1=LE,1=BE; updated on 508 exit */ 509 Py_ssize_t *consumed /* bytes consumed */ 510 ); 511 512 /* Returns a Python string using the UTF-32 encoding in native byte 513 order. The string always starts with a BOM mark. */ 514 515 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String( 516 PyObject *unicode /* Unicode object */ 517 ); 518 519 /* Returns a Python string object holding the UTF-32 encoded value of 520 the Unicode data. 521 522 If byteorder is not 0, output is written according to the following 523 byte order: 524 525 byteorder == -1: little endian 526 byteorder == 0: native byte order (writes a BOM mark) 527 byteorder == 1: big endian 528 529 If byteorder is 0, the output string will always start with the 530 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 531 prepended. 532 533 */ 534 535 /* --- UTF-16 Codecs ------------------------------------------------------ */ 536 537 /* Decodes length bytes from a UTF-16 encoded buffer string and returns 538 the corresponding Unicode object. 539 540 errors (if non-NULL) defines the error handling. It defaults 541 to "strict". 542 543 If byteorder is non-NULL, the decoder starts decoding using the 544 given byte order: 545 546 *byteorder == -1: little endian 547 *byteorder == 0: native order 548 *byteorder == 1: big endian 549 550 In native mode, the first two bytes of the stream are checked for a 551 BOM mark. If found, the BOM mark is analysed, the byte order 552 adjusted and the BOM skipped. In the other modes, no BOM mark 553 interpretation is done. After completion, *byteorder is set to the 554 current byte order at the end of input data. 555 556 If byteorder is NULL, the codec starts in native order mode. 557 558 */ 559 560 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16( 561 const char *string, /* UTF-16 encoded string */ 562 Py_ssize_t length, /* size of string */ 563 const char *errors, /* error handling */ 564 int *byteorder /* pointer to byteorder to use 565 0=native;-1=LE,1=BE; updated on 566 exit */ 567 ); 568 569 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful( 570 const char *string, /* UTF-16 encoded string */ 571 Py_ssize_t length, /* size of string */ 572 const char *errors, /* error handling */ 573 int *byteorder, /* pointer to byteorder to use 574 0=native;-1=LE,1=BE; updated on 575 exit */ 576 Py_ssize_t *consumed /* bytes consumed */ 577 ); 578 579 /* Returns a Python string using the UTF-16 encoding in native byte 580 order. The string always starts with a BOM mark. */ 581 582 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String( 583 PyObject *unicode /* Unicode object */ 584 ); 585 586 /* --- Unicode-Escape Codecs ---------------------------------------------- */ 587 588 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape( 589 const char *string, /* Unicode-Escape encoded string */ 590 Py_ssize_t length, /* size of string */ 591 const char *errors /* error handling */ 592 ); 593 594 PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString( 595 PyObject *unicode /* Unicode object */ 596 ); 597 598 /* --- Raw-Unicode-Escape Codecs ------------------------------------------ */ 599 600 PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape( 601 const char *string, /* Raw-Unicode-Escape encoded string */ 602 Py_ssize_t length, /* size of string */ 603 const char *errors /* error handling */ 604 ); 605 606 PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString( 607 PyObject *unicode /* Unicode object */ 608 ); 609 610 /* --- Latin-1 Codecs ----------------------------------------------------- 611 612 Note: Latin-1 corresponds to the first 256 Unicode ordinals. */ 613 614 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1( 615 const char *string, /* Latin-1 encoded string */ 616 Py_ssize_t length, /* size of string */ 617 const char *errors /* error handling */ 618 ); 619 620 PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String( 621 PyObject *unicode /* Unicode object */ 622 ); 623 624 /* --- ASCII Codecs ------------------------------------------------------- 625 626 Only 7-bit ASCII data is excepted. All other codes generate errors. 627 628 */ 629 630 PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII( 631 const char *string, /* ASCII encoded string */ 632 Py_ssize_t length, /* size of string */ 633 const char *errors /* error handling */ 634 ); 635 636 PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString( 637 PyObject *unicode /* Unicode object */ 638 ); 639 640 /* --- Character Map Codecs ----------------------------------------------- 641 642 This codec uses mappings to encode and decode characters. 643 644 Decoding mappings must map byte ordinals (integers in the range from 0 to 645 255) to Unicode strings, integers (which are then interpreted as Unicode 646 ordinals) or None. Unmapped data bytes (ones which cause a LookupError) 647 as well as mapped to None, 0xFFFE or '\ufffe' are treated as "undefined 648 mapping" and cause an error. 649 650 Encoding mappings must map Unicode ordinal integers to bytes objects, 651 integers in the range from 0 to 255 or None. Unmapped character 652 ordinals (ones which cause a LookupError) as well as mapped to 653 None are treated as "undefined mapping" and cause an error. 654 655 */ 656 657 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap( 658 const char *string, /* Encoded string */ 659 Py_ssize_t length, /* size of string */ 660 PyObject *mapping, /* decoding mapping */ 661 const char *errors /* error handling */ 662 ); 663 664 PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString( 665 PyObject *unicode, /* Unicode object */ 666 PyObject *mapping /* encoding mapping */ 667 ); 668 669 /* --- MBCS codecs for Windows -------------------------------------------- */ 670 671 #ifdef MS_WINDOWS 672 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS( 673 const char *string, /* MBCS encoded string */ 674 Py_ssize_t length, /* size of string */ 675 const char *errors /* error handling */ 676 ); 677 678 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful( 679 const char *string, /* MBCS encoded string */ 680 Py_ssize_t length, /* size of string */ 681 const char *errors, /* error handling */ 682 Py_ssize_t *consumed /* bytes consumed */ 683 ); 684 685 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 686 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful( 687 int code_page, /* code page number */ 688 const char *string, /* encoded string */ 689 Py_ssize_t length, /* size of string */ 690 const char *errors, /* error handling */ 691 Py_ssize_t *consumed /* bytes consumed */ 692 ); 693 #endif 694 695 PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( 696 PyObject *unicode /* Unicode object */ 697 ); 698 699 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 700 PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage( 701 int code_page, /* code page number */ 702 PyObject *unicode, /* Unicode object */ 703 const char *errors /* error handling */ 704 ); 705 #endif 706 707 #endif /* MS_WINDOWS */ 708 709 /* --- Locale encoding --------------------------------------------------- */ 710 711 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 712 /* Decode a string from the current locale encoding. The decoder is strict if 713 *surrogateescape* is equal to zero, otherwise it uses the 'surrogateescape' 714 error handler (PEP 383) to escape undecodable bytes. If a byte sequence can 715 be decoded as a surrogate character and *surrogateescape* is not equal to 716 zero, the byte sequence is escaped using the 'surrogateescape' error handler 717 instead of being decoded. *str* must end with a null character but cannot 718 contain embedded null characters. */ 719 720 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize( 721 const char *str, 722 Py_ssize_t len, 723 const char *errors); 724 725 /* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string 726 length using strlen(). */ 727 728 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale( 729 const char *str, 730 const char *errors); 731 732 /* Encode a Unicode object to the current locale encoding. The encoder is 733 strict is *surrogateescape* is equal to zero, otherwise the 734 "surrogateescape" error handler is used. Return a bytes object. The string 735 cannot contain embedded null characters. */ 736 737 PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale( 738 PyObject *unicode, 739 const char *errors 740 ); 741 #endif 742 743 /* --- File system encoding ---------------------------------------------- */ 744 745 /* ParseTuple converter: encode str objects to bytes using 746 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */ 747 748 PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*); 749 750 /* ParseTuple converter: decode bytes objects to unicode using 751 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */ 752 753 PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*); 754 755 /* Decode a null-terminated string using Py_FileSystemDefaultEncoding 756 and the "surrogateescape" error handler. 757 758 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 759 encoding. 760 761 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known. 762 */ 763 764 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault( 765 const char *s /* encoded string */ 766 ); 767 768 /* Decode a string using Py_FileSystemDefaultEncoding 769 and the "surrogateescape" error handler. 770 771 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 772 encoding. 773 */ 774 775 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize( 776 const char *s, /* encoded string */ 777 Py_ssize_t size /* size */ 778 ); 779 780 /* Encode a Unicode object to Py_FileSystemDefaultEncoding with the 781 "surrogateescape" error handler, and return bytes. 782 783 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 784 encoding. 785 */ 786 787 PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault( 788 PyObject *unicode 789 ); 790 791 /* --- Methods & Slots ---------------------------------------------------- 792 793 These are capable of handling Unicode objects and strings on input 794 (we refer to them as strings in the descriptions) and return 795 Unicode objects or integers as appropriate. */ 796 797 /* Concat two strings giving a new Unicode string. */ 798 799 PyAPI_FUNC(PyObject*) PyUnicode_Concat( 800 PyObject *left, /* Left string */ 801 PyObject *right /* Right string */ 802 ); 803 804 /* Concat two strings and put the result in *pleft 805 (sets *pleft to NULL on error) */ 806 807 PyAPI_FUNC(void) PyUnicode_Append( 808 PyObject **pleft, /* Pointer to left string */ 809 PyObject *right /* Right string */ 810 ); 811 812 /* Concat two strings, put the result in *pleft and drop the right object 813 (sets *pleft to NULL on error) */ 814 815 PyAPI_FUNC(void) PyUnicode_AppendAndDel( 816 PyObject **pleft, /* Pointer to left string */ 817 PyObject *right /* Right string */ 818 ); 819 820 /* Split a string giving a list of Unicode strings. 821 822 If sep is NULL, splitting will be done at all whitespace 823 substrings. Otherwise, splits occur at the given separator. 824 825 At most maxsplit splits will be done. If negative, no limit is set. 826 827 Separators are not included in the resulting list. 828 829 */ 830 831 PyAPI_FUNC(PyObject*) PyUnicode_Split( 832 PyObject *s, /* String to split */ 833 PyObject *sep, /* String separator */ 834 Py_ssize_t maxsplit /* Maxsplit count */ 835 ); 836 837 /* Dito, but split at line breaks. 838 839 CRLF is considered to be one line break. Line breaks are not 840 included in the resulting list. */ 841 842 PyAPI_FUNC(PyObject*) PyUnicode_Splitlines( 843 PyObject *s, /* String to split */ 844 int keepends /* If true, line end markers are included */ 845 ); 846 847 /* Partition a string using a given separator. */ 848 849 PyAPI_FUNC(PyObject*) PyUnicode_Partition( 850 PyObject *s, /* String to partition */ 851 PyObject *sep /* String separator */ 852 ); 853 854 /* Partition a string using a given separator, searching from the end of the 855 string. */ 856 857 PyAPI_FUNC(PyObject*) PyUnicode_RPartition( 858 PyObject *s, /* String to partition */ 859 PyObject *sep /* String separator */ 860 ); 861 862 /* Split a string giving a list of Unicode strings. 863 864 If sep is NULL, splitting will be done at all whitespace 865 substrings. Otherwise, splits occur at the given separator. 866 867 At most maxsplit splits will be done. But unlike PyUnicode_Split 868 PyUnicode_RSplit splits from the end of the string. If negative, 869 no limit is set. 870 871 Separators are not included in the resulting list. 872 873 */ 874 875 PyAPI_FUNC(PyObject*) PyUnicode_RSplit( 876 PyObject *s, /* String to split */ 877 PyObject *sep, /* String separator */ 878 Py_ssize_t maxsplit /* Maxsplit count */ 879 ); 880 881 /* Translate a string by applying a character mapping table to it and 882 return the resulting Unicode object. 883 884 The mapping table must map Unicode ordinal integers to Unicode strings, 885 Unicode ordinal integers or None (causing deletion of the character). 886 887 Mapping tables may be dictionaries or sequences. Unmapped character 888 ordinals (ones which cause a LookupError) are left untouched and 889 are copied as-is. 890 891 */ 892 893 PyAPI_FUNC(PyObject *) PyUnicode_Translate( 894 PyObject *str, /* String */ 895 PyObject *table, /* Translate table */ 896 const char *errors /* error handling */ 897 ); 898 899 /* Join a sequence of strings using the given separator and return 900 the resulting Unicode string. */ 901 902 PyAPI_FUNC(PyObject*) PyUnicode_Join( 903 PyObject *separator, /* Separator string */ 904 PyObject *seq /* Sequence object */ 905 ); 906 907 /* Return 1 if substr matches str[start:end] at the given tail end, 0 908 otherwise. */ 909 910 PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch( 911 PyObject *str, /* String */ 912 PyObject *substr, /* Prefix or Suffix string */ 913 Py_ssize_t start, /* Start index */ 914 Py_ssize_t end, /* Stop index */ 915 int direction /* Tail end: -1 prefix, +1 suffix */ 916 ); 917 918 /* Return the first position of substr in str[start:end] using the 919 given search direction or -1 if not found. -2 is returned in case 920 an error occurred and an exception is set. */ 921 922 PyAPI_FUNC(Py_ssize_t) PyUnicode_Find( 923 PyObject *str, /* String */ 924 PyObject *substr, /* Substring to find */ 925 Py_ssize_t start, /* Start index */ 926 Py_ssize_t end, /* Stop index */ 927 int direction /* Find direction: +1 forward, -1 backward */ 928 ); 929 930 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 931 /* Like PyUnicode_Find, but search for single character only. */ 932 PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar( 933 PyObject *str, 934 Py_UCS4 ch, 935 Py_ssize_t start, 936 Py_ssize_t end, 937 int direction 938 ); 939 #endif 940 941 /* Count the number of occurrences of substr in str[start:end]. */ 942 943 PyAPI_FUNC(Py_ssize_t) PyUnicode_Count( 944 PyObject *str, /* String */ 945 PyObject *substr, /* Substring to count */ 946 Py_ssize_t start, /* Start index */ 947 Py_ssize_t end /* Stop index */ 948 ); 949 950 /* Replace at most maxcount occurrences of substr in str with replstr 951 and return the resulting Unicode object. */ 952 953 PyAPI_FUNC(PyObject *) PyUnicode_Replace( 954 PyObject *str, /* String */ 955 PyObject *substr, /* Substring to find */ 956 PyObject *replstr, /* Substring to replace */ 957 Py_ssize_t maxcount /* Max. number of replacements to apply; 958 -1 = all */ 959 ); 960 961 /* Compare two strings and return -1, 0, 1 for less than, equal, 962 greater than resp. 963 Raise an exception and return -1 on error. */ 964 965 PyAPI_FUNC(int) PyUnicode_Compare( 966 PyObject *left, /* Left string */ 967 PyObject *right /* Right string */ 968 ); 969 970 /* Compare a Unicode object with C string and return -1, 0, 1 for less than, 971 equal, and greater than, respectively. It is best to pass only 972 ASCII-encoded strings, but the function interprets the input string as 973 ISO-8859-1 if it contains non-ASCII characters. 974 This function does not raise exceptions. */ 975 976 PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString( 977 PyObject *left, 978 const char *right /* ASCII-encoded string */ 979 ); 980 981 /* Rich compare two strings and return one of the following: 982 983 - NULL in case an exception was raised 984 - Py_True or Py_False for successful comparisons 985 - Py_NotImplemented in case the type combination is unknown 986 987 Possible values for op: 988 989 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE 990 991 */ 992 993 PyAPI_FUNC(PyObject *) PyUnicode_RichCompare( 994 PyObject *left, /* Left string */ 995 PyObject *right, /* Right string */ 996 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */ 997 ); 998 999 /* Apply an argument tuple or dictionary to a format string and return 1000 the resulting Unicode string. */ 1001 1002 PyAPI_FUNC(PyObject *) PyUnicode_Format( 1003 PyObject *format, /* Format string */ 1004 PyObject *args /* Argument tuple or dictionary */ 1005 ); 1006 1007 /* Checks whether element is contained in container and return 1/0 1008 accordingly. 1009 1010 element has to coerce to a one element Unicode string. -1 is 1011 returned in case of an error. */ 1012 1013 PyAPI_FUNC(int) PyUnicode_Contains( 1014 PyObject *container, /* Container string */ 1015 PyObject *element /* Element string */ 1016 ); 1017 1018 /* Checks whether argument is a valid identifier. */ 1019 1020 PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s); 1021 1022 /* === Characters Type APIs =============================================== */ 1023 1024 #ifndef Py_LIMITED_API 1025 # define Py_CPYTHON_UNICODEOBJECT_H 1026 # include "cpython/unicodeobject.h" 1027 # undef Py_CPYTHON_UNICODEOBJECT_H 1028 #endif 1029 1030 #ifdef __cplusplus 1031 } 1032 #endif 1033 #endif /* !Py_UNICODEOBJECT_H */ 1034