1 #ifndef Py_UNICODEOBJECT_H 2 #define Py_UNICODEOBJECT_H 3 4 #include <stdarg.h> 5 6 /* 7 8 Unicode implementation based on original code by Fredrik Lundh, 9 modified by Marc-Andre Lemburg (mal@lemburg.com) according to the 10 Unicode Integration Proposal. (See 11 http://www.egenix.com/files/python/unicode-proposal.txt). 12 13 Copyright (c) Corporation for National Research Initiatives. 14 15 16 Original header: 17 -------------------------------------------------------------------- 18 19 * Yet another Unicode string type for Python. This type supports the 20 * 16-bit Basic Multilingual Plane (BMP) only. 21 * 22 * Written by Fredrik Lundh, January 1999. 23 * 24 * Copyright (c) 1999 by Secret Labs AB. 25 * Copyright (c) 1999 by Fredrik Lundh. 26 * 27 * fredrik@pythonware.com 28 * http://www.pythonware.com 29 * 30 * -------------------------------------------------------------------- 31 * This Unicode String Type is 32 * 33 * Copyright (c) 1999 by Secret Labs AB 34 * Copyright (c) 1999 by Fredrik Lundh 35 * 36 * By obtaining, using, and/or copying this software and/or its 37 * associated documentation, you agree that you have read, understood, 38 * and will comply with the following terms and conditions: 39 * 40 * Permission to use, copy, modify, and distribute this software and its 41 * associated documentation for any purpose and without fee is hereby 42 * granted, provided that the above copyright notice appears in all 43 * copies, and that both that copyright notice and this permission notice 44 * appear in supporting documentation, and that the name of Secret Labs 45 * AB or the author not be used in advertising or publicity pertaining to 46 * distribution of the software without specific, written prior 47 * permission. 48 * 49 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 51 * FITNESS. 
IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 56 * -------------------------------------------------------------------- */ 57 58 #include <ctype.h> 59 60 /* === Internal API ======================================================= */ 61 62 /* --- Internal Unicode Format -------------------------------------------- */ 63 64 /* Python 3.x requires unicode */ 65 #define Py_USING_UNICODE 66 67 #ifndef SIZEOF_WCHAR_T 68 #error Must define SIZEOF_WCHAR_T 69 #endif 70 71 #define Py_UNICODE_SIZE SIZEOF_WCHAR_T 72 73 /* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE. 74 Otherwise, Unicode strings are stored as UCS-2 (with limited support 75 for UTF-16) */ 76 77 #if Py_UNICODE_SIZE >= 4 78 #define Py_UNICODE_WIDE 79 #endif 80 81 /* Set these flags if the platform has "wchar.h" and the 82 wchar_t type is a 16-bit unsigned type */ 83 /* #define HAVE_WCHAR_H */ 84 /* #define HAVE_USABLE_WCHAR_T */ 85 86 /* Py_UNICODE was the native Unicode storage format (code unit) used by 87 Python and represents a single Unicode element in the Unicode type. 88 With PEP 393, Py_UNICODE is deprecated and replaced with a 89 typedef to wchar_t. */ 90 91 #ifndef Py_LIMITED_API 92 #define PY_UNICODE_TYPE wchar_t 93 typedef wchar_t Py_UNICODE; 94 #endif 95 96 /* If the compiler provides a wchar_t type we try to support it 97 through the interface functions PyUnicode_FromWideChar(), 98 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). 
*/ 99 100 #ifdef HAVE_USABLE_WCHAR_T 101 # ifndef HAVE_WCHAR_H 102 # define HAVE_WCHAR_H 103 # endif 104 #endif 105 106 #ifdef HAVE_WCHAR_H 107 /* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */ 108 # ifdef _HAVE_BSDI 109 # include <time.h> 110 # endif 111 # include <wchar.h> 112 #endif 113 114 /* Py_UCS4 and Py_UCS2 are typedefs for the respective 115 unicode representations. */ 116 typedef uint32_t Py_UCS4; 117 typedef uint16_t Py_UCS2; 118 typedef uint8_t Py_UCS1; 119 120 /* --- Internal Unicode Operations ---------------------------------------- */ 121 122 /* Since splitting on whitespace is an important use case, and 123 whitespace in most situations is solely ASCII whitespace, we 124 optimize for the common case by using a quick look-up table 125 _Py_ascii_whitespace (see below) with an inlined check. 126 127 */ 128 #ifndef Py_LIMITED_API 129 #define Py_UNICODE_ISSPACE(ch) \ 130 ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch)) 131 132 #define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch) 133 #define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch) 134 #define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) 135 #define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) 136 137 #define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch) 138 #define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch) 139 #define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) 140 141 #define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) 142 #define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) 143 #define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) 144 #define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch) 145 146 #define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) 147 #define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) 148 #define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) 149 150 #define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch) 151 152 #define Py_UNICODE_ISALNUM(ch) \ 153 
(Py_UNICODE_ISALPHA(ch) || \ 154 Py_UNICODE_ISDECIMAL(ch) || \ 155 Py_UNICODE_ISDIGIT(ch) || \ 156 Py_UNICODE_ISNUMERIC(ch)) 157 158 #define Py_UNICODE_COPY(target, source, length) \ 159 memcpy((target), (source), (length)*sizeof(Py_UNICODE)) 160 161 #define Py_UNICODE_FILL(target, value, length) \ 162 do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\ 163 for (i_ = 0; i_ < (length); i_++) t_[i_] = v_;\ 164 } while (0) 165 166 /* macros to work with surrogates */ 167 #define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF) 168 #define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF) 169 #define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF) 170 /* Join two surrogate characters and return a single Py_UCS4 value. */ 171 #define Py_UNICODE_JOIN_SURROGATES(high, low) \ 172 (((((Py_UCS4)(high) & 0x03FF) << 10) | \ 173 ((Py_UCS4)(low) & 0x03FF)) + 0x10000) 174 /* high surrogate = top 10 bits added to D800 */ 175 #define Py_UNICODE_HIGH_SURROGATE(ch) (0xD800 - (0x10000 >> 10) + ((ch) >> 10)) 176 /* low surrogate = bottom 10 bits added to DC00 */ 177 #define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00 + ((ch) & 0x3FF)) 178 179 /* Check if substring matches at given offset. The offset must be 180 valid, and the substring must not be empty. */ 181 182 #define Py_UNICODE_MATCH(string, offset, substring) \ 183 ((*((string)->wstr + (offset)) == *((substring)->wstr)) && \ 184 ((*((string)->wstr + (offset) + (substring)->wstr_length-1) == *((substring)->wstr + (substring)->wstr_length-1))) && \ 185 !memcmp((string)->wstr + (offset), (substring)->wstr, (substring)->wstr_length*sizeof(Py_UNICODE))) 186 187 #endif /* Py_LIMITED_API */ 188 189 #ifdef __cplusplus 190 extern "C" { 191 #endif 192 193 /* --- Unicode Type ------------------------------------------------------- */ 194 195 #ifndef Py_LIMITED_API 196 197 /* ASCII-only strings created through PyUnicode_New use the PyASCIIObject 198 structure. 
state.ascii and state.compact are set, and the data
   immediately follows the structure. utf8_length and wstr_length can be found
   in the length field; the utf8 pointer is equal to the data pointer. */
typedef struct {
    /* There are 4 forms of Unicode strings:

       - compact ascii:

         * structure = PyASCIIObject
         * test: PyUnicode_IS_COMPACT_ASCII(op)
         * kind = PyUnicode_1BYTE_KIND
         * compact = 1
         * ascii = 1
         * ready = 1
         * (length is the length of the utf8 and wstr strings)
         * (data starts just after the structure)
         * (since ASCII is decoded from UTF-8, the utf8 string is the data)

       - compact:

         * structure = PyCompactUnicodeObject
         * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
           PyUnicode_4BYTE_KIND
         * compact = 1
         * ready = 1
         * ascii = 0
         * utf8 is not shared with data
         * utf8_length = 0 if utf8 is NULL
         * wstr is shared with data and wstr_length=length
           if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
           or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
         * wstr_length = 0 if wstr is NULL
         * (data starts just after the structure)

       - legacy string, not ready:

         * structure = PyUnicodeObject
         * test: kind == PyUnicode_WCHAR_KIND
         * length = 0 (use wstr_length)
         * hash = -1
         * kind = PyUnicode_WCHAR_KIND
         * compact = 0
         * ascii = 0
         * ready = 0
         * interned = SSTATE_NOT_INTERNED
         * wstr is not NULL
         * data.any is NULL
         * utf8 is NULL
         * utf8_length = 0

       - legacy string, ready:

         * structure = PyUnicodeObject structure
         * test: !PyUnicode_IS_COMPACT(op) && kind != PyUnicode_WCHAR_KIND
         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
           PyUnicode_4BYTE_KIND
         * compact = 0
         * ready = 1
         * data.any is not NULL
         * utf8 is shared with data.any and utf8_length = length if ascii = 1
         * utf8_length = 0 if utf8 is NULL
         * wstr is shared with data.any and wstr_length = length
           if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
           or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
         * wstr_length = 0 if wstr is NULL

       Compact strings use only one memory block (structure + characters),
       whereas legacy strings use one block for the structure and one block
       for characters.

       Legacy strings are created by PyUnicode_FromUnicode() and
       PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
       when PyUnicode_READY() is called.

       See also _PyUnicode_CheckConsistency().
    */
    PyObject_HEAD
    Py_ssize_t length;          /* Number of code points in the string */
    Py_hash_t hash;             /* Hash value; -1 if not set */
    /* Bit-field describing the string form; the named fields use 8 bits and
       the unnamed padding member the remaining 24. */
    struct {
        /*
           SSTATE_NOT_INTERNED (0)
           SSTATE_INTERNED_MORTAL (1)
           SSTATE_INTERNED_IMMORTAL (2)

           If interned != SSTATE_NOT_INTERNED, the two references from the
           dictionary to this object are *not* counted in ob_refcnt.
         */
        unsigned int interned:2;
        /* Character size:

           - PyUnicode_WCHAR_KIND (0):

             * character type = wchar_t (16 or 32 bits, depending on the
               platform)

           - PyUnicode_1BYTE_KIND (1):

             * character type = Py_UCS1 (8 bits, unsigned)
             * all characters are in the range U+0000-U+00FF (latin1)
             * if ascii is set, all characters are in the range U+0000-U+007F
               (ASCII), otherwise at least one character is in the range
               U+0080-U+00FF

           - PyUnicode_2BYTE_KIND (2):

             * character type = Py_UCS2 (16 bits, unsigned)
             * all characters are in the range U+0000-U+FFFF (BMP)
             * at least one character is in the range U+0100-U+FFFF

           - PyUnicode_4BYTE_KIND (4):

             * character type = Py_UCS4 (32 bits, unsigned)
             * all characters are in the range U+0000-U+10FFFF
             * at least one character is in the range U+10000-U+10FFFF
         */
        unsigned int kind:3;
        /* Compact is with respect to the allocation scheme. Compact unicode
           objects only require one memory block while non-compact objects use
           one block for the PyUnicodeObject struct and another for its data
           buffer. */
        unsigned int compact:1;
        /* The string only contains characters in the range U+0000-U+007F (ASCII)
           and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
           set, use the PyASCIIObject structure. */
        unsigned int ascii:1;
        /* The ready flag indicates whether the object layout is initialized
           completely. This means that this is either a compact object, or
           the data pointer is filled out. The bit is redundant, and helps
           to minimize the test in PyUnicode_IS_READY(). */
        unsigned int ready:1;
        /* Padding to ensure that PyUnicode_DATA() is always aligned to
           4 bytes (see issue #19537 on m68k). */
        unsigned int :24;
    } state;
    wchar_t *wstr;              /* wchar_t representation (null-terminated) */
} PyASCIIObject;

/* Non-ASCII strings allocated through PyUnicode_New use the
   PyCompactUnicodeObject structure. state.compact is set, and the data
   immediately follows the structure. */
typedef struct {
    PyASCIIObject _base;        /* Common header; must be the first member so
                                 * the object can be cast to PyASCIIObject* */
    Py_ssize_t utf8_length;     /* Number of bytes in utf8, excluding the
                                 * terminating \0. */
    char *utf8;                 /* UTF-8 representation (null-terminated) */
    Py_ssize_t wstr_length;     /* Number of code points in wstr, possible
                                 * surrogates count as two code points. */
} PyCompactUnicodeObject;

/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
   PyUnicodeObject structure. The actual string data is initially in the wstr
   block, and copied into the data block using _PyUnicode_Ready.
*/ 352 typedef struct { 353 PyCompactUnicodeObject _base; 354 union { 355 void *any; 356 Py_UCS1 *latin1; 357 Py_UCS2 *ucs2; 358 Py_UCS4 *ucs4; 359 } data; /* Canonical, smallest-form Unicode buffer */ 360 } PyUnicodeObject; 361 #endif 362 363 PyAPI_DATA(PyTypeObject) PyUnicode_Type; 364 PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; 365 366 #define PyUnicode_Check(op) \ 367 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS) 368 #define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type) 369 370 /* Fast access macros */ 371 #ifndef Py_LIMITED_API 372 373 #define PyUnicode_WSTR_LENGTH(op) \ 374 (PyUnicode_IS_COMPACT_ASCII(op) ? \ 375 ((PyASCIIObject*)op)->length : \ 376 ((PyCompactUnicodeObject*)op)->wstr_length) 377 378 /* Returns the deprecated Py_UNICODE representation's size in code units 379 (this includes surrogate pairs as 2 units). 380 If the Py_UNICODE representation is not available, it will be computed 381 on request. Use PyUnicode_GET_LENGTH() for the length in code points. */ 382 383 #define PyUnicode_GET_SIZE(op) \ 384 (assert(PyUnicode_Check(op)), \ 385 (((PyASCIIObject *)(op))->wstr) ? \ 386 PyUnicode_WSTR_LENGTH(op) : \ 387 ((void)PyUnicode_AsUnicode((PyObject *)(op)), \ 388 assert(((PyASCIIObject *)(op))->wstr), \ 389 PyUnicode_WSTR_LENGTH(op))) 390 391 #define PyUnicode_GET_DATA_SIZE(op) \ 392 (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE) 393 394 /* Alias for PyUnicode_AsUnicode(). This will create a wchar_t/Py_UNICODE 395 representation on demand. Using this macro is very inefficient now, 396 try to port your code to use the new PyUnicode_*BYTE_DATA() macros or 397 use PyUnicode_WRITE() and PyUnicode_READ(). */ 398 399 #define PyUnicode_AS_UNICODE(op) \ 400 (assert(PyUnicode_Check(op)), \ 401 (((PyASCIIObject *)(op))->wstr) ? 
(((PyASCIIObject *)(op))->wstr) : \ 402 PyUnicode_AsUnicode((PyObject *)(op))) 403 404 #define PyUnicode_AS_DATA(op) \ 405 ((const char *)(PyUnicode_AS_UNICODE(op))) 406 407 408 /* --- Flexible String Representation Helper Macros (PEP 393) -------------- */ 409 410 /* Values for PyASCIIObject.state: */ 411 412 /* Interning state. */ 413 #define SSTATE_NOT_INTERNED 0 414 #define SSTATE_INTERNED_MORTAL 1 415 #define SSTATE_INTERNED_IMMORTAL 2 416 417 /* Return true if the string contains only ASCII characters, or 0 if not. The 418 string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be 419 ready. */ 420 #define PyUnicode_IS_ASCII(op) \ 421 (assert(PyUnicode_Check(op)), \ 422 assert(PyUnicode_IS_READY(op)), \ 423 ((PyASCIIObject*)op)->state.ascii) 424 425 /* Return true if the string is compact or 0 if not. 426 No type checks or Ready calls are performed. */ 427 #define PyUnicode_IS_COMPACT(op) \ 428 (((PyASCIIObject*)(op))->state.compact) 429 430 /* Return true if the string is a compact ASCII string (use PyASCIIObject 431 structure), or 0 if not. No type checks or Ready calls are performed. */ 432 #define PyUnicode_IS_COMPACT_ASCII(op) \ 433 (((PyASCIIObject*)op)->state.ascii && PyUnicode_IS_COMPACT(op)) 434 435 enum PyUnicode_Kind { 436 /* String contains only wstr byte characters. This is only possible 437 when the string was created with a legacy API and _PyUnicode_Ready() 438 has not been called yet. */ 439 PyUnicode_WCHAR_KIND = 0, 440 /* Return values of the PyUnicode_KIND() macro: */ 441 PyUnicode_1BYTE_KIND = 1, 442 PyUnicode_2BYTE_KIND = 2, 443 PyUnicode_4BYTE_KIND = 4 444 }; 445 446 /* Return pointers to the canonical representation cast to unsigned char, 447 Py_UCS2, or Py_UCS4 for direct character access. 448 No checks are performed, use PyUnicode_KIND() before to ensure 449 these will work correctly. 
*/

#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))

/* Return one of the PyUnicode_*_KIND values defined above.
   The string must be ready (checked with assert() only). */
#define PyUnicode_KIND(op) \
    (assert(PyUnicode_Check(op)), \
     assert(PyUnicode_IS_READY(op)),            \
     ((PyASCIIObject *)(op))->state.kind)

/* Return a void pointer to the raw unicode buffer.
   For compact strings the buffer starts right after the structure
   (PyASCIIObject for ASCII strings, PyCompactUnicodeObject otherwise). */
#define _PyUnicode_COMPACT_DATA(op)                     \
    (PyUnicode_IS_ASCII(op) ?                   \
     ((void*)((PyASCIIObject*)(op) + 1)) :              \
     ((void*)((PyCompactUnicodeObject*)(op) + 1)))

/* For non-compact strings the buffer is the separate data.any block, which
   is asserted to be non-NULL. */
#define _PyUnicode_NONCOMPACT_DATA(op)                  \
    (assert(((PyUnicodeObject*)(op))->data.any),        \
     ((((PyUnicodeObject *)(op))->data.any)))

/* Dispatch on the compact flag to one of the two macros above. */
#define PyUnicode_DATA(op) \
    (assert(PyUnicode_Check(op)), \
     PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) :   \
     _PyUnicode_NONCOMPACT_DATA(op))

/* In the access macros below, "kind" may be evaluated more than once.
   All other macro parameters are evaluated exactly once, so it is safe
   to put side effects into them (such as increasing the index). */

/* Write into the canonical representation, this macro does not do any sanity
   checks and is intended for usage in loops.  The caller should cache the
   kind and data pointers obtained from other macro calls.
   index is the index in the string (starts at 0) and value is the new
   code point value which should be written to that location. */
#define PyUnicode_WRITE(kind, data, index, value) \
    do { \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
            break; \
        } \
        default: { \
            assert((kind) == PyUnicode_4BYTE_KIND); \
            ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
        } \
        } \
    } while (0)

/* Read a code point from the string's canonical representation.  No checks
   or ready calls are performed.  Any kind other than 1BYTE/2BYTE is read
   as Py_UCS4. */
#define PyUnicode_READ(kind, data, index) \
    ((Py_UCS4) \
    ((kind) == PyUnicode_1BYTE_KIND ? \
        ((const Py_UCS1 *)(data))[(index)] : \
        ((kind) == PyUnicode_2BYTE_KIND ? \
            ((const Py_UCS2 *)(data))[(index)] : \
            ((const Py_UCS4 *)(data))[(index)] \
        ) \
    ))

/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
   calls PyUnicode_KIND() and might call it twice.  For single reads, use
   PyUnicode_READ_CHAR, for multiple consecutive reads callers should
   cache kind and use PyUnicode_READ instead.
   Note: "unicode" is evaluated several times, "index" exactly once. */
#define PyUnicode_READ_CHAR(unicode, index) \
    (assert(PyUnicode_Check(unicode)),          \
     assert(PyUnicode_IS_READY(unicode)),       \
     (Py_UCS4)                                  \
        (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \
            ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \
            (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \
                ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \
                ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \
            ) \
        ))

/* Returns the length of the unicode string. The caller has to make sure that
   the string has its canonical representation set before calling
   this macro.  Call PyUnicode_(FAST_)Ready to ensure that.
*/ 534 #define PyUnicode_GET_LENGTH(op) \ 535 (assert(PyUnicode_Check(op)), \ 536 assert(PyUnicode_IS_READY(op)), \ 537 ((PyASCIIObject *)(op))->length) 538 539 540 /* Fast check to determine whether an object is ready. Equivalent to 541 PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any) */ 542 543 #define PyUnicode_IS_READY(op) (((PyASCIIObject*)op)->state.ready) 544 545 /* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best 546 case. If the canonical representation is not yet set, it will still call 547 _PyUnicode_Ready(). 548 Returns 0 on success and -1 on errors. */ 549 #define PyUnicode_READY(op) \ 550 (assert(PyUnicode_Check(op)), \ 551 (PyUnicode_IS_READY(op) ? \ 552 0 : _PyUnicode_Ready((PyObject *)(op)))) 553 554 /* Return a maximum character value which is suitable for creating another 555 string based on op. This is always an approximation but more efficient 556 than iterating over the string. */ 557 #define PyUnicode_MAX_CHAR_VALUE(op) \ 558 (assert(PyUnicode_IS_READY(op)), \ 559 (PyUnicode_IS_ASCII(op) ? \ 560 (0x7f) : \ 561 (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? \ 562 (0xffU) : \ 563 (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? \ 564 (0xffffU) : \ 565 (0x10ffffU))))) 566 567 #endif 568 569 /* --- Constants ---------------------------------------------------------- */ 570 571 /* This Unicode character will be used as replacement character during 572 decoding if the errors argument is set to "replace". Note: the 573 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in 574 Unicode 3.0. */ 575 576 #define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD) 577 578 /* === Public API ========================================================= */ 579 580 /* --- Plain Py_UNICODE --------------------------------------------------- */ 581 582 /* With PEP 393, this is the recommended way to allocate a new unicode object. 583 This function will allocate the object and its buffer in a single memory 584 block. 
Objects created using this function are not resizable. */
#ifndef Py_LIMITED_API
PyAPI_FUNC(PyObject*) PyUnicode_New(
    Py_ssize_t size,            /* Number of code points in the new string */
    Py_UCS4 maxchar             /* maximum code point value in the string */
    );
#endif

/* Initializes the canonical string representation from the deprecated
   wstr/Py_UNICODE representation. This function is used to convert Unicode
   objects which were created using the old API to the new flexible format
   introduced with PEP 393.

   Don't call this function directly, use the public PyUnicode_READY() macro
   instead. */
#ifndef Py_LIMITED_API
PyAPI_FUNC(int) _PyUnicode_Ready(
    PyObject *unicode           /* Unicode object */
    );
#endif

/* Get a copy of a Unicode string. */
#ifndef Py_LIMITED_API
PyAPI_FUNC(PyObject*) _PyUnicode_Copy(
    PyObject *unicode
    );
#endif

/* Copy characters from one unicode object into another; this function
   performs character conversion when necessary and falls back to memcpy()
   if possible.

   Fail if to is too small (smaller than *how_many* or smaller than
   len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
   kind(to), or if *to* has more than 1 reference.

   Return the number of written characters, or return -1 and raise an
   exception on error.

   Pseudo-code:

       how_many = min(how_many, len(from) - from_start)
       to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
       return how_many

   Note: The function doesn't write a terminating null character.
   */
#ifndef Py_LIMITED_API
PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
    PyObject *to,
    Py_ssize_t to_start,
    PyObject *from,
    Py_ssize_t from_start,
    Py_ssize_t how_many
    );

/* Unsafe version of PyUnicode_CopyCharacters(): don't check arguments and so
   may crash if parameters are invalid (e.g. if the output string
   is too short). */
PyAPI_FUNC(void) _PyUnicode_FastCopyCharacters(
    PyObject *to,
    Py_ssize_t to_start,
    PyObject *from,
    Py_ssize_t from_start,
    Py_ssize_t how_many
    );
#endif

#ifndef Py_LIMITED_API
/* Fill a string with a character: write fill_char into
   unicode[start:start+length].

   Fail if fill_char is bigger than the string maximum character, or if the
   string has more than 1 reference.

   Return the number of written characters, or return -1 and raise an
   exception on error. */
PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill(
    PyObject *unicode,
    Py_ssize_t start,
    Py_ssize_t length,
    Py_UCS4 fill_char
    );

/* Unsafe version of PyUnicode_Fill(): don't check arguments and so may crash
   if parameters are invalid (e.g. if length is longer than the string). */
PyAPI_FUNC(void) _PyUnicode_FastFill(
    PyObject *unicode,
    Py_ssize_t start,
    Py_ssize_t length,
    Py_UCS4 fill_char
    );
#endif

/* Create a Unicode Object from the Py_UNICODE buffer u of the given
   size.

   u may be NULL which causes the contents to be undefined. It is the
   user's responsibility to fill in the needed data afterwards. Note
   that modifying the Unicode object contents after construction is
   only allowed if u was set to NULL.

   The buffer is copied into the new object. */

#ifndef Py_LIMITED_API
PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
    const Py_UNICODE *u,        /* Unicode buffer */
    Py_ssize_t size             /* size of buffer */
    );
#endif

/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
    const char *u,             /* UTF-8 encoded string */
    Py_ssize_t size            /* size of buffer */
    );

/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
   UTF-8 encoded bytes.  The size is determined with strlen().
*/
PyAPI_FUNC(PyObject*) PyUnicode_FromString(
    const char *u              /* UTF-8 encoded string */
    );

#ifndef Py_LIMITED_API
/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
   Scan the string to find the maximum character. */
PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
    int kind,
    const void *buffer,
    Py_ssize_t size);

/* Create a new string from a buffer of ASCII characters.
   WARNING: Don't check if the string contains any non-ASCII character. */
PyAPI_FUNC(PyObject*) _PyUnicode_FromASCII(
    const char *buffer,
    Py_ssize_t size);
#endif

#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
/* Return the substring str[start:end] (half-open range, judging by the
   parameter names; TODO confirm against the implementation). */
PyAPI_FUNC(PyObject*) PyUnicode_Substring(
    PyObject *str,
    Py_ssize_t start,
    Py_ssize_t end);
#endif

#ifndef Py_LIMITED_API
/* Compute the maximum character of the substring unicode[start:end].
   Return 127 for an empty string. */
PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar (
    PyObject *unicode,
    Py_ssize_t start,
    Py_ssize_t end);
#endif

#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
/* Copy the string into a UCS4 buffer including the null character if copy_null
   is set. Return NULL and raise an exception on error. Raise a SystemError if
   the buffer is smaller than the string. Return buffer on success.

   buflen is the length of the buffer in (Py_UCS4) characters. */
PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
    PyObject *unicode,
    Py_UCS4* buffer,
    Py_ssize_t buflen,
    int copy_null);

/* Copy the string into a UCS4 buffer. A new buffer is allocated using
 * PyMem_Malloc; if this fails, NULL is returned with a memory error
   exception set. */
PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
#endif

/* Return a read-only pointer to the Unicode object's internal
   Py_UNICODE buffer.
   If the wchar_t/Py_UNICODE representation is not yet available, this
   function will calculate it. */

#ifndef Py_LIMITED_API
PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
    PyObject *unicode           /* Unicode object */
    );
#endif

/* Return a read-only pointer to the Unicode object's internal
   Py_UNICODE buffer and save the length at size.
   If the wchar_t/Py_UNICODE representation is not yet available, this
   function will calculate it. */

#ifndef Py_LIMITED_API
PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
    PyObject *unicode,          /* Unicode object */
    Py_ssize_t *size            /* location where to save the length */
    );
#endif

#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
/* Get the length of the Unicode object. */

PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
    PyObject *unicode
);
#endif

/* Get the number of Py_UNICODE units in the
   string representation. */

PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
    PyObject *unicode           /* Unicode object */
    );

#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
/* Read a character from the string. */

PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
    PyObject *unicode,
    Py_ssize_t index
    );

/* Write a character to the string.  The string must have been created through
   PyUnicode_New, must not be shared, and must not have been hashed yet.

   Return 0 on success, -1 on error. */

PyAPI_FUNC(int) PyUnicode_WriteChar(
    PyObject *unicode,
    Py_ssize_t index,
    Py_UCS4 character
    );
#endif

#ifndef Py_LIMITED_API
/* Get the maximum ordinal for a Unicode character. */
PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
#endif

/* Resize a Unicode object. The length is the number of characters, except
   if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length
   is the number of Py_UNICODE characters.
821 822 *unicode is modified to point to the new (resized) object and 0 823 returned on success. 824 825 Try to resize the string in place (which is usually faster than allocating 826 a new string and copy characters), or create a new string. 827 828 Error handling is implemented as follows: an exception is set, -1 829 is returned and *unicode left untouched. 830 831 WARNING: The function doesn't check string content, the result may not be a 832 string in canonical representation. */ 833 834 PyAPI_FUNC(int) PyUnicode_Resize( 835 PyObject **unicode, /* Pointer to the Unicode object */ 836 Py_ssize_t length /* New length */ 837 ); 838 839 /* Decode obj to a Unicode object. 840 841 bytes, bytearray and other bytes-like objects are decoded according to the 842 given encoding and error handler. The encoding and error handler can be 843 NULL to have the interface use UTF-8 and "strict". 844 845 All other objects (including Unicode objects) raise an exception. 846 847 The API returns NULL in case of an error. The caller is responsible 848 for decref'ing the returned objects. 849 850 */ 851 852 PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject( 853 PyObject *obj, /* Object */ 854 const char *encoding, /* encoding */ 855 const char *errors /* error handling */ 856 ); 857 858 /* Copy an instance of a Unicode subtype to a new true Unicode object if 859 necessary. If obj is already a true Unicode object (not a subtype), return 860 the reference with *incremented* refcount. 861 862 The API returns NULL in case of an error. The caller is responsible 863 for decref'ing the returned objects. 864 865 */ 866 867 PyAPI_FUNC(PyObject*) PyUnicode_FromObject( 868 PyObject *obj /* Object */ 869 ); 870 871 PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV( 872 const char *format, /* ASCII-encoded string */ 873 va_list vargs 874 ); 875 PyAPI_FUNC(PyObject *) PyUnicode_FromFormat( 876 const char *format, /* ASCII-encoded string */ 877 ... 
878 ); 879 880 #ifndef Py_LIMITED_API /* State of an incremental Unicode string builder. _PyUnicodeWriter_Prepare() compares maxchar against the requested maximum character and treats size - pos as the room still available for writing; _PyUnicodeWriter_PrepareKind() compares kind against the requested kind. NOTE(review): data presumably points at the character storage of buffer -- confirm in the implementation. */ 881 typedef struct { 882 PyObject *buffer; 883 void *data; 884 enum PyUnicode_Kind kind; 885 Py_UCS4 maxchar; 886 Py_ssize_t size; 887 Py_ssize_t pos; 888 889 /* minimum number of allocated characters (default: 0) */ 890 Py_ssize_t min_length; 891 892 /* minimum character (default: 127, ASCII) */ 893 Py_UCS4 min_char; 894 895 /* If non-zero, overallocate the buffer (default: 0). */ 896 unsigned char overallocate; 897 898 /* If readonly is 1, buffer is a shared string (cannot be modified) 899 and size is set to 0. */ 900 unsigned char readonly; 901 } _PyUnicodeWriter ; 902 903 /* Initialize a Unicode writer. 904 * 905 * By default, the minimum buffer size is 0 characters and overallocation is 906 * disabled. Set min_length, min_char and overallocate attributes to control 907 * the allocation of the buffer. */ 908 PyAPI_FUNC(void) 909 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer); 910 911 /* Prepare the buffer to write 'length' characters 912 with the specified maximum character. 913 914 Return 0 on success, raise an exception and return -1 on error. This is a macro: WRITER, LENGTH and MAXCHAR may each be evaluated more than once, so do not pass expressions with side effects. When the fast-path check fails and LENGTH is 0, the macro evaluates to 0 without calling _PyUnicodeWriter_PrepareInternal(). */ 915 #define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR) \ 916 (((MAXCHAR) <= (WRITER)->maxchar \ 917 && (LENGTH) <= (WRITER)->size - (WRITER)->pos) \ 918 ? 0 \ 919 : (((LENGTH) == 0) \ 920 ? 0 \ 921 : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR)))) 922 923 /* Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro 924 instead. */ 925 PyAPI_FUNC(int) 926 _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, 927 Py_ssize_t length, Py_UCS4 maxchar); 928 929 /* Prepare the buffer to have at least the kind KIND. 930 For example, kind=PyUnicode_2BYTE_KIND ensures that the writer will 931 support characters in range U+0000-U+FFFF. 932 KIND must not be PyUnicode_WCHAR_KIND (checked with assert()). This is a macro: WRITER and KIND may be evaluated more than once, so do not pass expressions with side effects. 933 Return 0 on success, raise an exception and return -1 on error.
*/ 934 #define _PyUnicodeWriter_PrepareKind(WRITER, KIND) \ 935 (assert((KIND) != PyUnicode_WCHAR_KIND), \ 936 (KIND) <= (WRITER)->kind \ 937 ? 0 \ 938 : _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND))) 939 940 /* Don't call this function directly, use the _PyUnicodeWriter_PrepareKind() 941 macro instead. */ 942 PyAPI_FUNC(int) 943 _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer, 944 enum PyUnicode_Kind kind); 945 946 /* Append a Unicode character. 947 Return 0 on success, raise an exception and return -1 on error. */ 948 PyAPI_FUNC(int) 949 _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, 950 Py_UCS4 ch 951 ); 952 953 /* Append a Unicode string. 954 Return 0 on success, raise an exception and return -1 on error. */ 955 PyAPI_FUNC(int) 956 _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, 957 PyObject *str /* Unicode string */ 958 ); 959 960 /* Append a substring of a Unicode string. 961 Return 0 on success, raise an exception and return -1 on error. */ 962 PyAPI_FUNC(int) 963 _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, 964 PyObject *str, /* Unicode string */ 965 Py_ssize_t start, 966 Py_ssize_t end 967 ); 968 969 /* Append an ASCII-encoded byte string. 970 Return 0 on success, raise an exception and return -1 on error. */ 971 PyAPI_FUNC(int) 972 _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer, 973 const char *str, /* ASCII-encoded byte string */ 974 Py_ssize_t len /* number of bytes, or -1 if unknown */ 975 ); 976 977 /* Append a latin1-encoded byte string. 978 Return 0 on success, raise an exception and return -1 on error. */ 979 PyAPI_FUNC(int) 980 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer, 981 const char *str, /* latin1-encoded byte string */ 982 Py_ssize_t len /* length in bytes */ 983 ); 984 985 /* Get the value of the writer as a Unicode string. Clear the 986 buffer of the writer. Raise an exception and return NULL 987 on error.
*/ 988 PyAPI_FUNC(PyObject *) 989 _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer); 990 991 /* Deallocate memory of a writer (clear its internal buffer). */ 992 PyAPI_FUNC(void) 993 _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer); 994 #endif 995 996 #ifndef Py_LIMITED_API 997 /* Format the object based on the format_spec, as defined in PEP 3101 998 (Advanced String Formatting). NOTE(review): the result is presumably appended to writer, and start/end presumably delimit the part of format_spec to use -- confirm in the implementation. */ 999 PyAPI_FUNC(int) _PyUnicode_FormatAdvancedWriter( 1000 _PyUnicodeWriter *writer, 1001 PyObject *obj, 1002 PyObject *format_spec, 1003 Py_ssize_t start, 1004 Py_ssize_t end); 1005 #endif 1006 1007 PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **); 1008 PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **); 1009 PyAPI_FUNC(PyObject *) PyUnicode_InternFromString( 1010 const char *u /* UTF-8 encoded string */ 1011 ); 1012 #ifndef Py_LIMITED_API 1013 PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void); 1014 #endif 1015 1016 /* Evaluate to the state.interned field of op. Use only if you know it's a string: op is cast to PyASCIIObject * without any type check. */ 1017 #define PyUnicode_CHECK_INTERNED(op) \ 1018 (((PyASCIIObject *)(op))->state.interned) 1019 1020 /* --- wchar_t support for platforms which support it --------------------- */ 1021 1022 #ifdef HAVE_WCHAR_H 1023 1024 /* Create a Unicode Object from the wchar_t buffer w of the given 1025 size. 1026 1027 The buffer is copied into the new object. */ 1028 1029 PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar( 1030 const wchar_t *w, /* wchar_t buffer */ 1031 Py_ssize_t size /* size of buffer */ 1032 ); 1033 1034 /* Copies the Unicode Object contents into the wchar_t buffer w. At 1035 most size wchar_t characters are copied. 1036 1037 Note that the resulting wchar_t string may or may not be 1038 0-terminated. It is the responsibility of the caller to make sure 1039 that the wchar_t string is 0-terminated in case this is required by 1040 the application. 1041 1042 Returns the number of wchar_t characters copied (excluding a 1043 possibly trailing 0-termination character) or -1 in case of an 1044 error.
*/ 1045 1046 PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar( 1047 PyObject *unicode, /* Unicode object */ 1048 wchar_t *w, /* wchar_t buffer */ 1049 Py_ssize_t size /* size of buffer */ 1050 ); 1051 1052 /* Convert the Unicode object to a wide character string. The output string 1053 always ends with a null character. If size is not NULL, write the number of 1054 wide characters (excluding the null character) into *size. 1055 1056 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it) 1057 on success. On error, raises a MemoryError and returns NULL; *size is 1058 undefined. */ 1059 1060 PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString( 1061 PyObject *unicode, /* Unicode object */ 1062 Py_ssize_t *size /* number of characters of the result */ 1063 ); 1064 1065 #ifndef Py_LIMITED_API 1066 PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind); 1067 #endif 1068 1069 #endif /* HAVE_WCHAR_H */ 1070 1071 /* --- Unicode ordinals --------------------------------------------------- */ 1072 1073 /* Create a Unicode Object from the given Unicode code point ordinal. 1074 1075 The ordinal must be in range(0x110000). A ValueError is 1076 raised in case it is not. 1077 1078 */ 1079 1080 PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal); 1081 1082 /* --- Free-list management ----------------------------------------------- */ 1083 1084 /* Clear the free list used by the Unicode implementation. 1085 1086 This can be used to release memory used for objects on the free 1087 list back to the Python memory allocator. 1088 1089 */ 1090 1091 PyAPI_FUNC(int) PyUnicode_ClearFreeList(void); 1092 1093 /* === Builtin Codecs ===================================================== 1094 1095 Many of these APIs take two arguments encoding and errors. These 1096 parameters encoding and errors have the same semantics as the ones 1097 of the builtin str() API. 1098 1099 Setting encoding to NULL causes the default encoding (UTF-8) to be used.
1100 1101 Error handling is set by errors which may also be set to NULL 1102 meaning to use the default handling defined for the codec. Default 1103 error handling for all builtin codecs is "strict" (ValueErrors are 1104 raised). 1105 1106 The codecs all use a similar interface. Only deviations from the 1107 generic interface are documented. 1108 1109 */ 1110 1111 /* --- Manage the default encoding ---------------------------------------- */ 1112 1113 /* Returns a pointer to the default encoding (UTF-8) representation of the 1114 Unicode object unicode and stores the size of the encoded representation, 1115 in bytes, in *size. 1116 1117 In case of an error, *size is not set. 1118 1119 This function caches the UTF-8 encoded string in the unicodeobject 1120 and subsequent calls will return the same string. The memory is released 1121 when the unicodeobject is deallocated. 1122 1123 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to 1124 support the previous internal function with the same behaviour. 1125 1126 *** This API is for interpreter INTERNAL USE ONLY and will likely 1127 *** be removed or changed in the future. 1128 1129 *** If you need to access the Unicode object as a UTF-8 bytes string, 1130 *** please use PyUnicode_AsUTF8String() instead. 1131 */ 1132 1133 #ifndef Py_LIMITED_API 1134 PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize( 1135 PyObject *unicode, 1136 Py_ssize_t *size); 1137 #define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize 1138 #endif 1139 1140 /* Returns a pointer to the default encoding (UTF-8) representation of the 1141 Unicode object unicode. 1142 1143 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation 1144 in the unicodeobject. 1145 1146 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to 1147 support the previous internal function with the same behaviour. 1148 1149 Use of this API is DEPRECATED since no size information can be 1150 extracted from the returned data.
1151 1152 *** This API is for interpreter INTERNAL USE ONLY and will likely 1153 *** be removed or changed in the future. 1154 1155 *** If you need to access the Unicode object as a UTF-8 bytes string, 1156 *** please use PyUnicode_AsUTF8String() instead. 1157 1158 */ 1159 1160 #ifndef Py_LIMITED_API 1161 PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode); 1162 #define _PyUnicode_AsString PyUnicode_AsUTF8 1163 #endif 1164 1165 /* Returns "utf-8". */ 1166 1167 PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void); 1168 1169 /* --- Generic Codecs ----------------------------------------------------- */ 1170 1171 /* Create a Unicode object by decoding the encoded string s of the 1172 given size. */ 1173 1174 PyAPI_FUNC(PyObject*) PyUnicode_Decode( 1175 const char *s, /* encoded string */ 1176 Py_ssize_t size, /* size of buffer */ 1177 const char *encoding, /* encoding */ 1178 const char *errors /* error handling */ 1179 ); 1180 1181 /* Decode a Unicode object unicode and return the result as a Python 1182 object. 1183 1184 This API is DEPRECATED. The only supported standard encoding is rot13. 1185 Use PyCodec_Decode() to decode with rot13 and non-standard codecs 1186 that decode from str. */ 1187 1188 PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject( 1189 PyObject *unicode, /* Unicode object */ 1190 const char *encoding, /* encoding */ 1191 const char *errors /* error handling */ 1192 ) Py_DEPRECATED(3.6); 1193 1194 /* Decode a Unicode object unicode and return the result as Unicode 1195 object. 1196 1197 This API is DEPRECATED. The only supported standard encoding is rot13. 1198 Use PyCodec_Decode() to decode with rot13 and non-standard codecs 1199 that decode from str to str.
*/ 1200 1201 PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode( 1202 PyObject *unicode, /* Unicode object */ 1203 const char *encoding, /* encoding */ 1204 const char *errors /* error handling */ 1205 ) Py_DEPRECATED(3.6); 1206 1207 /* Encodes a Py_UNICODE buffer of the given size and returns a 1208 Python bytes object. */ 1209 1210 #ifndef Py_LIMITED_API 1211 PyAPI_FUNC(PyObject*) PyUnicode_Encode( 1212 const Py_UNICODE *s, /* Unicode char buffer */ 1213 Py_ssize_t size, /* number of Py_UNICODE chars to encode */ 1214 const char *encoding, /* encoding */ 1215 const char *errors /* error handling */ 1216 ); 1217 #endif 1218 1219 /* Encodes a Unicode object and returns the result as a Python 1220 object. 1221 1222 This API is DEPRECATED. It is superseded by PyUnicode_AsEncodedString() 1223 since all standard encodings (except rot13) encode str to bytes. 1224 Use PyCodec_Encode() for encoding with rot13 and non-standard codecs 1225 that encode from str to non-bytes. */ 1226 1227 PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject( 1228 PyObject *unicode, /* Unicode object */ 1229 const char *encoding, /* encoding */ 1230 const char *errors /* error handling */ 1231 ) Py_DEPRECATED(3.6); 1232 1233 /* Encodes a Unicode object and returns the result as a Python bytes 1234 object. */ 1235 1236 PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString( 1237 PyObject *unicode, /* Unicode object */ 1238 const char *encoding, /* encoding */ 1239 const char *errors /* error handling */ 1240 ); 1241 1242 /* Encodes a Unicode object and returns the result as a Unicode 1243 object. 1244 1245 This API is DEPRECATED. The only supported standard encoding is rot13. 1246 Use PyCodec_Encode() to encode with rot13 and non-standard codecs 1247 that encode from str to str.
*/ 1248 1249 PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode( 1250 PyObject *unicode, /* Unicode object */ 1251 const char *encoding, /* encoding */ 1252 const char *errors /* error handling */ 1253 ) Py_DEPRECATED(3.6); 1254 1255 /* Build an encoding map. */ 1256 1257 PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap( 1258 PyObject* string /* 256 character map */ 1259 ); 1260 1261 /* --- UTF-7 Codecs ------------------------------------------------------- */ 1262 1263 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7( 1264 const char *string, /* UTF-7 encoded string */ 1265 Py_ssize_t length, /* size of string */ 1266 const char *errors /* error handling */ 1267 ); 1268 1269 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful( 1270 const char *string, /* UTF-7 encoded string */ 1271 Py_ssize_t length, /* size of string */ 1272 const char *errors, /* error handling */ 1273 Py_ssize_t *consumed /* bytes consumed */ 1274 ); 1275 1276 #ifndef Py_LIMITED_API 1277 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7( 1278 const Py_UNICODE *data, /* Unicode char buffer */ 1279 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1280 int base64SetO, /* Encode RFC2152 Set O characters in base64 */ 1281 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */ 1282 const char *errors /* error handling */ 1283 ); 1284 PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7( 1285 PyObject *unicode, /* Unicode object */ 1286 int base64SetO, /* Encode RFC2152 Set O characters in base64 */ 1287 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */ 1288 const char *errors /* error handling */ 1289 ); 1290 #endif 1291 1292 /* --- UTF-8 Codecs ------------------------------------------------------- */ 1293 1294 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8( 1295 const char *string, /* UTF-8 encoded string */ 1296 Py_ssize_t length, /* size of string */ 1297 const char *errors /* error handling */ 1298 ); 1299 1300 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful( 1301
const char *string, /* UTF-8 encoded string */ 1302 Py_ssize_t length, /* size of string */ 1303 const char *errors, /* error handling */ 1304 Py_ssize_t *consumed /* bytes consumed */ 1305 ); 1306 1307 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String( 1308 PyObject *unicode /* Unicode object */ 1309 ); 1310 1311 #ifndef Py_LIMITED_API 1312 PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String( 1313 PyObject *unicode, /* Unicode object */ 1314 const char *errors /* error handling */); 1315 1316 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8( 1317 const Py_UNICODE *data, /* Unicode char buffer */ 1318 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1319 const char *errors /* error handling */ 1320 ); 1321 #endif /* !Py_LIMITED_API */ 1322 1323 /* --- UTF-32 Codecs ------------------------------------------------------ */ 1324 1325 /* Decodes length bytes from a UTF-32 encoded buffer string and returns 1326 the corresponding Unicode object. 1327 1328 errors (if non-NULL) defines the error handling. It defaults 1329 to "strict". 1330 1331 If byteorder is non-NULL, the decoder starts decoding using the 1332 given byte order: 1333 1334 *byteorder == -1: little endian 1335 *byteorder == 0: native order 1336 *byteorder == 1: big endian 1337 1338 In native mode, the first four bytes of the stream are checked for a 1339 BOM mark. If found, the BOM mark is analysed, the byte order 1340 adjusted and the BOM skipped. In the other modes, no BOM mark 1341 interpretation is done. After completion, *byteorder is set to the 1342 current byte order at the end of input data. 1343 1344 If byteorder is NULL, the codec starts in native order mode.
1345 1346 */ 1347 1348 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32( 1349 const char *string, /* UTF-32 encoded string */ 1350 Py_ssize_t length, /* size of string */ 1351 const char *errors, /* error handling */ 1352 int *byteorder /* pointer to byteorder to use 1353 0=native;-1=LE,1=BE; updated on 1354 exit */ 1355 ); 1356 1357 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful( 1358 const char *string, /* UTF-32 encoded string */ 1359 Py_ssize_t length, /* size of string */ 1360 const char *errors, /* error handling */ 1361 int *byteorder, /* pointer to byteorder to use 1362 0=native;-1=LE,1=BE; updated on 1363 exit */ 1364 Py_ssize_t *consumed /* bytes consumed */ 1365 ); 1366 1367 /* Returns a Python string using the UTF-32 encoding in native byte 1368 order. The string always starts with a BOM mark. */ 1369 1370 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String( 1371 PyObject *unicode /* Unicode object */ 1372 ); 1373 1374 /* Returns a Python string object holding the UTF-32 encoded value of 1375 the Unicode data. 1376 1377 The output is written according to the value of 1378 byteorder: 1379 1380 byteorder == -1: little endian 1381 byteorder == 0: native byte order (writes a BOM mark) 1382 byteorder == 1: big endian 1383 1384 If byteorder is 0, the output string will always start with the 1385 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 1386 prepended.
1387 1388 */ 1389 1390 #ifndef Py_LIMITED_API 1391 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32( 1392 const Py_UNICODE *data, /* Unicode char buffer */ 1393 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1394 const char *errors, /* error handling */ 1395 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 1396 ); 1397 PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32( 1398 PyObject *object, /* Unicode object */ 1399 const char *errors, /* error handling */ 1400 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 1401 ); 1402 #endif /* !Py_LIMITED_API */ 1403 1404 /* --- UTF-16 Codecs ------------------------------------------------------ */ 1405 1406 /* Decodes length bytes from a UTF-16 encoded buffer string and returns 1407 the corresponding Unicode object. 1408 1409 errors (if non-NULL) defines the error handling. It defaults 1410 to "strict". 1411 1412 If byteorder is non-NULL, the decoder starts decoding using the 1413 given byte order: 1414 1415 *byteorder == -1: little endian 1416 *byteorder == 0: native order 1417 *byteorder == 1: big endian 1418 1419 In native mode, the first two bytes of the stream are checked for a 1420 BOM mark. If found, the BOM mark is analysed, the byte order 1421 adjusted and the BOM skipped. In the other modes, no BOM mark 1422 interpretation is done. After completion, *byteorder is set to the 1423 current byte order at the end of input data. 1424 1425 If byteorder is NULL, the codec starts in native order mode.
1426 1427 */ 1428 1429 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16( 1430 const char *string, /* UTF-16 encoded string */ 1431 Py_ssize_t length, /* size of string */ 1432 const char *errors, /* error handling */ 1433 int *byteorder /* pointer to byteorder to use 1434 0=native;-1=LE,1=BE; updated on 1435 exit */ 1436 ); 1437 1438 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful( 1439 const char *string, /* UTF-16 encoded string */ 1440 Py_ssize_t length, /* size of string */ 1441 const char *errors, /* error handling */ 1442 int *byteorder, /* pointer to byteorder to use 1443 0=native;-1=LE,1=BE; updated on 1444 exit */ 1445 Py_ssize_t *consumed /* bytes consumed */ 1446 ); 1447 1448 /* Returns a Python string using the UTF-16 encoding in native byte 1449 order. The string always starts with a BOM mark. */ 1450 1451 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String( 1452 PyObject *unicode /* Unicode object */ 1453 ); 1454 1455 /* Returns a Python string object holding the UTF-16 encoded value of 1456 the Unicode data. 1457 1458 The output is written according to the value of 1459 byteorder: 1460 1461 byteorder == -1: little endian 1462 byteorder == 0: native byte order (writes a BOM mark) 1463 byteorder == 1: big endian 1464 1465 If byteorder is 0, the output string will always start with the 1466 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 1467 prepended. 1468 1469 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to 1470 UCS-2. This trick makes it possible to add full UTF-16 capabilities 1471 at a later point without compromising the APIs.
1472 1473 */ 1474 1475 #ifndef Py_LIMITED_API 1476 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16( 1477 const Py_UNICODE *data, /* Unicode char buffer */ 1478 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1479 const char *errors, /* error handling */ 1480 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 1481 ); 1482 PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16( 1483 PyObject* unicode, /* Unicode object */ 1484 const char *errors, /* error handling */ 1485 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ 1486 ); 1487 #endif /* !Py_LIMITED_API */ 1488 1489 /* --- Unicode-Escape Codecs ---------------------------------------------- */ 1490 1491 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape( 1492 const char *string, /* Unicode-Escape encoded string */ 1493 Py_ssize_t length, /* size of string */ 1494 const char *errors /* error handling */ 1495 ); 1496 1497 #ifndef Py_LIMITED_API 1498 /* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape 1499 chars. */ 1500 PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscape( 1501 const char *string, /* Unicode-Escape encoded string */ 1502 Py_ssize_t length, /* size of string */ 1503 const char *errors, /* error handling */ 1504 const char **first_invalid_escape /* on return, points to first 1505 invalid escaped char in 1506 string.
*/ 1507 ); 1508 #endif 1509 1510 PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString( 1511 PyObject *unicode /* Unicode object */ 1512 ); 1513 1514 #ifndef Py_LIMITED_API 1515 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape( 1516 const Py_UNICODE *data, /* Unicode char buffer */ 1517 Py_ssize_t length /* Number of Py_UNICODE chars to encode */ 1518 ); 1519 #endif 1520 1521 /* --- Raw-Unicode-Escape Codecs ------------------------------------------ */ 1522 1523 PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape( 1524 const char *string, /* Raw-Unicode-Escape encoded string */ 1525 Py_ssize_t length, /* size of string */ 1526 const char *errors /* error handling */ 1527 ); 1528 1529 PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString( 1530 PyObject *unicode /* Unicode object */ 1531 ); 1532 1533 #ifndef Py_LIMITED_API 1534 PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape( 1535 const Py_UNICODE *data, /* Unicode char buffer */ 1536 Py_ssize_t length /* Number of Py_UNICODE chars to encode */ 1537 ); 1538 #endif 1539 1540 /* --- Unicode Internal Codec --------------------------------------------- 1541 1542 Only for internal use in _codecsmodule.c. NOTE(review): unlike the other prototypes in this header, the declaration below does not use PyAPI_FUNC(), so it presumably is not exported from the shared library -- confirm this is intentional. */ 1543 1544 #ifndef Py_LIMITED_API 1545 PyObject *_PyUnicode_DecodeUnicodeInternal( 1546 const char *string, /* encoded string */ 1547 Py_ssize_t length, /* size of string */ 1548 const char *errors /* error handling */ 1549 ); 1550 #endif 1551 1552 /* --- Latin-1 Codecs ----------------------------------------------------- 1553 1554 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1555 1556 */ 1557 1558 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1( 1559 const char *string, /* Latin-1 encoded string */ 1560 Py_ssize_t length, /* size of string */ 1561 const char *errors /* error handling */ 1562 ); 1563 1564 PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String( 1565 PyObject *unicode /* Unicode object */ 1566 ); 1567 1568 #ifndef Py_LIMITED_API 1569 PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String( 1570 PyObject* unicode, /* Unicode object */ 1571 const char* errors /* error handling */); 1572 1573 PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1( 1574 const Py_UNICODE *data, /* Unicode char buffer */ 1575 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1576 const char *errors /* error handling */ 1577 ); 1578 #endif 1579 1580 /* --- ASCII Codecs ------------------------------------------------------- 1581 1582 Only 7-bit ASCII data is accepted. All other codes generate errors. 1583 1584 */ 1585 1586 PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII( 1587 const char *string, /* ASCII encoded string */ 1588 Py_ssize_t length, /* size of string */ 1589 const char *errors /* error handling */ 1590 ); 1591 1592 PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString( 1593 PyObject *unicode /* Unicode object */ 1594 ); 1595 1596 #ifndef Py_LIMITED_API 1597 PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString( 1598 PyObject* unicode, /* Unicode object */ 1599 const char* errors /* error handling */); 1600 1601 PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII( 1602 const Py_UNICODE *data, /* Unicode char buffer */ 1603 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1604 const char *errors /* error handling */ 1605 ); 1606 #endif 1607 1608 /* --- Character Map Codecs ----------------------------------------------- 1609 1610 This codec uses mappings to encode and decode characters. 1611 1612 Decoding mappings must map single string characters to single 1613 Unicode characters, integers (which are then interpreted as Unicode 1614 ordinals) or None (meaning "undefined mapping" and causing an 1615 error).
1616 1617 Encoding mappings must map single Unicode characters to single 1618 string characters, integers (which are then interpreted as Latin-1 1619 ordinals) or None (meaning "undefined mapping" and causing an 1620 error). 1621 1622 If a character lookup fails with a LookupError, the character is 1623 copied as-is, meaning that its ordinal value will be interpreted as a 1624 Unicode or Latin-1 ordinal, respectively. Because of this, mappings only need 1625 to contain those entries which map characters to different code 1626 points. 1627 1628 */ 1629 1630 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap( 1631 const char *string, /* Encoded string */ 1632 Py_ssize_t length, /* size of string */ 1633 PyObject *mapping, /* character mapping 1634 (char ordinal -> unicode ordinal) */ 1635 const char *errors /* error handling */ 1636 ); 1637 1638 PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString( 1639 PyObject *unicode, /* Unicode object */ 1640 PyObject *mapping /* character mapping 1641 (unicode ordinal -> char ordinal) */ 1642 ); 1643 1644 #ifndef Py_LIMITED_API 1645 PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap( 1646 const Py_UNICODE *data, /* Unicode char buffer */ 1647 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1648 PyObject *mapping, /* character mapping 1649 (unicode ordinal -> char ordinal) */ 1650 const char *errors /* error handling */ 1651 ); 1652 PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap( 1653 PyObject *unicode, /* Unicode object */ 1654 PyObject *mapping, /* character mapping 1655 (unicode ordinal -> char ordinal) */ 1656 const char *errors /* error handling */ 1657 ); 1658 #endif 1659 1660 /* Translate a Py_UNICODE buffer of the given length by applying a 1661 character mapping table to it and return the resulting Unicode 1662 object. 1663 1664 The mapping table must map Unicode ordinal integers to Unicode 1665 ordinal integers or None (causing deletion of the character). 1666 1667 Mapping tables may be dictionaries or sequences.
Unmapped character 1668 ordinals (ones which cause a LookupError) are left untouched and 1669 are copied as-is. 1670 1671 */ 1672 1673 #ifndef Py_LIMITED_API 1674 PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap( 1675 const Py_UNICODE *data, /* Unicode char buffer */ 1676 Py_ssize_t length, /* Number of Py_UNICODE chars to translate */ 1677 PyObject *table, /* Translation table */ 1678 const char *errors /* error handling */ 1679 ); 1680 #endif 1681 1682 #ifdef MS_WINDOWS 1683 1684 /* --- MBCS codecs for Windows -------------------------------------------- */ 1685 1686 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS( 1687 const char *string, /* MBCS encoded string */ 1688 Py_ssize_t length, /* size of string */ 1689 const char *errors /* error handling */ 1690 ); 1691 1692 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful( 1693 const char *string, /* MBCS encoded string */ 1694 Py_ssize_t length, /* size of string */ 1695 const char *errors, /* error handling */ 1696 Py_ssize_t *consumed /* bytes consumed */ 1697 ); 1698 1699 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 1700 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful( 1701 int code_page, /* code page number */ 1702 const char *string, /* encoded string */ 1703 Py_ssize_t length, /* size of string */ 1704 const char *errors, /* error handling */ 1705 Py_ssize_t *consumed /* bytes consumed */ 1706 ); 1707 #endif 1708 1709 PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( 1710 PyObject *unicode /* Unicode object */ 1711 ); 1712 1713 #ifndef Py_LIMITED_API 1714 PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS( 1715 const Py_UNICODE *data, /* Unicode char buffer */ 1716 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ 1717 const char *errors /* error handling */ 1718 ); 1719 #endif 1720 1721 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 1722 PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage( 1723 int code_page, /* code page number */ 1724 PyObject *unicode, /* Unicode object */
1725 const char *errors /* error handling */ 1726 ); 1727 #endif 1728 1729 #endif /* MS_WINDOWS */ 1730 1731 /* --- Decimal Encoder ---------------------------------------------------- */ 1732 1733 /* Takes a Unicode string holding a decimal value and writes it into 1734 an output buffer using standard ASCII digit codes. 1735 1736 The output buffer has to provide at least length+1 bytes of storage 1737 area. The output string is 0-terminated. 1738 1739 The encoder converts whitespace to ' ', decimal characters to their 1740 corresponding ASCII digit and all other Latin-1 characters except 1741 \0 as-is. Characters outside this range (Unicode ordinals 1-255) 1742 are treated as errors. This includes embedded NUL (\0) characters. 1743 1744 Error handling is defined by the errors argument: 1745 1746 NULL or "strict": raise a ValueError 1747 "ignore": ignore the wrong characters (these are not copied to the 1748 output buffer) 1749 "replace": replaces illegal characters with '?' 1750 1751 Returns 0 on success, -1 on failure. 1752 1753 */ 1754 1755 #ifndef Py_LIMITED_API 1756 PyAPI_FUNC(int) PyUnicode_EncodeDecimal( 1757 Py_UNICODE *s, /* Unicode buffer */ 1758 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ 1759 char *output, /* Output buffer; must have size >= length+1 (room for the 0-terminator) */ 1760 const char *errors /* error handling */ 1761 ); 1762 #endif 1763 1764 /* Transforms code points that have decimal digit property to the 1765 corresponding ASCII digit code points. 1766 1767 Returns a new Unicode string on success, NULL on failure. 1768 */ 1769 1770 #ifndef Py_LIMITED_API 1771 PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII( 1772 Py_UNICODE *s, /* Unicode buffer */ 1773 Py_ssize_t length /* Number of Py_UNICODE chars to transform */ 1774 ); 1775 #endif 1776 1777 /* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyObject 1778 as argument instead of a raw buffer and length.
This function additionally 1779 transforms spaces to ASCII because this is what the callers in longobject, 1780 floatobject, and complexobject did anyways. */ 1781 1782 #ifndef Py_LIMITED_API 1783 PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII( 1784 PyObject *unicode /* Unicode object */ 1785 ); 1786 #endif 1787 1788 /* --- Locale encoding --------------------------------------------------- */ 1789 1790 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 1791 /* Decode a string from the current locale encoding. The error handler is 1792 selected by the errors argument: the decoder is either strict, or it uses the 'surrogateescape' 1793 error handler (PEP 383) to escape undecodable bytes. If a byte sequence can 1794 be decoded as a surrogate character and the 'surrogateescape' handler is 1795 selected, the byte sequence is escaped using the 'surrogateescape' error handler 1796 instead of being decoded. *str* must end with a null character but cannot 1797 contain embedded null characters. NOTE(review): this comment used to describe an integer *surrogateescape* flag, but the prototype takes an error-handler name; presumably only "strict" and "surrogateescape" are accepted -- confirm against the implementation / C API documentation. */ 1798 1799 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize( 1800 const char *str, 1801 Py_ssize_t len, 1802 const char *errors); 1803 1804 /* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string 1805 length using strlen(). */ 1806 1807 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale( 1808 const char *str, 1809 const char *errors); 1810 1811 /* Encode a Unicode object to the current locale encoding. The encoder is 1812 either strict or uses the 1813 "surrogateescape" error handler, as selected by the errors argument (NOTE(review): the prototype takes an error-handler name, not the *surrogateescape* flag this comment used to mention -- confirm the accepted values). Return a bytes object. The string 1814 cannot contain embedded null characters. */ 1815 1816 PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale( 1817 PyObject *unicode, 1818 const char *errors 1819 ); 1820 #endif 1821 1822 /* --- File system encoding ---------------------------------------------- */ 1823 1824 /* ParseTuple converter: encode str objects to bytes using 1825 PyUnicode_EncodeFSDefault(); bytes objects are output as-is.
*/ 1826 1827 PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*); 1828 1829 /* ParseTuple converter: decode bytes objects to unicode using 1830 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */ 1831 1832 PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*); 1833 1834 /* Decode a null-terminated string using Py_FileSystemDefaultEncoding 1835 and the "surrogateescape" error handler. 1836 1837 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 1838 encoding. 1839 1840 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known. 1841 */ 1842 1843 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault( 1844 const char *s /* encoded string */ 1845 ); 1846 1847 /* Decode a string using Py_FileSystemDefaultEncoding 1848 and the "surrogateescape" error handler. 1849 1850 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 1851 encoding. 1852 */ 1853 1854 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize( 1855 const char *s, /* encoded string */ 1856 Py_ssize_t size /* size */ 1857 ); 1858 1859 /* Encode a Unicode object to Py_FileSystemDefaultEncoding with the 1860 "surrogateescape" error handler, and return bytes. 1861 1862 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 1863 encoding. 1864 */ 1865 1866 PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault( 1867 PyObject *unicode 1868 ); 1869 1870 /* --- Methods & Slots ---------------------------------------------------- 1871 1872 These are capable of handling Unicode objects and strings on input 1873 (we refer to them as strings in the descriptions) and return 1874 Unicode objects or integers as appropriate. */ 1875 1876 /* Concat two strings giving a new Unicode string. 
*/ 1877 1878 PyAPI_FUNC(PyObject*) PyUnicode_Concat( 1879 PyObject *left, /* Left string */ 1880 PyObject *right /* Right string */ 1881 ); 1882 1883 /* Concat two strings and put the result in *pleft 1884 (sets *pleft to NULL on error) */ 1885 1886 PyAPI_FUNC(void) PyUnicode_Append( 1887 PyObject **pleft, /* Pointer to left string */ 1888 PyObject *right /* Right string */ 1889 ); 1890 1891 /* Concat two strings, put the result in *pleft and drop the right object 1892 (sets *pleft to NULL on error) */ 1893 1894 PyAPI_FUNC(void) PyUnicode_AppendAndDel( 1895 PyObject **pleft, /* Pointer to left string */ 1896 PyObject *right /* Right string */ 1897 ); 1898 1899 /* Split a string giving a list of Unicode strings. 1900 1901 If sep is NULL, splitting will be done at all whitespace 1902 substrings. Otherwise, splits occur at the given separator. 1903 1904 At most maxsplit splits will be done. If negative, no limit is set. 1905 1906 Separators are not included in the resulting list. 1907 1908 */ 1909 1910 PyAPI_FUNC(PyObject*) PyUnicode_Split( 1911 PyObject *s, /* String to split */ 1912 PyObject *sep, /* String separator */ 1913 Py_ssize_t maxsplit /* Maxsplit count */ 1914 ); 1915 1916 /* Dito, but split at line breaks. 1917 1918 CRLF is considered to be one line break. Line breaks are not 1919 included in the resulting list. */ 1920 1921 PyAPI_FUNC(PyObject*) PyUnicode_Splitlines( 1922 PyObject *s, /* String to split */ 1923 int keepends /* If true, line end markers are included */ 1924 ); 1925 1926 /* Partition a string using a given separator. */ 1927 1928 PyAPI_FUNC(PyObject*) PyUnicode_Partition( 1929 PyObject *s, /* String to partition */ 1930 PyObject *sep /* String separator */ 1931 ); 1932 1933 /* Partition a string using a given separator, searching from the end of the 1934 string. 
*/ 1935 1936 PyAPI_FUNC(PyObject*) PyUnicode_RPartition( 1937 PyObject *s, /* String to partition */ 1938 PyObject *sep /* String separator */ 1939 ); 1940 1941 /* Split a string giving a list of Unicode strings. 1942 1943 If sep is NULL, splitting will be done at all whitespace 1944 substrings. Otherwise, splits occur at the given separator. 1945 1946 At most maxsplit splits will be done. But unlike PyUnicode_Split 1947 PyUnicode_RSplit splits from the end of the string. If negative, 1948 no limit is set. 1949 1950 Separators are not included in the resulting list. 1951 1952 */ 1953 1954 PyAPI_FUNC(PyObject*) PyUnicode_RSplit( 1955 PyObject *s, /* String to split */ 1956 PyObject *sep, /* String separator */ 1957 Py_ssize_t maxsplit /* Maxsplit count */ 1958 ); 1959 1960 /* Translate a string by applying a character mapping table to it and 1961 return the resulting Unicode object. 1962 1963 The mapping table must map Unicode ordinal integers to Unicode 1964 ordinal integers or None (causing deletion of the character). 1965 1966 Mapping tables may be dictionaries or sequences. Unmapped character 1967 ordinals (ones which cause a LookupError) are left untouched and 1968 are copied as-is. 1969 1970 */ 1971 1972 PyAPI_FUNC(PyObject *) PyUnicode_Translate( 1973 PyObject *str, /* String */ 1974 PyObject *table, /* Translate table */ 1975 const char *errors /* error handling */ 1976 ); 1977 1978 /* Join a sequence of strings using the given separator and return 1979 the resulting Unicode string. */ 1980 1981 PyAPI_FUNC(PyObject*) PyUnicode_Join( 1982 PyObject *separator, /* Separator string */ 1983 PyObject *seq /* Sequence object */ 1984 ); 1985 1986 #ifndef Py_LIMITED_API 1987 PyAPI_FUNC(PyObject *) _PyUnicode_JoinArray( 1988 PyObject *separator, 1989 PyObject **items, 1990 Py_ssize_t seqlen 1991 ); 1992 #endif /* Py_LIMITED_API */ 1993 1994 /* Return 1 if substr matches str[start:end] at the given tail end, 0 1995 otherwise. 
*/ 1996 1997 PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch( 1998 PyObject *str, /* String */ 1999 PyObject *substr, /* Prefix or Suffix string */ 2000 Py_ssize_t start, /* Start index */ 2001 Py_ssize_t end, /* Stop index */ 2002 int direction /* Tail end: -1 prefix, +1 suffix */ 2003 ); 2004 2005 /* Return the first position of substr in str[start:end] using the 2006 given search direction or -1 if not found. -2 is returned in case 2007 an error occurred and an exception is set. */ 2008 2009 PyAPI_FUNC(Py_ssize_t) PyUnicode_Find( 2010 PyObject *str, /* String */ 2011 PyObject *substr, /* Substring to find */ 2012 Py_ssize_t start, /* Start index */ 2013 Py_ssize_t end, /* Stop index */ 2014 int direction /* Find direction: +1 forward, -1 backward */ 2015 ); 2016 2017 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 2018 /* Like PyUnicode_Find, but search for single character only. */ 2019 PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar( 2020 PyObject *str, 2021 Py_UCS4 ch, 2022 Py_ssize_t start, 2023 Py_ssize_t end, 2024 int direction 2025 ); 2026 #endif 2027 2028 /* Count the number of occurrences of substr in str[start:end]. */ 2029 2030 PyAPI_FUNC(Py_ssize_t) PyUnicode_Count( 2031 PyObject *str, /* String */ 2032 PyObject *substr, /* Substring to count */ 2033 Py_ssize_t start, /* Start index */ 2034 Py_ssize_t end /* Stop index */ 2035 ); 2036 2037 /* Replace at most maxcount occurrences of substr in str with replstr 2038 and return the resulting Unicode object. */ 2039 2040 PyAPI_FUNC(PyObject *) PyUnicode_Replace( 2041 PyObject *str, /* String */ 2042 PyObject *substr, /* Substring to find */ 2043 PyObject *replstr, /* Substring to replace */ 2044 Py_ssize_t maxcount /* Max. number of replacements to apply; 2045 -1 = all */ 2046 ); 2047 2048 /* Compare two strings and return -1, 0, 1 for less than, equal, 2049 greater than resp. 2050 Raise an exception and return -1 on error. 
*/ 2051 2052 PyAPI_FUNC(int) PyUnicode_Compare( 2053 PyObject *left, /* Left string */ 2054 PyObject *right /* Right string */ 2055 ); 2056 2057 #ifndef Py_LIMITED_API 2058 /* Test whether a unicode is equal to ASCII identifier. Return 1 if true, 2059 0 otherwise. The right argument must be ASCII identifier. 2060 Any error occurs inside will be cleared before return. */ 2061 2062 PyAPI_FUNC(int) _PyUnicode_EqualToASCIIId( 2063 PyObject *left, /* Left string */ 2064 _Py_Identifier *right /* Right identifier */ 2065 ); 2066 #endif 2067 2068 /* Compare a Unicode object with C string and return -1, 0, 1 for less than, 2069 equal, and greater than, respectively. It is best to pass only 2070 ASCII-encoded strings, but the function interprets the input string as 2071 ISO-8859-1 if it contains non-ASCII characters. 2072 This function does not raise exceptions. */ 2073 2074 PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString( 2075 PyObject *left, 2076 const char *right /* ASCII-encoded string */ 2077 ); 2078 2079 #ifndef Py_LIMITED_API 2080 /* Test whether a unicode is equal to ASCII string. Return 1 if true, 2081 0 otherwise. The right argument must be ASCII-encoded string. 2082 Any error occurs inside will be cleared before return. */ 2083 2084 PyAPI_FUNC(int) _PyUnicode_EqualToASCIIString( 2085 PyObject *left, 2086 const char *right /* ASCII-encoded string */ 2087 ); 2088 #endif 2089 2090 /* Rich compare two strings and return one of the following: 2091 2092 - NULL in case an exception was raised 2093 - Py_True or Py_False for successful comparisons 2094 - Py_NotImplemented in case the type combination is unknown 2095 2096 Possible values for op: 2097 2098 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE 2099 2100 */ 2101 2102 PyAPI_FUNC(PyObject *) PyUnicode_RichCompare( 2103 PyObject *left, /* Left string */ 2104 PyObject *right, /* Right string */ 2105 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. 
*/ 2106 ); 2107 2108 /* Apply an argument tuple or dictionary to a format string and return 2109 the resulting Unicode string. */ 2110 2111 PyAPI_FUNC(PyObject *) PyUnicode_Format( 2112 PyObject *format, /* Format string */ 2113 PyObject *args /* Argument tuple or dictionary */ 2114 ); 2115 2116 /* Checks whether element is contained in container and return 1/0 2117 accordingly. 2118 2119 element has to coerce to a one element Unicode string. -1 is 2120 returned in case of an error. */ 2121 2122 PyAPI_FUNC(int) PyUnicode_Contains( 2123 PyObject *container, /* Container string */ 2124 PyObject *element /* Element string */ 2125 ); 2126 2127 /* Checks whether argument is a valid identifier. */ 2128 2129 PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s); 2130 2131 #ifndef Py_LIMITED_API 2132 /* Externally visible for str.strip(unicode) */ 2133 PyAPI_FUNC(PyObject *) _PyUnicode_XStrip( 2134 PyObject *self, 2135 int striptype, 2136 PyObject *sepobj 2137 ); 2138 #endif 2139 2140 /* Using explicit passed-in values, insert the thousands grouping 2141 into the string pointed to by buffer. For the argument descriptions, 2142 see Objects/stringlib/localeutil.h */ 2143 #ifndef Py_LIMITED_API 2144 PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping( 2145 PyObject *unicode, 2146 Py_ssize_t index, 2147 Py_ssize_t n_buffer, 2148 void *digits, 2149 Py_ssize_t n_digits, 2150 Py_ssize_t min_width, 2151 const char *grouping, 2152 PyObject *thousands_sep, 2153 Py_UCS4 *maxchar); 2154 #endif 2155 /* === Characters Type APIs =============================================== */ 2156 2157 /* Helper array used by Py_UNICODE_ISSPACE(). */ 2158 2159 #ifndef Py_LIMITED_API 2160 PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[]; 2161 2162 /* These should not be used directly. Use the Py_UNICODE_IS* and 2163 Py_UNICODE_TO* macros instead. 2164 2165 These APIs are implemented in Objects/unicodectype.c. 
2166 2167 */ 2168 2169 PyAPI_FUNC(int) _PyUnicode_IsLowercase( 2170 Py_UCS4 ch /* Unicode character */ 2171 ); 2172 2173 PyAPI_FUNC(int) _PyUnicode_IsUppercase( 2174 Py_UCS4 ch /* Unicode character */ 2175 ); 2176 2177 PyAPI_FUNC(int) _PyUnicode_IsTitlecase( 2178 Py_UCS4 ch /* Unicode character */ 2179 ); 2180 2181 PyAPI_FUNC(int) _PyUnicode_IsXidStart( 2182 Py_UCS4 ch /* Unicode character */ 2183 ); 2184 2185 PyAPI_FUNC(int) _PyUnicode_IsXidContinue( 2186 Py_UCS4 ch /* Unicode character */ 2187 ); 2188 2189 PyAPI_FUNC(int) _PyUnicode_IsWhitespace( 2190 const Py_UCS4 ch /* Unicode character */ 2191 ); 2192 2193 PyAPI_FUNC(int) _PyUnicode_IsLinebreak( 2194 const Py_UCS4 ch /* Unicode character */ 2195 ); 2196 2197 PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase( 2198 Py_UCS4 ch /* Unicode character */ 2199 ); 2200 2201 PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase( 2202 Py_UCS4 ch /* Unicode character */ 2203 ); 2204 2205 PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase( 2206 Py_UCS4 ch /* Unicode character */ 2207 ); 2208 2209 PyAPI_FUNC(int) _PyUnicode_ToLowerFull( 2210 Py_UCS4 ch, /* Unicode character */ 2211 Py_UCS4 *res 2212 ); 2213 2214 PyAPI_FUNC(int) _PyUnicode_ToTitleFull( 2215 Py_UCS4 ch, /* Unicode character */ 2216 Py_UCS4 *res 2217 ); 2218 2219 PyAPI_FUNC(int) _PyUnicode_ToUpperFull( 2220 Py_UCS4 ch, /* Unicode character */ 2221 Py_UCS4 *res 2222 ); 2223 2224 PyAPI_FUNC(int) _PyUnicode_ToFoldedFull( 2225 Py_UCS4 ch, /* Unicode character */ 2226 Py_UCS4 *res 2227 ); 2228 2229 PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable( 2230 Py_UCS4 ch /* Unicode character */ 2231 ); 2232 2233 PyAPI_FUNC(int) _PyUnicode_IsCased( 2234 Py_UCS4 ch /* Unicode character */ 2235 ); 2236 2237 PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit( 2238 Py_UCS4 ch /* Unicode character */ 2239 ); 2240 2241 PyAPI_FUNC(int) _PyUnicode_ToDigit( 2242 Py_UCS4 ch /* Unicode character */ 2243 ); 2244 2245 PyAPI_FUNC(double) _PyUnicode_ToNumeric( 2246 Py_UCS4 ch /* Unicode character */ 2247 ); 2248 2249 
PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit( 2250 Py_UCS4 ch /* Unicode character */ 2251 ); 2252 2253 PyAPI_FUNC(int) _PyUnicode_IsDigit( 2254 Py_UCS4 ch /* Unicode character */ 2255 ); 2256 2257 PyAPI_FUNC(int) _PyUnicode_IsNumeric( 2258 Py_UCS4 ch /* Unicode character */ 2259 ); 2260 2261 PyAPI_FUNC(int) _PyUnicode_IsPrintable( 2262 Py_UCS4 ch /* Unicode character */ 2263 ); 2264 2265 PyAPI_FUNC(int) _PyUnicode_IsAlpha( 2266 Py_UCS4 ch /* Unicode character */ 2267 ); 2268 2269 PyAPI_FUNC(size_t) Py_UNICODE_strlen( 2270 const Py_UNICODE *u 2271 ); 2272 2273 PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy( 2274 Py_UNICODE *s1, 2275 const Py_UNICODE *s2); 2276 2277 PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat( 2278 Py_UNICODE *s1, const Py_UNICODE *s2); 2279 2280 PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy( 2281 Py_UNICODE *s1, 2282 const Py_UNICODE *s2, 2283 size_t n); 2284 2285 PyAPI_FUNC(int) Py_UNICODE_strcmp( 2286 const Py_UNICODE *s1, 2287 const Py_UNICODE *s2 2288 ); 2289 2290 PyAPI_FUNC(int) Py_UNICODE_strncmp( 2291 const Py_UNICODE *s1, 2292 const Py_UNICODE *s2, 2293 size_t n 2294 ); 2295 2296 PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr( 2297 const Py_UNICODE *s, 2298 Py_UNICODE c 2299 ); 2300 2301 PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr( 2302 const Py_UNICODE *s, 2303 Py_UNICODE c 2304 ); 2305 2306 PyAPI_FUNC(PyObject*) _PyUnicode_FormatLong(PyObject *, int, int, int); 2307 2308 /* Create a copy of a unicode string ending with a nul character. Return NULL 2309 and raise a MemoryError exception on memory allocation failure, otherwise 2310 return a new allocated buffer (use PyMem_Free() to free the buffer). 
*/ 2311 2312 PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy( 2313 PyObject *unicode 2314 ); 2315 #endif /* Py_LIMITED_API */ 2316 2317 #if defined(Py_DEBUG) && !defined(Py_LIMITED_API) 2318 PyAPI_FUNC(int) _PyUnicode_CheckConsistency( 2319 PyObject *op, 2320 int check_content); 2321 #endif 2322 2323 #ifndef Py_LIMITED_API 2324 /* Return an interned Unicode object for an Identifier; may fail if there is no memory.*/ 2325 PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*); 2326 /* Clear all static strings. */ 2327 PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void); 2328 2329 /* Fast equality check when the inputs are known to be exact unicode types 2330 and where the hash values are equal (i.e. a very probable match) */ 2331 PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *); 2332 #endif /* !Py_LIMITED_API */ 2333 2334 #ifdef __cplusplus 2335 } 2336 #endif 2337 #endif /* !Py_UNICODEOBJECT_H */ 2338