1""" Test script for the Unicode implementation. 2 3Written by Marc-Andre Lemburg (mal@lemburg.com). 4 5(c) Copyright CNRI, All Rights Reserved. NO WARRANTY. 6 7""" 8import _string 9import codecs 10import itertools 11import operator 12import struct 13import sys 14import textwrap 15import unicodedata 16import unittest 17import warnings 18from test.support import import_helper 19from test.support import warnings_helper 20from test import support, string_tests 21from test.support.script_helper import assert_python_failure 22 23# Error handling (bad decoder return) 24def search_function(encoding): 25 def decode1(input, errors="strict"): 26 return 42 # not a tuple 27 def encode1(input, errors="strict"): 28 return 42 # not a tuple 29 def encode2(input, errors="strict"): 30 return (42, 42) # no unicode 31 def decode2(input, errors="strict"): 32 return (42, 42) # no unicode 33 if encoding=="test.unicode1": 34 return (encode1, decode1, None, None) 35 elif encoding=="test.unicode2": 36 return (encode2, decode2, None, None) 37 else: 38 return None 39 40def duplicate_string(text): 41 """ 42 Try to get a fresh clone of the specified text: 43 new object with a reference count of 1. 44 45 This is a best-effort: latin1 single letters and the empty 46 string ('') are singletons and cannot be cloned. 
47 """ 48 return text.encode().decode() 49 50class StrSubclass(str): 51 pass 52 53class UnicodeTest(string_tests.CommonTest, 54 string_tests.MixinStrUnicodeUserStringTest, 55 string_tests.MixinStrUnicodeTest, 56 unittest.TestCase): 57 58 type2test = str 59 60 def setUp(self): 61 codecs.register(search_function) 62 self.addCleanup(codecs.unregister, search_function) 63 64 def checkequalnofix(self, result, object, methodname, *args): 65 method = getattr(object, methodname) 66 realresult = method(*args) 67 self.assertEqual(realresult, result) 68 self.assertTrue(type(realresult) is type(result)) 69 70 # if the original is returned make sure that 71 # this doesn't happen with subclasses 72 if realresult is object: 73 class usub(str): 74 def __repr__(self): 75 return 'usub(%r)' % str.__repr__(self) 76 object = usub(object) 77 method = getattr(object, methodname) 78 realresult = method(*args) 79 self.assertEqual(realresult, result) 80 self.assertTrue(object is not realresult) 81 82 def test_literals(self): 83 self.assertEqual('\xff', '\u00ff') 84 self.assertEqual('\uffff', '\U0000ffff') 85 self.assertRaises(SyntaxError, eval, '\'\\Ufffffffe\'') 86 self.assertRaises(SyntaxError, eval, '\'\\Uffffffff\'') 87 self.assertRaises(SyntaxError, eval, '\'\\U%08x\'' % 0x110000) 88 # raw strings should not have unicode escapes 89 self.assertNotEqual(r"\u0020", " ") 90 91 def test_ascii(self): 92 if not sys.platform.startswith('java'): 93 # Test basic sanity of repr() 94 self.assertEqual(ascii('abc'), "'abc'") 95 self.assertEqual(ascii('ab\\c'), "'ab\\\\c'") 96 self.assertEqual(ascii('ab\\'), "'ab\\\\'") 97 self.assertEqual(ascii('\\c'), "'\\\\c'") 98 self.assertEqual(ascii('\\'), "'\\\\'") 99 self.assertEqual(ascii('\n'), "'\\n'") 100 self.assertEqual(ascii('\r'), "'\\r'") 101 self.assertEqual(ascii('\t'), "'\\t'") 102 self.assertEqual(ascii('\b'), "'\\x08'") 103 self.assertEqual(ascii("'\""), """'\\'"'""") 104 self.assertEqual(ascii("'\""), """'\\'"'""") 105 
self.assertEqual(ascii("'"), '''"'"''') 106 self.assertEqual(ascii('"'), """'"'""") 107 latin1repr = ( 108 "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r" 109 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a" 110 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI" 111 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f" 112 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d" 113 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b" 114 "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9" 115 "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7" 116 "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5" 117 "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3" 118 "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1" 119 "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef" 120 "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd" 121 "\\xfe\\xff'") 122 testrepr = ascii(''.join(map(chr, range(256)))) 123 self.assertEqual(testrepr, latin1repr) 124 # Test ascii works on wide unicode escapes without overflow. 
125 self.assertEqual(ascii("\U00010000" * 39 + "\uffff" * 4096), 126 ascii("\U00010000" * 39 + "\uffff" * 4096)) 127 128 class WrongRepr: 129 def __repr__(self): 130 return b'byte-repr' 131 self.assertRaises(TypeError, ascii, WrongRepr()) 132 133 def test_repr(self): 134 if not sys.platform.startswith('java'): 135 # Test basic sanity of repr() 136 self.assertEqual(repr('abc'), "'abc'") 137 self.assertEqual(repr('ab\\c'), "'ab\\\\c'") 138 self.assertEqual(repr('ab\\'), "'ab\\\\'") 139 self.assertEqual(repr('\\c'), "'\\\\c'") 140 self.assertEqual(repr('\\'), "'\\\\'") 141 self.assertEqual(repr('\n'), "'\\n'") 142 self.assertEqual(repr('\r'), "'\\r'") 143 self.assertEqual(repr('\t'), "'\\t'") 144 self.assertEqual(repr('\b'), "'\\x08'") 145 self.assertEqual(repr("'\""), """'\\'"'""") 146 self.assertEqual(repr("'\""), """'\\'"'""") 147 self.assertEqual(repr("'"), '''"'"''') 148 self.assertEqual(repr('"'), """'"'""") 149 latin1repr = ( 150 "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r" 151 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a" 152 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI" 153 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f" 154 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d" 155 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b" 156 "\\x9c\\x9d\\x9e\\x9f\\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9" 157 "\xaa\xab\xac\\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7" 158 "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5" 159 "\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3" 160 "\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1" 161 "\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef" 162 "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd" 163 "\xfe\xff'") 164 testrepr = repr(''.join(map(chr, range(256)))) 165 self.assertEqual(testrepr, latin1repr) 166 # Test repr works on wide unicode escapes without 
overflow. 167 self.assertEqual(repr("\U00010000" * 39 + "\uffff" * 4096), 168 repr("\U00010000" * 39 + "\uffff" * 4096)) 169 170 class WrongRepr: 171 def __repr__(self): 172 return b'byte-repr' 173 self.assertRaises(TypeError, repr, WrongRepr()) 174 175 def test_iterators(self): 176 # Make sure unicode objects have an __iter__ method 177 it = "\u1111\u2222\u3333".__iter__() 178 self.assertEqual(next(it), "\u1111") 179 self.assertEqual(next(it), "\u2222") 180 self.assertEqual(next(it), "\u3333") 181 self.assertRaises(StopIteration, next, it) 182 183 def test_count(self): 184 string_tests.CommonTest.test_count(self) 185 # check mixed argument types 186 self.checkequalnofix(3, 'aaa', 'count', 'a') 187 self.checkequalnofix(0, 'aaa', 'count', 'b') 188 self.checkequalnofix(3, 'aaa', 'count', 'a') 189 self.checkequalnofix(0, 'aaa', 'count', 'b') 190 self.checkequalnofix(0, 'aaa', 'count', 'b') 191 self.checkequalnofix(1, 'aaa', 'count', 'a', -1) 192 self.checkequalnofix(3, 'aaa', 'count', 'a', -10) 193 self.checkequalnofix(2, 'aaa', 'count', 'a', 0, -1) 194 self.checkequalnofix(0, 'aaa', 'count', 'a', 0, -10) 195 # test mixed kinds 196 self.checkequal(10, '\u0102' + 'a' * 10, 'count', 'a') 197 self.checkequal(10, '\U00100304' + 'a' * 10, 'count', 'a') 198 self.checkequal(10, '\U00100304' + '\u0102' * 10, 'count', '\u0102') 199 self.checkequal(0, 'a' * 10, 'count', '\u0102') 200 self.checkequal(0, 'a' * 10, 'count', '\U00100304') 201 self.checkequal(0, '\u0102' * 10, 'count', '\U00100304') 202 self.checkequal(10, '\u0102' + 'a_' * 10, 'count', 'a_') 203 self.checkequal(10, '\U00100304' + 'a_' * 10, 'count', 'a_') 204 self.checkequal(10, '\U00100304' + '\u0102_' * 10, 'count', '\u0102_') 205 self.checkequal(0, 'a' * 10, 'count', 'a\u0102') 206 self.checkequal(0, 'a' * 10, 'count', 'a\U00100304') 207 self.checkequal(0, '\u0102' * 10, 'count', '\u0102\U00100304') 208 209 def test_find(self): 210 string_tests.CommonTest.test_find(self) 211 # test implementation details of the 
memchr fast path 212 self.checkequal(100, 'a' * 100 + '\u0102', 'find', '\u0102') 213 self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0201') 214 self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0120') 215 self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0220') 216 self.checkequal(100, 'a' * 100 + '\U00100304', 'find', '\U00100304') 217 self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00100204') 218 self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00102004') 219 # check mixed argument types 220 self.checkequalnofix(0, 'abcdefghiabc', 'find', 'abc') 221 self.checkequalnofix(9, 'abcdefghiabc', 'find', 'abc', 1) 222 self.checkequalnofix(-1, 'abcdefghiabc', 'find', 'def', 4) 223 224 self.assertRaises(TypeError, 'hello'.find) 225 self.assertRaises(TypeError, 'hello'.find, 42) 226 # test mixed kinds 227 self.checkequal(100, '\u0102' * 100 + 'a', 'find', 'a') 228 self.checkequal(100, '\U00100304' * 100 + 'a', 'find', 'a') 229 self.checkequal(100, '\U00100304' * 100 + '\u0102', 'find', '\u0102') 230 self.checkequal(-1, 'a' * 100, 'find', '\u0102') 231 self.checkequal(-1, 'a' * 100, 'find', '\U00100304') 232 self.checkequal(-1, '\u0102' * 100, 'find', '\U00100304') 233 self.checkequal(100, '\u0102' * 100 + 'a_', 'find', 'a_') 234 self.checkequal(100, '\U00100304' * 100 + 'a_', 'find', 'a_') 235 self.checkequal(100, '\U00100304' * 100 + '\u0102_', 'find', '\u0102_') 236 self.checkequal(-1, 'a' * 100, 'find', 'a\u0102') 237 self.checkequal(-1, 'a' * 100, 'find', 'a\U00100304') 238 self.checkequal(-1, '\u0102' * 100, 'find', '\u0102\U00100304') 239 240 def test_rfind(self): 241 string_tests.CommonTest.test_rfind(self) 242 # test implementation details of the memrchr fast path 243 self.checkequal(0, '\u0102' + 'a' * 100 , 'rfind', '\u0102') 244 self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0201') 245 self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0120') 246 self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0220') 247 
self.checkequal(0, '\U00100304' + 'a' * 100, 'rfind', '\U00100304') 248 self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00100204') 249 self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00102004') 250 # check mixed argument types 251 self.checkequalnofix(9, 'abcdefghiabc', 'rfind', 'abc') 252 self.checkequalnofix(12, 'abcdefghiabc', 'rfind', '') 253 self.checkequalnofix(12, 'abcdefghiabc', 'rfind', '') 254 # test mixed kinds 255 self.checkequal(0, 'a' + '\u0102' * 100, 'rfind', 'a') 256 self.checkequal(0, 'a' + '\U00100304' * 100, 'rfind', 'a') 257 self.checkequal(0, '\u0102' + '\U00100304' * 100, 'rfind', '\u0102') 258 self.checkequal(-1, 'a' * 100, 'rfind', '\u0102') 259 self.checkequal(-1, 'a' * 100, 'rfind', '\U00100304') 260 self.checkequal(-1, '\u0102' * 100, 'rfind', '\U00100304') 261 self.checkequal(0, '_a' + '\u0102' * 100, 'rfind', '_a') 262 self.checkequal(0, '_a' + '\U00100304' * 100, 'rfind', '_a') 263 self.checkequal(0, '_\u0102' + '\U00100304' * 100, 'rfind', '_\u0102') 264 self.checkequal(-1, 'a' * 100, 'rfind', '\u0102a') 265 self.checkequal(-1, 'a' * 100, 'rfind', '\U00100304a') 266 self.checkequal(-1, '\u0102' * 100, 'rfind', '\U00100304\u0102') 267 268 def test_index(self): 269 string_tests.CommonTest.test_index(self) 270 self.checkequalnofix(0, 'abcdefghiabc', 'index', '') 271 self.checkequalnofix(3, 'abcdefghiabc', 'index', 'def') 272 self.checkequalnofix(0, 'abcdefghiabc', 'index', 'abc') 273 self.checkequalnofix(9, 'abcdefghiabc', 'index', 'abc', 1) 274 self.assertRaises(ValueError, 'abcdefghiabc'.index, 'hib') 275 self.assertRaises(ValueError, 'abcdefghiab'.index, 'abc', 1) 276 self.assertRaises(ValueError, 'abcdefghi'.index, 'ghi', 8) 277 self.assertRaises(ValueError, 'abcdefghi'.index, 'ghi', -1) 278 # test mixed kinds 279 self.checkequal(100, '\u0102' * 100 + 'a', 'index', 'a') 280 self.checkequal(100, '\U00100304' * 100 + 'a', 'index', 'a') 281 self.checkequal(100, '\U00100304' * 100 + '\u0102', 'index', '\u0102') 282 
self.assertRaises(ValueError, ('a' * 100).index, '\u0102') 283 self.assertRaises(ValueError, ('a' * 100).index, '\U00100304') 284 self.assertRaises(ValueError, ('\u0102' * 100).index, '\U00100304') 285 self.checkequal(100, '\u0102' * 100 + 'a_', 'index', 'a_') 286 self.checkequal(100, '\U00100304' * 100 + 'a_', 'index', 'a_') 287 self.checkequal(100, '\U00100304' * 100 + '\u0102_', 'index', '\u0102_') 288 self.assertRaises(ValueError, ('a' * 100).index, 'a\u0102') 289 self.assertRaises(ValueError, ('a' * 100).index, 'a\U00100304') 290 self.assertRaises(ValueError, ('\u0102' * 100).index, '\u0102\U00100304') 291 292 def test_rindex(self): 293 string_tests.CommonTest.test_rindex(self) 294 self.checkequalnofix(12, 'abcdefghiabc', 'rindex', '') 295 self.checkequalnofix(3, 'abcdefghiabc', 'rindex', 'def') 296 self.checkequalnofix(9, 'abcdefghiabc', 'rindex', 'abc') 297 self.checkequalnofix(0, 'abcdefghiabc', 'rindex', 'abc', 0, -1) 298 299 self.assertRaises(ValueError, 'abcdefghiabc'.rindex, 'hib') 300 self.assertRaises(ValueError, 'defghiabc'.rindex, 'def', 1) 301 self.assertRaises(ValueError, 'defghiabc'.rindex, 'abc', 0, -1) 302 self.assertRaises(ValueError, 'abcdefghi'.rindex, 'ghi', 0, 8) 303 self.assertRaises(ValueError, 'abcdefghi'.rindex, 'ghi', 0, -1) 304 # test mixed kinds 305 self.checkequal(0, 'a' + '\u0102' * 100, 'rindex', 'a') 306 self.checkequal(0, 'a' + '\U00100304' * 100, 'rindex', 'a') 307 self.checkequal(0, '\u0102' + '\U00100304' * 100, 'rindex', '\u0102') 308 self.assertRaises(ValueError, ('a' * 100).rindex, '\u0102') 309 self.assertRaises(ValueError, ('a' * 100).rindex, '\U00100304') 310 self.assertRaises(ValueError, ('\u0102' * 100).rindex, '\U00100304') 311 self.checkequal(0, '_a' + '\u0102' * 100, 'rindex', '_a') 312 self.checkequal(0, '_a' + '\U00100304' * 100, 'rindex', '_a') 313 self.checkequal(0, '_\u0102' + '\U00100304' * 100, 'rindex', '_\u0102') 314 self.assertRaises(ValueError, ('a' * 100).rindex, '\u0102a') 315 
self.assertRaises(ValueError, ('a' * 100).rindex, '\U00100304a') 316 self.assertRaises(ValueError, ('\u0102' * 100).rindex, '\U00100304\u0102') 317 318 def test_maketrans_translate(self): 319 # these work with plain translate() 320 self.checkequalnofix('bbbc', 'abababc', 'translate', 321 {ord('a'): None}) 322 self.checkequalnofix('iiic', 'abababc', 'translate', 323 {ord('a'): None, ord('b'): ord('i')}) 324 self.checkequalnofix('iiix', 'abababc', 'translate', 325 {ord('a'): None, ord('b'): ord('i'), ord('c'): 'x'}) 326 self.checkequalnofix('c', 'abababc', 'translate', 327 {ord('a'): None, ord('b'): ''}) 328 self.checkequalnofix('xyyx', 'xzx', 'translate', 329 {ord('z'): 'yy'}) 330 331 # this needs maketrans() 332 self.checkequalnofix('abababc', 'abababc', 'translate', 333 {'b': '<i>'}) 334 tbl = self.type2test.maketrans({'a': None, 'b': '<i>'}) 335 self.checkequalnofix('<i><i><i>c', 'abababc', 'translate', tbl) 336 # test alternative way of calling maketrans() 337 tbl = self.type2test.maketrans('abc', 'xyz', 'd') 338 self.checkequalnofix('xyzzy', 'abdcdcbdddd', 'translate', tbl) 339 340 # various tests switching from ASCII to latin1 or the opposite; 341 # same length, remove a letter, or replace with a longer string. 
342 self.assertEqual("[a]".translate(str.maketrans('a', 'X')), 343 "[X]") 344 self.assertEqual("[a]".translate(str.maketrans({'a': 'X'})), 345 "[X]") 346 self.assertEqual("[a]".translate(str.maketrans({'a': None})), 347 "[]") 348 self.assertEqual("[a]".translate(str.maketrans({'a': 'XXX'})), 349 "[XXX]") 350 self.assertEqual("[a]".translate(str.maketrans({'a': '\xe9'})), 351 "[\xe9]") 352 self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '123'})), 353 "x123") 354 self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '\xe9'})), 355 "x\xe9") 356 357 # test non-ASCII (don't take the fast-path) 358 self.assertEqual("[a]".translate(str.maketrans({'a': '<\xe9>'})), 359 "[<\xe9>]") 360 self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': 'a'})), 361 "[a]") 362 self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': None})), 363 "[]") 364 self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': '123'})), 365 "[123]") 366 self.assertEqual("[a\xe9]".translate(str.maketrans({'a': '<\u20ac>'})), 367 "[<\u20ac>\xe9]") 368 369 # invalid Unicode characters 370 invalid_char = 0x10ffff+1 371 for before in "a\xe9\u20ac\U0010ffff": 372 mapping = str.maketrans({before: invalid_char}) 373 text = "[%s]" % before 374 self.assertRaises(ValueError, text.translate, mapping) 375 376 # errors 377 self.assertRaises(TypeError, self.type2test.maketrans) 378 self.assertRaises(ValueError, self.type2test.maketrans, 'abc', 'defg') 379 self.assertRaises(TypeError, self.type2test.maketrans, 2, 'def') 380 self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 2) 381 self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 'def', 2) 382 self.assertRaises(ValueError, self.type2test.maketrans, {'xy': 2}) 383 self.assertRaises(TypeError, self.type2test.maketrans, {(1,): 2}) 384 385 self.assertRaises(TypeError, 'hello'.translate) 386 self.assertRaises(TypeError, 'abababc'.translate, 'abc', 'xyz') 387 388 def test_split(self): 389 
string_tests.CommonTest.test_split(self) 390 391 # test mixed kinds 392 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'): 393 left *= 9 394 right *= 9 395 for delim in ('c', '\u0102', '\U00010302'): 396 self.checkequal([left + right], 397 left + right, 'split', delim) 398 self.checkequal([left, right], 399 left + delim + right, 'split', delim) 400 self.checkequal([left + right], 401 left + right, 'split', delim * 2) 402 self.checkequal([left, right], 403 left + delim * 2 + right, 'split', delim *2) 404 405 def test_rsplit(self): 406 string_tests.CommonTest.test_rsplit(self) 407 # test mixed kinds 408 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'): 409 left *= 9 410 right *= 9 411 for delim in ('c', '\u0102', '\U00010302'): 412 self.checkequal([left + right], 413 left + right, 'rsplit', delim) 414 self.checkequal([left, right], 415 left + delim + right, 'rsplit', delim) 416 self.checkequal([left + right], 417 left + right, 'rsplit', delim * 2) 418 self.checkequal([left, right], 419 left + delim * 2 + right, 'rsplit', delim *2) 420 421 def test_partition(self): 422 string_tests.MixinStrUnicodeUserStringTest.test_partition(self) 423 # test mixed kinds 424 self.checkequal(('ABCDEFGH', '', ''), 'ABCDEFGH', 'partition', '\u4200') 425 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'): 426 left *= 9 427 right *= 9 428 for delim in ('c', '\u0102', '\U00010302'): 429 self.checkequal((left + right, '', ''), 430 left + right, 'partition', delim) 431 self.checkequal((left, delim, right), 432 left + delim + right, 'partition', delim) 433 self.checkequal((left + right, '', ''), 434 left + right, 'partition', delim * 2) 435 self.checkequal((left, delim * 2, right), 436 left + delim * 2 + right, 'partition', delim * 2) 437 438 def test_rpartition(self): 439 string_tests.MixinStrUnicodeUserStringTest.test_rpartition(self) 440 # test mixed kinds 441 self.checkequal(('', '', 'ABCDEFGH'), 'ABCDEFGH', 'rpartition', '\u4200') 442 for 
left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'): 443 left *= 9 444 right *= 9 445 for delim in ('c', '\u0102', '\U00010302'): 446 self.checkequal(('', '', left + right), 447 left + right, 'rpartition', delim) 448 self.checkequal((left, delim, right), 449 left + delim + right, 'rpartition', delim) 450 self.checkequal(('', '', left + right), 451 left + right, 'rpartition', delim * 2) 452 self.checkequal((left, delim * 2, right), 453 left + delim * 2 + right, 'rpartition', delim * 2) 454 455 def test_join(self): 456 string_tests.MixinStrUnicodeUserStringTest.test_join(self) 457 458 class MyWrapper: 459 def __init__(self, sval): self.sval = sval 460 def __str__(self): return self.sval 461 462 # mixed arguments 463 self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd']) 464 self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd')) 465 self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz')) 466 self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd']) 467 self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd']) 468 self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd')) 469 self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz')) 470 self.checkraises(TypeError, ' ', 'join', ['1', '2', MyWrapper('foo')]) 471 self.checkraises(TypeError, ' ', 'join', ['1', '2', '3', bytes()]) 472 self.checkraises(TypeError, ' ', 'join', [1, 2, 3]) 473 self.checkraises(TypeError, ' ', 'join', ['1', '2', 3]) 474 475 @unittest.skipIf(sys.maxsize > 2**32, 476 'needs too much memory on a 64-bit platform') 477 def test_join_overflow(self): 478 size = int(sys.maxsize**0.5) + 1 479 seq = ('A' * size,) * size 480 self.assertRaises(OverflowError, ''.join, seq) 481 482 def test_replace(self): 483 string_tests.CommonTest.test_replace(self) 484 485 # method call forwarded from str implementation because of unicode argument 486 self.checkequalnofix('one@two!three!', 'one!two!three!', 'replace', '!', '@', 1) 487 
self.assertRaises(TypeError, 'replace'.replace, "r", 42) 488 # test mixed kinds 489 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'): 490 left *= 9 491 right *= 9 492 for delim in ('c', '\u0102', '\U00010302'): 493 for repl in ('d', '\u0103', '\U00010303'): 494 self.checkequal(left + right, 495 left + right, 'replace', delim, repl) 496 self.checkequal(left + repl + right, 497 left + delim + right, 498 'replace', delim, repl) 499 self.checkequal(left + right, 500 left + right, 'replace', delim * 2, repl) 501 self.checkequal(left + repl + right, 502 left + delim * 2 + right, 503 'replace', delim * 2, repl) 504 505 @support.cpython_only 506 def test_replace_id(self): 507 pattern = 'abc' 508 text = 'abc def' 509 self.assertIs(text.replace(pattern, pattern), text) 510 511 def test_repeat_id_preserving(self): 512 a = '123abc1@' 513 b = '456zyx-+' 514 self.assertEqual(id(a), id(a)) 515 self.assertNotEqual(id(a), id(b)) 516 self.assertNotEqual(id(a), id(a * -4)) 517 self.assertNotEqual(id(a), id(a * 0)) 518 self.assertEqual(id(a), id(a * 1)) 519 self.assertEqual(id(a), id(1 * a)) 520 self.assertNotEqual(id(a), id(a * 2)) 521 522 class SubStr(str): 523 pass 524 525 s = SubStr('qwerty()') 526 self.assertEqual(id(s), id(s)) 527 self.assertNotEqual(id(s), id(s * -4)) 528 self.assertNotEqual(id(s), id(s * 0)) 529 self.assertNotEqual(id(s), id(s * 1)) 530 self.assertNotEqual(id(s), id(1 * s)) 531 self.assertNotEqual(id(s), id(s * 2)) 532 533 def test_bytes_comparison(self): 534 with warnings_helper.check_warnings(): 535 warnings.simplefilter('ignore', BytesWarning) 536 self.assertEqual('abc' == b'abc', False) 537 self.assertEqual('abc' != b'abc', True) 538 self.assertEqual('abc' == bytearray(b'abc'), False) 539 self.assertEqual('abc' != bytearray(b'abc'), True) 540 541 def test_comparison(self): 542 # Comparisons: 543 self.assertEqual('abc', 'abc') 544 self.assertTrue('abcd' > 'abc') 545 self.assertTrue('abc' < 'abcd') 546 547 if 0: 548 # Move these tests to a 
Unicode collation module test... 549 # Testing UTF-16 code point order comparisons... 550 551 # No surrogates, no fixup required. 552 self.assertTrue('\u0061' < '\u20ac') 553 # Non surrogate below surrogate value, no fixup required 554 self.assertTrue('\u0061' < '\ud800\udc02') 555 556 # Non surrogate above surrogate value, fixup required 557 def test_lecmp(s, s2): 558 self.assertTrue(s < s2) 559 560 def test_fixup(s): 561 s2 = '\ud800\udc01' 562 test_lecmp(s, s2) 563 s2 = '\ud900\udc01' 564 test_lecmp(s, s2) 565 s2 = '\uda00\udc01' 566 test_lecmp(s, s2) 567 s2 = '\udb00\udc01' 568 test_lecmp(s, s2) 569 s2 = '\ud800\udd01' 570 test_lecmp(s, s2) 571 s2 = '\ud900\udd01' 572 test_lecmp(s, s2) 573 s2 = '\uda00\udd01' 574 test_lecmp(s, s2) 575 s2 = '\udb00\udd01' 576 test_lecmp(s, s2) 577 s2 = '\ud800\ude01' 578 test_lecmp(s, s2) 579 s2 = '\ud900\ude01' 580 test_lecmp(s, s2) 581 s2 = '\uda00\ude01' 582 test_lecmp(s, s2) 583 s2 = '\udb00\ude01' 584 test_lecmp(s, s2) 585 s2 = '\ud800\udfff' 586 test_lecmp(s, s2) 587 s2 = '\ud900\udfff' 588 test_lecmp(s, s2) 589 s2 = '\uda00\udfff' 590 test_lecmp(s, s2) 591 s2 = '\udb00\udfff' 592 test_lecmp(s, s2) 593 594 test_fixup('\ue000') 595 test_fixup('\uff61') 596 597 # Surrogates on both sides, no fixup required 598 self.assertTrue('\ud800\udc02' < '\ud84d\udc56') 599 600 def test_islower(self): 601 super().test_islower() 602 self.checkequalnofix(False, '\u1FFc', 'islower') 603 self.assertFalse('\u2167'.islower()) 604 self.assertTrue('\u2177'.islower()) 605 # non-BMP, uppercase 606 self.assertFalse('\U00010401'.islower()) 607 self.assertFalse('\U00010427'.islower()) 608 # non-BMP, lowercase 609 self.assertTrue('\U00010429'.islower()) 610 self.assertTrue('\U0001044E'.islower()) 611 # non-BMP, non-cased 612 self.assertFalse('\U0001F40D'.islower()) 613 self.assertFalse('\U0001F46F'.islower()) 614 615 def test_isupper(self): 616 super().test_isupper() 617 if not sys.platform.startswith('java'): 618 self.checkequalnofix(False, 
'\u1FFc', 'isupper') 619 self.assertTrue('\u2167'.isupper()) 620 self.assertFalse('\u2177'.isupper()) 621 # non-BMP, uppercase 622 self.assertTrue('\U00010401'.isupper()) 623 self.assertTrue('\U00010427'.isupper()) 624 # non-BMP, lowercase 625 self.assertFalse('\U00010429'.isupper()) 626 self.assertFalse('\U0001044E'.isupper()) 627 # non-BMP, non-cased 628 self.assertFalse('\U0001F40D'.isupper()) 629 self.assertFalse('\U0001F46F'.isupper()) 630 631 def test_istitle(self): 632 super().test_istitle() 633 self.checkequalnofix(True, '\u1FFc', 'istitle') 634 self.checkequalnofix(True, 'Greek \u1FFcitlecases ...', 'istitle') 635 636 # non-BMP, uppercase + lowercase 637 self.assertTrue('\U00010401\U00010429'.istitle()) 638 self.assertTrue('\U00010427\U0001044E'.istitle()) 639 # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6 640 for ch in ['\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F']: 641 self.assertFalse(ch.istitle(), '{!a} is not title'.format(ch)) 642 643 def test_isspace(self): 644 super().test_isspace() 645 self.checkequalnofix(True, '\u2000', 'isspace') 646 self.checkequalnofix(True, '\u200a', 'isspace') 647 self.checkequalnofix(False, '\u2014', 'isspace') 648 # There are no non-BMP whitespace chars as of Unicode 12. 
649 for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E', 650 '\U0001F40D', '\U0001F46F']: 651 self.assertFalse(ch.isspace(), '{!a} is not space.'.format(ch)) 652 653 @support.requires_resource('cpu') 654 def test_isspace_invariant(self): 655 for codepoint in range(sys.maxunicode + 1): 656 char = chr(codepoint) 657 bidirectional = unicodedata.bidirectional(char) 658 category = unicodedata.category(char) 659 self.assertEqual(char.isspace(), 660 (bidirectional in ('WS', 'B', 'S') 661 or category == 'Zs')) 662 663 def test_isalnum(self): 664 super().test_isalnum() 665 for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E', 666 '\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']: 667 self.assertTrue(ch.isalnum(), '{!a} is alnum.'.format(ch)) 668 669 def test_isalpha(self): 670 super().test_isalpha() 671 self.checkequalnofix(True, '\u1FFc', 'isalpha') 672 # non-BMP, cased 673 self.assertTrue('\U00010401'.isalpha()) 674 self.assertTrue('\U00010427'.isalpha()) 675 self.assertTrue('\U00010429'.isalpha()) 676 self.assertTrue('\U0001044E'.isalpha()) 677 # non-BMP, non-cased 678 self.assertFalse('\U0001F40D'.isalpha()) 679 self.assertFalse('\U0001F46F'.isalpha()) 680 681 def test_isascii(self): 682 super().test_isascii() 683 self.assertFalse("\u20ac".isascii()) 684 self.assertFalse("\U0010ffff".isascii()) 685 686 def test_isdecimal(self): 687 self.checkequalnofix(False, '', 'isdecimal') 688 self.checkequalnofix(False, 'a', 'isdecimal') 689 self.checkequalnofix(True, '0', 'isdecimal') 690 self.checkequalnofix(False, '\u2460', 'isdecimal') # CIRCLED DIGIT ONE 691 self.checkequalnofix(False, '\xbc', 'isdecimal') # VULGAR FRACTION ONE QUARTER 692 self.checkequalnofix(True, '\u0660', 'isdecimal') # ARABIC-INDIC DIGIT ZERO 693 self.checkequalnofix(True, '0123456789', 'isdecimal') 694 self.checkequalnofix(False, '0123456789a', 'isdecimal') 695 696 self.checkraises(TypeError, 'abc', 'isdecimal', 42) 697 698 for ch in ['\U00010401', '\U00010427', 
'\U00010429', '\U0001044E', 699 '\U0001F40D', '\U0001F46F', '\U00011065', '\U0001F107']: 700 self.assertFalse(ch.isdecimal(), '{!a} is not decimal.'.format(ch)) 701 for ch in ['\U0001D7F6', '\U00011066', '\U000104A0']: 702 self.assertTrue(ch.isdecimal(), '{!a} is decimal.'.format(ch)) 703 704 def test_isdigit(self): 705 super().test_isdigit() 706 self.checkequalnofix(True, '\u2460', 'isdigit') 707 self.checkequalnofix(False, '\xbc', 'isdigit') 708 self.checkequalnofix(True, '\u0660', 'isdigit') 709 710 for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E', 711 '\U0001F40D', '\U0001F46F', '\U00011065']: 712 self.assertFalse(ch.isdigit(), '{!a} is not a digit.'.format(ch)) 713 for ch in ['\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']: 714 self.assertTrue(ch.isdigit(), '{!a} is a digit.'.format(ch)) 715 716 def test_isnumeric(self): 717 self.checkequalnofix(False, '', 'isnumeric') 718 self.checkequalnofix(False, 'a', 'isnumeric') 719 self.checkequalnofix(True, '0', 'isnumeric') 720 self.checkequalnofix(True, '\u2460', 'isnumeric') 721 self.checkequalnofix(True, '\xbc', 'isnumeric') 722 self.checkequalnofix(True, '\u0660', 'isnumeric') 723 self.checkequalnofix(True, '0123456789', 'isnumeric') 724 self.checkequalnofix(False, '0123456789a', 'isnumeric') 725 726 self.assertRaises(TypeError, "abc".isnumeric, 42) 727 728 for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E', 729 '\U0001F40D', '\U0001F46F']: 730 self.assertFalse(ch.isnumeric(), '{!a} is not numeric.'.format(ch)) 731 for ch in ['\U00011065', '\U0001D7F6', '\U00011066', 732 '\U000104A0', '\U0001F107']: 733 self.assertTrue(ch.isnumeric(), '{!a} is numeric.'.format(ch)) 734 735 def test_isidentifier(self): 736 self.assertTrue("a".isidentifier()) 737 self.assertTrue("Z".isidentifier()) 738 self.assertTrue("_".isidentifier()) 739 self.assertTrue("b0".isidentifier()) 740 self.assertTrue("bc".isidentifier()) 741 self.assertTrue("b_".isidentifier()) 742 
self.assertTrue("µ".isidentifier()) 743 self.assertTrue("".isidentifier()) 744 745 self.assertFalse(" ".isidentifier()) 746 self.assertFalse("[".isidentifier()) 747 self.assertFalse("©".isidentifier()) 748 self.assertFalse("0".isidentifier()) 749 750 @support.cpython_only 751 @support.requires_legacy_unicode_capi 752 def test_isidentifier_legacy(self): 753 import _testcapi 754 u = '' 755 self.assertTrue(u.isidentifier()) 756 with warnings_helper.check_warnings(): 757 warnings.simplefilter('ignore', DeprecationWarning) 758 self.assertTrue(_testcapi.unicode_legacy_string(u).isidentifier()) 759 760 def test_isprintable(self): 761 self.assertTrue("".isprintable()) 762 self.assertTrue(" ".isprintable()) 763 self.assertTrue("abcdefg".isprintable()) 764 self.assertFalse("abcdefg\n".isprintable()) 765 # some defined Unicode character 766 self.assertTrue("\u0374".isprintable()) 767 # undefined character 768 self.assertFalse("\u0378".isprintable()) 769 # single surrogate character 770 self.assertFalse("\ud800".isprintable()) 771 772 self.assertTrue('\U0001F46F'.isprintable()) 773 self.assertFalse('\U000E0020'.isprintable()) 774 775 def test_surrogates(self): 776 for s in ('a\uD800b\uDFFF', 'a\uDFFFb\uD800', 777 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'): 778 self.assertTrue(s.islower()) 779 self.assertFalse(s.isupper()) 780 self.assertFalse(s.istitle()) 781 for s in ('A\uD800B\uDFFF', 'A\uDFFFB\uD800', 782 'A\uD800B\uDFFFA', 'A\uDFFFB\uD800A'): 783 self.assertFalse(s.islower()) 784 self.assertTrue(s.isupper()) 785 self.assertTrue(s.istitle()) 786 787 for meth_name in ('islower', 'isupper', 'istitle'): 788 meth = getattr(str, meth_name) 789 for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF'): 790 self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name)) 791 792 for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace', 793 'isdecimal', 'isnumeric', 794 'isidentifier', 'isprintable'): 795 meth = getattr(str, meth_name) 796 for s in ('\uD800', '\uDFFF', 
'\uD800\uD800', '\uDFFF\uDFFF', 797 'a\uD800b\uDFFF', 'a\uDFFFb\uD800', 798 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'): 799 self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name)) 800 801 802 def test_lower(self): 803 string_tests.CommonTest.test_lower(self) 804 self.assertEqual('\U00010427'.lower(), '\U0001044F') 805 self.assertEqual('\U00010427\U00010427'.lower(), 806 '\U0001044F\U0001044F') 807 self.assertEqual('\U00010427\U0001044F'.lower(), 808 '\U0001044F\U0001044F') 809 self.assertEqual('X\U00010427x\U0001044F'.lower(), 810 'x\U0001044Fx\U0001044F') 811 self.assertEqual('fi'.lower(), 'fi') 812 self.assertEqual('\u0130'.lower(), '\u0069\u0307') 813 # Special case for GREEK CAPITAL LETTER SIGMA U+03A3 814 self.assertEqual('\u03a3'.lower(), '\u03c3') 815 self.assertEqual('\u0345\u03a3'.lower(), '\u0345\u03c3') 816 self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2') 817 self.assertEqual('A\u0345\u03a3a'.lower(), 'a\u0345\u03c3a') 818 self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2') 819 self.assertEqual('A\u03a3\u0345'.lower(), 'a\u03c2\u0345') 820 self.assertEqual('\u03a3\u0345 '.lower(), '\u03c3\u0345 ') 821 self.assertEqual('\U0008fffe'.lower(), '\U0008fffe') 822 self.assertEqual('\u2177'.lower(), '\u2177') 823 824 def test_casefold(self): 825 self.assertEqual('hello'.casefold(), 'hello') 826 self.assertEqual('hELlo'.casefold(), 'hello') 827 self.assertEqual('ß'.casefold(), 'ss') 828 self.assertEqual('fi'.casefold(), 'fi') 829 self.assertEqual('\u03a3'.casefold(), '\u03c3') 830 self.assertEqual('A\u0345\u03a3'.casefold(), 'a\u03b9\u03c3') 831 self.assertEqual('\u00b5'.casefold(), '\u03bc') 832 833 def test_upper(self): 834 string_tests.CommonTest.test_upper(self) 835 self.assertEqual('\U0001044F'.upper(), '\U00010427') 836 self.assertEqual('\U0001044F\U0001044F'.upper(), 837 '\U00010427\U00010427') 838 self.assertEqual('\U00010427\U0001044F'.upper(), 839 '\U00010427\U00010427') 840 self.assertEqual('X\U00010427x\U0001044F'.upper(), 841 
'X\U00010427X\U00010427') 842 self.assertEqual('fi'.upper(), 'FI') 843 self.assertEqual('\u0130'.upper(), '\u0130') 844 self.assertEqual('\u03a3'.upper(), '\u03a3') 845 self.assertEqual('ß'.upper(), 'SS') 846 self.assertEqual('\u1fd2'.upper(), '\u0399\u0308\u0300') 847 self.assertEqual('\U0008fffe'.upper(), '\U0008fffe') 848 self.assertEqual('\u2177'.upper(), '\u2167') 849 850 def test_capitalize(self): 851 string_tests.CommonTest.test_capitalize(self) 852 self.assertEqual('\U0001044F'.capitalize(), '\U00010427') 853 self.assertEqual('\U0001044F\U0001044F'.capitalize(), 854 '\U00010427\U0001044F') 855 self.assertEqual('\U00010427\U0001044F'.capitalize(), 856 '\U00010427\U0001044F') 857 self.assertEqual('\U0001044F\U00010427'.capitalize(), 858 '\U00010427\U0001044F') 859 self.assertEqual('X\U00010427x\U0001044F'.capitalize(), 860 'X\U0001044Fx\U0001044F') 861 self.assertEqual('h\u0130'.capitalize(), 'H\u0069\u0307') 862 exp = '\u0399\u0308\u0300\u0069\u0307' 863 self.assertEqual('\u1fd2\u0130'.capitalize(), exp) 864 self.assertEqual('finnish'.capitalize(), 'Finnish') 865 self.assertEqual('A\u0345\u03a3'.capitalize(), 'A\u0345\u03c2') 866 867 def test_title(self): 868 super().test_title() 869 self.assertEqual('\U0001044F'.title(), '\U00010427') 870 self.assertEqual('\U0001044F\U0001044F'.title(), 871 '\U00010427\U0001044F') 872 self.assertEqual('\U0001044F\U0001044F \U0001044F\U0001044F'.title(), 873 '\U00010427\U0001044F \U00010427\U0001044F') 874 self.assertEqual('\U00010427\U0001044F \U00010427\U0001044F'.title(), 875 '\U00010427\U0001044F \U00010427\U0001044F') 876 self.assertEqual('\U0001044F\U00010427 \U0001044F\U00010427'.title(), 877 '\U00010427\U0001044F \U00010427\U0001044F') 878 self.assertEqual('X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(), 879 'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F') 880 self.assertEqual('fiNNISH'.title(), 'Finnish') 881 self.assertEqual('A\u03a3 \u1fa1xy'.title(), 'A\u03c2 \u1fa9xy') 882 
self.assertEqual('A\u03a3A'.title(), 'A\u03c3a') 883 884 def test_swapcase(self): 885 string_tests.CommonTest.test_swapcase(self) 886 self.assertEqual('\U0001044F'.swapcase(), '\U00010427') 887 self.assertEqual('\U00010427'.swapcase(), '\U0001044F') 888 self.assertEqual('\U0001044F\U0001044F'.swapcase(), 889 '\U00010427\U00010427') 890 self.assertEqual('\U00010427\U0001044F'.swapcase(), 891 '\U0001044F\U00010427') 892 self.assertEqual('\U0001044F\U00010427'.swapcase(), 893 '\U00010427\U0001044F') 894 self.assertEqual('X\U00010427x\U0001044F'.swapcase(), 895 'x\U0001044FX\U00010427') 896 self.assertEqual('fi'.swapcase(), 'FI') 897 self.assertEqual('\u0130'.swapcase(), '\u0069\u0307') 898 # Special case for GREEK CAPITAL LETTER SIGMA U+03A3 899 self.assertEqual('\u03a3'.swapcase(), '\u03c3') 900 self.assertEqual('\u0345\u03a3'.swapcase(), '\u0399\u03c3') 901 self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2') 902 self.assertEqual('A\u0345\u03a3a'.swapcase(), 'a\u0399\u03c3A') 903 self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2') 904 self.assertEqual('A\u03a3\u0345'.swapcase(), 'a\u03c2\u0399') 905 self.assertEqual('\u03a3\u0345 '.swapcase(), '\u03c3\u0399 ') 906 self.assertEqual('\u03a3'.swapcase(), '\u03c3') 907 self.assertEqual('ß'.swapcase(), 'SS') 908 self.assertEqual('\u1fd2'.swapcase(), '\u0399\u0308\u0300') 909 910 def test_center(self): 911 string_tests.CommonTest.test_center(self) 912 self.assertEqual('x'.center(2, '\U0010FFFF'), 913 'x\U0010FFFF') 914 self.assertEqual('x'.center(3, '\U0010FFFF'), 915 '\U0010FFFFx\U0010FFFF') 916 self.assertEqual('x'.center(4, '\U0010FFFF'), 917 '\U0010FFFFx\U0010FFFF\U0010FFFF') 918 919 @unittest.skipUnless(sys.maxsize == 2**31 - 1, "requires 32-bit system") 920 @support.cpython_only 921 def test_case_operation_overflow(self): 922 # Issue #22643 923 size = 2**32//12 + 1 924 try: 925 s = "ü" * size 926 except MemoryError: 927 self.skipTest('no enough memory (%.0f MiB required)' % (size / 2**20)) 928 
try: 929 self.assertRaises(OverflowError, s.upper) 930 finally: 931 del s 932 933 def test_contains(self): 934 # Testing Unicode contains method 935 self.assertIn('a', 'abdb') 936 self.assertIn('a', 'bdab') 937 self.assertIn('a', 'bdaba') 938 self.assertIn('a', 'bdba') 939 self.assertNotIn('a', 'bdb') 940 self.assertIn('a', 'bdba') 941 self.assertIn('a', ('a',1,None)) 942 self.assertIn('a', (1,None,'a')) 943 self.assertIn('a', ('a',1,None)) 944 self.assertIn('a', (1,None,'a')) 945 self.assertNotIn('a', ('x',1,'y')) 946 self.assertNotIn('a', ('x',1,None)) 947 self.assertNotIn('abcd', 'abcxxxx') 948 self.assertIn('ab', 'abcd') 949 self.assertIn('ab', 'abc') 950 self.assertIn('ab', (1,None,'ab')) 951 self.assertIn('', 'abc') 952 self.assertIn('', '') 953 self.assertIn('', 'abc') 954 self.assertNotIn('\0', 'abc') 955 self.assertIn('\0', '\0abc') 956 self.assertIn('\0', 'abc\0') 957 self.assertIn('a', '\0abc') 958 self.assertIn('asdf', 'asdf') 959 self.assertNotIn('asdf', 'asd') 960 self.assertNotIn('asdf', '') 961 962 self.assertRaises(TypeError, "abc".__contains__) 963 # test mixed kinds 964 for fill in ('a', '\u0100', '\U00010300'): 965 fill *= 9 966 for delim in ('c', '\u0102', '\U00010302'): 967 self.assertNotIn(delim, fill) 968 self.assertIn(delim, fill + delim) 969 self.assertNotIn(delim * 2, fill) 970 self.assertIn(delim * 2, fill + delim * 2) 971 972 def test_issue18183(self): 973 '\U00010000\U00100000'.lower() 974 '\U00010000\U00100000'.casefold() 975 '\U00010000\U00100000'.upper() 976 '\U00010000\U00100000'.capitalize() 977 '\U00010000\U00100000'.title() 978 '\U00010000\U00100000'.swapcase() 979 '\U00100000'.center(3, '\U00010000') 980 '\U00100000'.ljust(3, '\U00010000') 981 '\U00100000'.rjust(3, '\U00010000') 982 983 def test_format(self): 984 self.assertEqual(''.format(), '') 985 self.assertEqual('a'.format(), 'a') 986 self.assertEqual('ab'.format(), 'ab') 987 self.assertEqual('a{{'.format(), 'a{') 988 self.assertEqual('a}}'.format(), 'a}') 989 
self.assertEqual('{{b'.format(), '{b') 990 self.assertEqual('}}b'.format(), '}b') 991 self.assertEqual('a{{b'.format(), 'a{b') 992 993 # examples from the PEP: 994 import datetime 995 self.assertEqual("My name is {0}".format('Fred'), "My name is Fred") 996 self.assertEqual("My name is {0[name]}".format(dict(name='Fred')), 997 "My name is Fred") 998 self.assertEqual("My name is {0} :-{{}}".format('Fred'), 999 "My name is Fred :-{}") 1000 1001 d = datetime.date(2007, 8, 18) 1002 self.assertEqual("The year is {0.year}".format(d), 1003 "The year is 2007") 1004 1005 # classes we'll use for testing 1006 class C: 1007 def __init__(self, x=100): 1008 self._x = x 1009 def __format__(self, spec): 1010 return spec 1011 1012 class D: 1013 def __init__(self, x): 1014 self.x = x 1015 def __format__(self, spec): 1016 return str(self.x) 1017 1018 # class with __str__, but no __format__ 1019 class E: 1020 def __init__(self, x): 1021 self.x = x 1022 def __str__(self): 1023 return 'E(' + self.x + ')' 1024 1025 # class with __repr__, but no __format__ or __str__ 1026 class F: 1027 def __init__(self, x): 1028 self.x = x 1029 def __repr__(self): 1030 return 'F(' + self.x + ')' 1031 1032 # class with __format__ that forwards to string, for some format_spec's 1033 class G: 1034 def __init__(self, x): 1035 self.x = x 1036 def __str__(self): 1037 return "string is " + self.x 1038 def __format__(self, format_spec): 1039 if format_spec == 'd': 1040 return 'G(' + self.x + ')' 1041 return object.__format__(self, format_spec) 1042 1043 class I(datetime.date): 1044 def __format__(self, format_spec): 1045 return self.strftime(format_spec) 1046 1047 class J(int): 1048 def __format__(self, format_spec): 1049 return int.__format__(self * 2, format_spec) 1050 1051 class M: 1052 def __init__(self, x): 1053 self.x = x 1054 def __repr__(self): 1055 return 'M(' + self.x + ')' 1056 __str__ = None 1057 1058 class N: 1059 def __init__(self, x): 1060 self.x = x 1061 def __repr__(self): 1062 return 'N(' + 
self.x + ')' 1063 __format__ = None 1064 1065 self.assertEqual(''.format(), '') 1066 self.assertEqual('abc'.format(), 'abc') 1067 self.assertEqual('{0}'.format('abc'), 'abc') 1068 self.assertEqual('{0:}'.format('abc'), 'abc') 1069# self.assertEqual('{ 0 }'.format('abc'), 'abc') 1070 self.assertEqual('X{0}'.format('abc'), 'Xabc') 1071 self.assertEqual('{0}X'.format('abc'), 'abcX') 1072 self.assertEqual('X{0}Y'.format('abc'), 'XabcY') 1073 self.assertEqual('{1}'.format(1, 'abc'), 'abc') 1074 self.assertEqual('X{1}'.format(1, 'abc'), 'Xabc') 1075 self.assertEqual('{1}X'.format(1, 'abc'), 'abcX') 1076 self.assertEqual('X{1}Y'.format(1, 'abc'), 'XabcY') 1077 self.assertEqual('{0}'.format(-15), '-15') 1078 self.assertEqual('{0}{1}'.format(-15, 'abc'), '-15abc') 1079 self.assertEqual('{0}X{1}'.format(-15, 'abc'), '-15Xabc') 1080 self.assertEqual('{{'.format(), '{') 1081 self.assertEqual('}}'.format(), '}') 1082 self.assertEqual('{{}}'.format(), '{}') 1083 self.assertEqual('{{x}}'.format(), '{x}') 1084 self.assertEqual('{{{0}}}'.format(123), '{123}') 1085 self.assertEqual('{{{{0}}}}'.format(), '{{0}}') 1086 self.assertEqual('}}{{'.format(), '}{') 1087 self.assertEqual('}}x{{'.format(), '}x{') 1088 1089 # weird field names 1090 self.assertEqual("{0[foo-bar]}".format({'foo-bar':'baz'}), 'baz') 1091 self.assertEqual("{0[foo bar]}".format({'foo bar':'baz'}), 'baz') 1092 self.assertEqual("{0[ ]}".format({' ':3}), '3') 1093 1094 self.assertEqual('{foo._x}'.format(foo=C(20)), '20') 1095 self.assertEqual('{1}{0}'.format(D(10), D(20)), '2010') 1096 self.assertEqual('{0._x.x}'.format(C(D('abc'))), 'abc') 1097 self.assertEqual('{0[0]}'.format(['abc', 'def']), 'abc') 1098 self.assertEqual('{0[1]}'.format(['abc', 'def']), 'def') 1099 self.assertEqual('{0[1][0]}'.format(['abc', ['def']]), 'def') 1100 self.assertEqual('{0[1][0].x}'.format(['abc', [D('def')]]), 'def') 1101 1102 # strings 1103 self.assertEqual('{0:.3s}'.format('abc'), 'abc') 1104 self.assertEqual('{0:.3s}'.format('ab'), 
'ab') 1105 self.assertEqual('{0:.3s}'.format('abcdef'), 'abc') 1106 self.assertEqual('{0:.0s}'.format('abcdef'), '') 1107 self.assertEqual('{0:3.3s}'.format('abc'), 'abc') 1108 self.assertEqual('{0:2.3s}'.format('abc'), 'abc') 1109 self.assertEqual('{0:2.2s}'.format('abc'), 'ab') 1110 self.assertEqual('{0:3.2s}'.format('abc'), 'ab ') 1111 self.assertEqual('{0:x<0s}'.format('result'), 'result') 1112 self.assertEqual('{0:x<5s}'.format('result'), 'result') 1113 self.assertEqual('{0:x<6s}'.format('result'), 'result') 1114 self.assertEqual('{0:x<7s}'.format('result'), 'resultx') 1115 self.assertEqual('{0:x<8s}'.format('result'), 'resultxx') 1116 self.assertEqual('{0: <7s}'.format('result'), 'result ') 1117 self.assertEqual('{0:<7s}'.format('result'), 'result ') 1118 self.assertEqual('{0:>7s}'.format('result'), ' result') 1119 self.assertEqual('{0:>8s}'.format('result'), ' result') 1120 self.assertEqual('{0:^8s}'.format('result'), ' result ') 1121 self.assertEqual('{0:^9s}'.format('result'), ' result ') 1122 self.assertEqual('{0:^10s}'.format('result'), ' result ') 1123 self.assertEqual('{0:8s}'.format('result'), 'result ') 1124 self.assertEqual('{0:0s}'.format('result'), 'result') 1125 self.assertEqual('{0:08s}'.format('result'), 'result00') 1126 self.assertEqual('{0:<08s}'.format('result'), 'result00') 1127 self.assertEqual('{0:>08s}'.format('result'), '00result') 1128 self.assertEqual('{0:^08s}'.format('result'), '0result0') 1129 self.assertEqual('{0:10000}'.format('a'), 'a' + ' ' * 9999) 1130 self.assertEqual('{0:10000}'.format(''), ' ' * 10000) 1131 self.assertEqual('{0:10000000}'.format(''), ' ' * 10000000) 1132 1133 # issue 12546: use \x00 as a fill character 1134 self.assertEqual('{0:\x00<6s}'.format('foo'), 'foo\x00\x00\x00') 1135 self.assertEqual('{0:\x01<6s}'.format('foo'), 'foo\x01\x01\x01') 1136 self.assertEqual('{0:\x00^6s}'.format('foo'), '\x00foo\x00\x00') 1137 self.assertEqual('{0:^6s}'.format('foo'), ' foo ') 1138 1139 
self.assertEqual('{0:\x00<6}'.format(3), '3\x00\x00\x00\x00\x00') 1140 self.assertEqual('{0:\x01<6}'.format(3), '3\x01\x01\x01\x01\x01') 1141 self.assertEqual('{0:\x00^6}'.format(3), '\x00\x003\x00\x00\x00') 1142 self.assertEqual('{0:<6}'.format(3), '3 ') 1143 1144 self.assertEqual('{0:\x00<6}'.format(3.14), '3.14\x00\x00') 1145 self.assertEqual('{0:\x01<6}'.format(3.14), '3.14\x01\x01') 1146 self.assertEqual('{0:\x00^6}'.format(3.14), '\x003.14\x00') 1147 self.assertEqual('{0:^6}'.format(3.14), ' 3.14 ') 1148 1149 self.assertEqual('{0:\x00<12}'.format(3+2.0j), '(3+2j)\x00\x00\x00\x00\x00\x00') 1150 self.assertEqual('{0:\x01<12}'.format(3+2.0j), '(3+2j)\x01\x01\x01\x01\x01\x01') 1151 self.assertEqual('{0:\x00^12}'.format(3+2.0j), '\x00\x00\x00(3+2j)\x00\x00\x00') 1152 self.assertEqual('{0:^12}'.format(3+2.0j), ' (3+2j) ') 1153 1154 # format specifiers for user defined type 1155 self.assertEqual('{0:abc}'.format(C()), 'abc') 1156 1157 # !r, !s and !a coercions 1158 self.assertEqual('{0!s}'.format('Hello'), 'Hello') 1159 self.assertEqual('{0!s:}'.format('Hello'), 'Hello') 1160 self.assertEqual('{0!s:15}'.format('Hello'), 'Hello ') 1161 self.assertEqual('{0!s:15s}'.format('Hello'), 'Hello ') 1162 self.assertEqual('{0!r}'.format('Hello'), "'Hello'") 1163 self.assertEqual('{0!r:}'.format('Hello'), "'Hello'") 1164 self.assertEqual('{0!r}'.format(F('Hello')), 'F(Hello)') 1165 self.assertEqual('{0!r}'.format('\u0378'), "'\\u0378'") # nonprintable 1166 self.assertEqual('{0!r}'.format('\u0374'), "'\u0374'") # printable 1167 self.assertEqual('{0!r}'.format(F('\u0374')), 'F(\u0374)') 1168 self.assertEqual('{0!a}'.format('Hello'), "'Hello'") 1169 self.assertEqual('{0!a}'.format('\u0378'), "'\\u0378'") # nonprintable 1170 self.assertEqual('{0!a}'.format('\u0374'), "'\\u0374'") # printable 1171 self.assertEqual('{0!a:}'.format('Hello'), "'Hello'") 1172 self.assertEqual('{0!a}'.format(F('Hello')), 'F(Hello)') 1173 self.assertEqual('{0!a}'.format(F('\u0374')), 'F(\\u0374)') 1174 
1175 # test fallback to object.__format__ 1176 self.assertEqual('{0}'.format({}), '{}') 1177 self.assertEqual('{0}'.format([]), '[]') 1178 self.assertEqual('{0}'.format([1]), '[1]') 1179 1180 self.assertEqual('{0:d}'.format(G('data')), 'G(data)') 1181 self.assertEqual('{0!s}'.format(G('data')), 'string is data') 1182 1183 self.assertRaises(TypeError, '{0:^10}'.format, E('data')) 1184 self.assertRaises(TypeError, '{0:^10s}'.format, E('data')) 1185 self.assertRaises(TypeError, '{0:>15s}'.format, G('data')) 1186 1187 self.assertEqual("{0:date: %Y-%m-%d}".format(I(year=2007, 1188 month=8, 1189 day=27)), 1190 "date: 2007-08-27") 1191 1192 # test deriving from a builtin type and overriding __format__ 1193 self.assertEqual("{0}".format(J(10)), "20") 1194 1195 1196 # string format specifiers 1197 self.assertEqual('{0:}'.format('a'), 'a') 1198 1199 # computed format specifiers 1200 self.assertEqual("{0:.{1}}".format('hello world', 5), 'hello') 1201 self.assertEqual("{0:.{1}s}".format('hello world', 5), 'hello') 1202 self.assertEqual("{0:.{precision}s}".format('hello world', precision=5), 'hello') 1203 self.assertEqual("{0:{width}.{precision}s}".format('hello world', width=10, precision=5), 'hello ') 1204 self.assertEqual("{0:{width}.{precision}s}".format('hello world', width='10', precision='5'), 'hello ') 1205 1206 # test various errors 1207 self.assertRaises(ValueError, '{'.format) 1208 self.assertRaises(ValueError, '}'.format) 1209 self.assertRaises(ValueError, 'a{'.format) 1210 self.assertRaises(ValueError, 'a}'.format) 1211 self.assertRaises(ValueError, '{a'.format) 1212 self.assertRaises(ValueError, '}a'.format) 1213 self.assertRaises(IndexError, '{0}'.format) 1214 self.assertRaises(IndexError, '{1}'.format, 'abc') 1215 self.assertRaises(KeyError, '{x}'.format) 1216 self.assertRaises(ValueError, "}{".format) 1217 self.assertRaises(ValueError, "abc{0:{}".format) 1218 self.assertRaises(ValueError, "{0".format) 1219 self.assertRaises(IndexError, "{0.}".format) 1220 
self.assertRaises(ValueError, "{0.}".format, 0) 1221 self.assertRaises(ValueError, "{0[}".format) 1222 self.assertRaises(ValueError, "{0[}".format, []) 1223 self.assertRaises(KeyError, "{0]}".format) 1224 self.assertRaises(ValueError, "{0.[]}".format, 0) 1225 self.assertRaises(ValueError, "{0..foo}".format, 0) 1226 self.assertRaises(ValueError, "{0[0}".format, 0) 1227 self.assertRaises(ValueError, "{0[0:foo}".format, 0) 1228 self.assertRaises(KeyError, "{c]}".format) 1229 self.assertRaises(ValueError, "{{ {{{0}}".format, 0) 1230 self.assertRaises(ValueError, "{0}}".format, 0) 1231 self.assertRaises(KeyError, "{foo}".format, bar=3) 1232 self.assertRaises(ValueError, "{0!x}".format, 3) 1233 self.assertRaises(ValueError, "{0!}".format, 0) 1234 self.assertRaises(ValueError, "{0!rs}".format, 0) 1235 self.assertRaises(ValueError, "{!}".format) 1236 self.assertRaises(IndexError, "{:}".format) 1237 self.assertRaises(IndexError, "{:s}".format) 1238 self.assertRaises(IndexError, "{}".format) 1239 big = "23098475029384702983476098230754973209482573" 1240 self.assertRaises(ValueError, ("{" + big + "}").format) 1241 self.assertRaises(ValueError, ("{[" + big + "]}").format, [0]) 1242 1243 # issue 6089 1244 self.assertRaises(ValueError, "{0[0]x}".format, [None]) 1245 self.assertRaises(ValueError, "{0[0](10)}".format, [None]) 1246 1247 # can't have a replacement on the field name portion 1248 self.assertRaises(TypeError, '{0[{1}]}'.format, 'abcdefg', 4) 1249 1250 # exceed maximum recursion depth 1251 self.assertRaises(ValueError, "{0:{1:{2}}}".format, 'abc', 's', '') 1252 self.assertRaises(ValueError, "{0:{1:{2:{3:{4:{5:{6}}}}}}}".format, 1253 0, 1, 2, 3, 4, 5, 6, 7) 1254 1255 # string format spec errors 1256 sign_msg = "Sign not allowed in string format specifier" 1257 self.assertRaisesRegex(ValueError, sign_msg, "{0:-s}".format, '') 1258 self.assertRaisesRegex(ValueError, sign_msg, format, "", "-") 1259 space_msg = "Space not allowed in string format specifier" 1260 
self.assertRaisesRegex(ValueError, space_msg, "{: }".format, '') 1261 self.assertRaises(ValueError, "{0:=s}".format, '') 1262 1263 # Alternate formatting is not supported 1264 self.assertRaises(ValueError, format, '', '#') 1265 self.assertRaises(ValueError, format, '', '#20') 1266 1267 # Non-ASCII 1268 self.assertEqual("{0:s}{1:s}".format("ABC", "\u0410\u0411\u0412"), 1269 'ABC\u0410\u0411\u0412') 1270 self.assertEqual("{0:.3s}".format("ABC\u0410\u0411\u0412"), 1271 'ABC') 1272 self.assertEqual("{0:.0s}".format("ABC\u0410\u0411\u0412"), 1273 '') 1274 1275 self.assertEqual("{[{}]}".format({"{}": 5}), "5") 1276 self.assertEqual("{[{}]}".format({"{}" : "a"}), "a") 1277 self.assertEqual("{[{]}".format({"{" : "a"}), "a") 1278 self.assertEqual("{[}]}".format({"}" : "a"}), "a") 1279 self.assertEqual("{[[]}".format({"[" : "a"}), "a") 1280 self.assertEqual("{[!]}".format({"!" : "a"}), "a") 1281 self.assertRaises(ValueError, "{a{}b}".format, 42) 1282 self.assertRaises(ValueError, "{a{b}".format, 42) 1283 self.assertRaises(ValueError, "{[}".format, 42) 1284 1285 self.assertEqual("0x{:0{:d}X}".format(0x0,16), "0x0000000000000000") 1286 1287 # Blocking fallback 1288 m = M('data') 1289 self.assertEqual("{!r}".format(m), 'M(data)') 1290 self.assertRaises(TypeError, "{!s}".format, m) 1291 self.assertRaises(TypeError, "{}".format, m) 1292 n = N('data') 1293 self.assertEqual("{!r}".format(n), 'N(data)') 1294 self.assertEqual("{!s}".format(n), 'N(data)') 1295 self.assertRaises(TypeError, "{}".format, n) 1296 1297 def test_format_map(self): 1298 self.assertEqual(''.format_map({}), '') 1299 self.assertEqual('a'.format_map({}), 'a') 1300 self.assertEqual('ab'.format_map({}), 'ab') 1301 self.assertEqual('a{{'.format_map({}), 'a{') 1302 self.assertEqual('a}}'.format_map({}), 'a}') 1303 self.assertEqual('{{b'.format_map({}), '{b') 1304 self.assertEqual('}}b'.format_map({}), '}b') 1305 self.assertEqual('a{{b'.format_map({}), 'a{b') 1306 1307 # using mappings 1308 class Mapping(dict): 1309 
def __missing__(self, key): 1310 return key 1311 self.assertEqual('{hello}'.format_map(Mapping()), 'hello') 1312 self.assertEqual('{a} {world}'.format_map(Mapping(a='hello')), 'hello world') 1313 1314 class InternalMapping: 1315 def __init__(self): 1316 self.mapping = {'a': 'hello'} 1317 def __getitem__(self, key): 1318 return self.mapping[key] 1319 self.assertEqual('{a}'.format_map(InternalMapping()), 'hello') 1320 1321 1322 class C: 1323 def __init__(self, x=100): 1324 self._x = x 1325 def __format__(self, spec): 1326 return spec 1327 self.assertEqual('{foo._x}'.format_map({'foo': C(20)}), '20') 1328 1329 # test various errors 1330 self.assertRaises(TypeError, ''.format_map) 1331 self.assertRaises(TypeError, 'a'.format_map) 1332 1333 self.assertRaises(ValueError, '{'.format_map, {}) 1334 self.assertRaises(ValueError, '}'.format_map, {}) 1335 self.assertRaises(ValueError, 'a{'.format_map, {}) 1336 self.assertRaises(ValueError, 'a}'.format_map, {}) 1337 self.assertRaises(ValueError, '{a'.format_map, {}) 1338 self.assertRaises(ValueError, '}a'.format_map, {}) 1339 1340 # issue #12579: can't supply positional params to format_map 1341 self.assertRaises(ValueError, '{}'.format_map, {'a' : 2}) 1342 self.assertRaises(ValueError, '{}'.format_map, 'a') 1343 self.assertRaises(ValueError, '{a} {}'.format_map, {"a" : 2, "b" : 1}) 1344 1345 class BadMapping: 1346 def __getitem__(self, key): 1347 return 1/0 1348 self.assertRaises(KeyError, '{a}'.format_map, {}) 1349 self.assertRaises(TypeError, '{a}'.format_map, []) 1350 self.assertRaises(ZeroDivisionError, '{a}'.format_map, BadMapping()) 1351 1352 def test_format_huge_precision(self): 1353 format_string = ".{}f".format(sys.maxsize + 1) 1354 with self.assertRaises(ValueError): 1355 result = format(2.34, format_string) 1356 1357 def test_format_huge_width(self): 1358 format_string = "{}f".format(sys.maxsize + 1) 1359 with self.assertRaises(ValueError): 1360 result = format(2.34, format_string) 1361 1362 def 
test_format_huge_item_number(self): 1363 format_string = "{{{}:.6f}}".format(sys.maxsize + 1) 1364 with self.assertRaises(ValueError): 1365 result = format_string.format(2.34) 1366 1367 def test_format_auto_numbering(self): 1368 class C: 1369 def __init__(self, x=100): 1370 self._x = x 1371 def __format__(self, spec): 1372 return spec 1373 1374 self.assertEqual('{}'.format(10), '10') 1375 self.assertEqual('{:5}'.format('s'), 's ') 1376 self.assertEqual('{!r}'.format('s'), "'s'") 1377 self.assertEqual('{._x}'.format(C(10)), '10') 1378 self.assertEqual('{[1]}'.format([1, 2]), '2') 1379 self.assertEqual('{[a]}'.format({'a':4, 'b':2}), '4') 1380 self.assertEqual('a{}b{}c'.format(0, 1), 'a0b1c') 1381 1382 self.assertEqual('a{:{}}b'.format('x', '^10'), 'a x b') 1383 self.assertEqual('a{:{}x}b'.format(20, '#'), 'a0x14b') 1384 1385 # can't mix and match numbering and auto-numbering 1386 self.assertRaises(ValueError, '{}{1}'.format, 1, 2) 1387 self.assertRaises(ValueError, '{1}{}'.format, 1, 2) 1388 self.assertRaises(ValueError, '{:{1}}'.format, 1, 2) 1389 self.assertRaises(ValueError, '{0:{}}'.format, 1, 2) 1390 1391 # can mix and match auto-numbering and named 1392 self.assertEqual('{f}{}'.format(4, f='test'), 'test4') 1393 self.assertEqual('{}{f}'.format(4, f='test'), '4test') 1394 self.assertEqual('{:{f}}{g}{}'.format(1, 3, g='g', f=2), ' 1g3') 1395 self.assertEqual('{f:{}}{}{g}'.format(2, 4, f=1, g='g'), ' 14g') 1396 1397 def test_formatting(self): 1398 string_tests.MixinStrUnicodeUserStringTest.test_formatting(self) 1399 # Testing Unicode formatting strings... 
1400 self.assertEqual("%s, %s" % ("abc", "abc"), 'abc, abc') 1401 self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, 2, 3), 'abc, abc, 1, 2.000000, 3.00') 1402 self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, -2, 3), 'abc, abc, 1, -2.000000, 3.00') 1403 self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.5), 'abc, abc, -1, -2.000000, 3.50') 1404 self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.57), 'abc, abc, -1, -2.000000, 3.57') 1405 self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 1003.57), 'abc, abc, -1, -2.000000, 1003.57') 1406 if not sys.platform.startswith('java'): 1407 self.assertEqual("%r, %r" % (b"abc", "abc"), "b'abc', 'abc'") 1408 self.assertEqual("%r" % ("\u1234",), "'\u1234'") 1409 self.assertEqual("%a" % ("\u1234",), "'\\u1234'") 1410 self.assertEqual("%(x)s, %(y)s" % {'x':"abc", 'y':"def"}, 'abc, def') 1411 self.assertEqual("%(x)s, %(\xfc)s" % {'x':"abc", '\xfc':"def"}, 'abc, def') 1412 1413 self.assertEqual('%c' % 0x1234, '\u1234') 1414 self.assertEqual('%c' % 0x21483, '\U00021483') 1415 self.assertRaises(OverflowError, "%c".__mod__, (0x110000,)) 1416 self.assertEqual('%c' % '\U00021483', '\U00021483') 1417 self.assertRaises(TypeError, "%c".__mod__, "aa") 1418 self.assertRaises(ValueError, "%.1\u1032f".__mod__, (1.0/3)) 1419 self.assertRaises(TypeError, "%i".__mod__, "aa") 1420 1421 # formatting jobs delegated from the string implementation: 1422 self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...') 1423 self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...') 1424 self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...') 1425 self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...') 1426 self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...') 1427 self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...') 1428 self.assertEqual('...%s...%s...%s...%s...' 
% (1,2,3,"abc"), '...1...2...3...abc...') 1429 self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,"abc"), '...%...%s...1...2...3...abc...') 1430 self.assertEqual('...%s...' % "abc", '...abc...') 1431 self.assertEqual('%*s' % (5,'abc',), ' abc') 1432 self.assertEqual('%*s' % (-5,'abc',), 'abc ') 1433 self.assertEqual('%*.*s' % (5,2,'abc',), ' ab') 1434 self.assertEqual('%*.*s' % (5,3,'abc',), ' abc') 1435 self.assertEqual('%i %*.*s' % (10, 5,3,'abc',), '10 abc') 1436 self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, 'abc',), '103 abc') 1437 self.assertEqual('%c' % 'a', 'a') 1438 class Wrapper: 1439 def __str__(self): 1440 return '\u1234' 1441 self.assertEqual('%s' % Wrapper(), '\u1234') 1442 1443 # issue 3382 1444 NAN = float('nan') 1445 INF = float('inf') 1446 self.assertEqual('%f' % NAN, 'nan') 1447 self.assertEqual('%F' % NAN, 'NAN') 1448 self.assertEqual('%f' % INF, 'inf') 1449 self.assertEqual('%F' % INF, 'INF') 1450 1451 # PEP 393 1452 self.assertEqual('%.1s' % "a\xe9\u20ac", 'a') 1453 self.assertEqual('%.2s' % "a\xe9\u20ac", 'a\xe9') 1454 1455 #issue 19995 1456 class PseudoInt: 1457 def __init__(self, value): 1458 self.value = int(value) 1459 def __int__(self): 1460 return self.value 1461 def __index__(self): 1462 return self.value 1463 class PseudoFloat: 1464 def __init__(self, value): 1465 self.value = float(value) 1466 def __int__(self): 1467 return int(self.value) 1468 pi = PseudoFloat(3.1415) 1469 letter_m = PseudoInt(109) 1470 self.assertEqual('%x' % 42, '2a') 1471 self.assertEqual('%X' % 15, 'F') 1472 self.assertEqual('%o' % 9, '11') 1473 self.assertEqual('%c' % 109, 'm') 1474 self.assertEqual('%x' % letter_m, '6d') 1475 self.assertEqual('%X' % letter_m, '6D') 1476 self.assertEqual('%o' % letter_m, '155') 1477 self.assertEqual('%c' % letter_m, 'm') 1478 self.assertRaisesRegex(TypeError, '%x format: an integer is required, not float', operator.mod, '%x', 3.14), 1479 self.assertRaisesRegex(TypeError, '%X format: an integer is required, not 
def test_formatting_with_enum(self):
    # issue18780
    import enum
    class Float(float, enum.Enum):
        PI = 3.1415926
    class Int(enum.IntEnum):
        IDES = 15
    class Str(str, enum.Enum):
        ABC = 'abc'
    # Testing Unicode formatting strings...
    self.assertEqual("%s, %s" % (Str.ABC, Str.ABC),
                     'Str.ABC, Str.ABC')
    # %5.2f pads 3.14 to five columns, hence the two spaces before it.
    self.assertEqual("%s, %s, %d, %i, %u, %f, %5.2f" %
                     (Str.ABC, Str.ABC,
                      Int.IDES, Int.IDES, Int.IDES,
                      Float.PI, Float.PI),
                     'Str.ABC, Str.ABC, 15, 15, 15, 3.141593,  3.14')

    # formatting jobs delegated from the string implementation:
    self.assertEqual('...%(foo)s...' % {'foo':Str.ABC},
                     '...Str.ABC...')
    # NOTE(review): str() of IntEnum members differs between Python
    # versions -- confirm this expectation for the targeted release.
    self.assertEqual('...%(foo)s...' % {'foo':Int.IDES},
                     '...Int.IDES...')
    self.assertEqual('...%(foo)i...' % {'foo':Int.IDES},
                     '...15...')
    self.assertEqual('...%(foo)d...' % {'foo':Int.IDES},
                     '...15...')
    self.assertEqual('...%(foo)u...' % {'foo':Int.IDES, 'def':Float.PI},
                     '...15...')
    self.assertEqual('...%(foo)f...' % {'foo':Float.PI,'def':123},
                     '...3.141593...')

def test_formatting_huge_precision(self):
    # A %-format precision of sys.maxsize + 1 must raise ValueError.
    format_string = "%.{}f".format(sys.maxsize + 1)
    self.assertRaises(ValueError, operator.mod, format_string, 2.34)

def test_issue28598_strsubclass_rhs(self):
    # A subclass of str with an __rmod__ method should be able to hook
    # into the % operator
    class SubclassedStr(str):
        def __rmod__(self, other):
            return 'Success, self.__rmod__({!r}) was called'.format(other)
    self.assertEqual('lhs %% %r' % SubclassedStr('rhs'),
                     "Success, self.__rmod__('lhs %% %r') was called")

@support.cpython_only
def test_formatting_huge_precision_c_limits(self):
    # A %-format precision of INT_MAX + 1 must raise ValueError.
    from _testcapi import INT_MAX
    format_string = "%.{}f".format(INT_MAX + 1)
    self.assertRaises(ValueError, operator.mod, format_string, 2.34)

def test_formatting_huge_width(self):
    # A %-format width of sys.maxsize + 1 must raise ValueError.
    format_string = "%{}f".format(sys.maxsize + 1)
    self.assertRaises(ValueError, operator.mod, format_string, 2.34)

def test_startswith_endswith_errors(self):
    # Passing a list must raise TypeError, and the message must mention
    # both 'str' and 'tuple'.
    for meth in ('foo'.startswith, 'foo'.endswith):
        with self.assertRaises(TypeError) as cm:
            meth(['f'])
        exc = str(cm.exception)
        self.assertIn('str', exc)
        self.assertIn('tuple', exc)

@support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
def test_format_float(self):
    # should not format with a comma, but always with C locale
    self.assertEqual('1.0', '%.1f' % 1.0)
b'ascii') 1569 self.assertEqual(subclass.encode('utf-8'), b'ascii') 1570 1571 self.assertEqual( 1572 str('strings are converted to unicode'), 1573 'strings are converted to unicode' 1574 ) 1575 1576 class StringCompat: 1577 def __init__(self, x): 1578 self.x = x 1579 def __str__(self): 1580 return self.x 1581 1582 self.assertEqual( 1583 str(StringCompat('__str__ compatible objects are recognized')), 1584 '__str__ compatible objects are recognized' 1585 ) 1586 1587 # unicode(obj) is compatible to str(): 1588 1589 o = StringCompat('unicode(obj) is compatible to str()') 1590 self.assertEqual(str(o), 'unicode(obj) is compatible to str()') 1591 self.assertEqual(str(o), 'unicode(obj) is compatible to str()') 1592 1593 for obj in (123, 123.45, 123): 1594 self.assertEqual(str(obj), str(str(obj))) 1595 1596 # unicode(obj, encoding, error) tests (this maps to 1597 # PyUnicode_FromEncodedObject() at C level) 1598 1599 if not sys.platform.startswith('java'): 1600 self.assertRaises( 1601 TypeError, 1602 str, 1603 'decoding unicode is not supported', 1604 'utf-8', 1605 'strict' 1606 ) 1607 1608 self.assertEqual( 1609 str(b'strings are decoded to unicode', 'utf-8', 'strict'), 1610 'strings are decoded to unicode' 1611 ) 1612 1613 if not sys.platform.startswith('java'): 1614 self.assertEqual( 1615 str( 1616 memoryview(b'character buffers are decoded to unicode'), 1617 'utf-8', 1618 'strict' 1619 ), 1620 'character buffers are decoded to unicode' 1621 ) 1622 1623 self.assertRaises(TypeError, str, 42, 42, 42) 1624 1625 def test_constructor_keyword_args(self): 1626 """Pass various keyword argument combinations to the constructor.""" 1627 # The object argument can be passed as a keyword. 1628 self.assertEqual(str(object='foo'), 'foo') 1629 self.assertEqual(str(object=b'foo', encoding='utf-8'), 'foo') 1630 # The errors argument without encoding triggers "decode" mode. 
1631 self.assertEqual(str(b'foo', errors='strict'), 'foo') # not "b'foo'" 1632 self.assertEqual(str(object=b'foo', errors='strict'), 'foo') 1633 1634 def test_constructor_defaults(self): 1635 """Check the constructor argument defaults.""" 1636 # The object argument defaults to '' or b''. 1637 self.assertEqual(str(), '') 1638 self.assertEqual(str(errors='strict'), '') 1639 utf8_cent = '¢'.encode('utf-8') 1640 # The encoding argument defaults to utf-8. 1641 self.assertEqual(str(utf8_cent, errors='strict'), '¢') 1642 # The errors argument defaults to strict. 1643 self.assertRaises(UnicodeDecodeError, str, utf8_cent, encoding='ascii') 1644 1645 def test_codecs_utf7(self): 1646 utfTests = [ 1647 ('A\u2262\u0391.', b'A+ImIDkQ.'), # RFC2152 example 1648 ('Hi Mom -\u263a-!', b'Hi Mom -+Jjo--!'), # RFC2152 example 1649 ('\u65E5\u672C\u8A9E', b'+ZeVnLIqe-'), # RFC2152 example 1650 ('Item 3 is \u00a31.', b'Item 3 is +AKM-1.'), # RFC2152 example 1651 ('+', b'+-'), 1652 ('+-', b'+--'), 1653 ('+?', b'+-?'), 1654 (r'\?', b'+AFw?'), 1655 ('+?', b'+-?'), 1656 (r'\\?', b'+AFwAXA?'), 1657 (r'\\\?', b'+AFwAXABc?'), 1658 (r'++--', b'+-+---'), 1659 ('\U000abcde', b'+2m/c3g-'), # surrogate pairs 1660 ('/', b'/'), 1661 ] 1662 1663 for (x, y) in utfTests: 1664 self.assertEqual(x.encode('utf-7'), y) 1665 1666 # Unpaired surrogates are passed through 1667 self.assertEqual('\uD801'.encode('utf-7'), b'+2AE-') 1668 self.assertEqual('\uD801x'.encode('utf-7'), b'+2AE-x') 1669 self.assertEqual('\uDC01'.encode('utf-7'), b'+3AE-') 1670 self.assertEqual('\uDC01x'.encode('utf-7'), b'+3AE-x') 1671 self.assertEqual(b'+2AE-'.decode('utf-7'), '\uD801') 1672 self.assertEqual(b'+2AE-x'.decode('utf-7'), '\uD801x') 1673 self.assertEqual(b'+3AE-'.decode('utf-7'), '\uDC01') 1674 self.assertEqual(b'+3AE-x'.decode('utf-7'), '\uDC01x') 1675 1676 self.assertEqual('\uD801\U000abcde'.encode('utf-7'), b'+2AHab9ze-') 1677 self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde') 1678 1679 # Issue #2242: crash 
on some Windows/MSVC versions 1680 self.assertEqual(b'+\xc1'.decode('utf-7', 'ignore'), '') 1681 1682 # Direct encoded characters 1683 set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?" 1684 # Optional direct characters 1685 set_o = '!"#$%&*;<=>@[]^_`{|}' 1686 for c in set_d: 1687 self.assertEqual(c.encode('utf7'), c.encode('ascii')) 1688 self.assertEqual(c.encode('ascii').decode('utf7'), c) 1689 for c in set_o: 1690 self.assertEqual(c.encode('ascii').decode('utf7'), c) 1691 1692 with self.assertRaisesRegex(UnicodeDecodeError, 1693 'ill-formed sequence'): 1694 b'+@'.decode('utf-7') 1695 1696 def test_codecs_utf8(self): 1697 self.assertEqual(''.encode('utf-8'), b'') 1698 self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac') 1699 self.assertEqual('\U00010002'.encode('utf-8'), b'\xf0\x90\x80\x82') 1700 self.assertEqual('\U00023456'.encode('utf-8'), b'\xf0\xa3\x91\x96') 1701 self.assertEqual('\ud800'.encode('utf-8', 'surrogatepass'), b'\xed\xa0\x80') 1702 self.assertEqual('\udc00'.encode('utf-8', 'surrogatepass'), b'\xed\xb0\x80') 1703 self.assertEqual(('\U00010002'*10).encode('utf-8'), 1704 b'\xf0\x90\x80\x82'*10) 1705 self.assertEqual( 1706 '\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f' 1707 '\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00' 1708 '\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c' 1709 '\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067' 1710 '\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das' 1711 ' Nunstuck git und'.encode('utf-8'), 1712 b'\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81' 1713 b'\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3' 1714 b'\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe' 1715 b'\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83' 1716 b'\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8' 1717 b'\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81' 1718 b'\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81' 
1719 b'\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3' 1720 b'\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf' 1721 b'\xe3\x80\x8cWenn ist das Nunstuck git und' 1722 ) 1723 1724 # UTF-8 specific decoding tests 1725 self.assertEqual(str(b'\xf0\xa3\x91\x96', 'utf-8'), '\U00023456' ) 1726 self.assertEqual(str(b'\xf0\x90\x80\x82', 'utf-8'), '\U00010002' ) 1727 self.assertEqual(str(b'\xe2\x82\xac', 'utf-8'), '\u20ac' ) 1728 1729 # Other possible utf-8 test cases: 1730 # * strict decoding testing for all of the 1731 # UTF8_ERROR cases in PyUnicode_DecodeUTF8 1732 1733 def test_utf8_decode_valid_sequences(self): 1734 sequences = [ 1735 # single byte 1736 (b'\x00', '\x00'), (b'a', 'a'), (b'\x7f', '\x7f'), 1737 # 2 bytes 1738 (b'\xc2\x80', '\x80'), (b'\xdf\xbf', '\u07ff'), 1739 # 3 bytes 1740 (b'\xe0\xa0\x80', '\u0800'), (b'\xed\x9f\xbf', '\ud7ff'), 1741 (b'\xee\x80\x80', '\uE000'), (b'\xef\xbf\xbf', '\uffff'), 1742 # 4 bytes 1743 (b'\xF0\x90\x80\x80', '\U00010000'), 1744 (b'\xf4\x8f\xbf\xbf', '\U0010FFFF') 1745 ] 1746 for seq, res in sequences: 1747 self.assertEqual(seq.decode('utf-8'), res) 1748 1749 1750 def test_utf8_decode_invalid_sequences(self): 1751 # continuation bytes in a sequence of 2, 3, or 4 bytes 1752 continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)] 1753 # start bytes of a 2-byte sequence equivalent to code points < 0x7F 1754 invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)] 1755 # start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF 1756 invalid_4B_seq_start_bytes = [bytes([x]) for x in range(0xF5, 0xF8)] 1757 invalid_start_bytes = ( 1758 continuation_bytes + invalid_2B_seq_start_bytes + 1759 invalid_4B_seq_start_bytes + [bytes([x]) for x in range(0xF7, 0x100)] 1760 ) 1761 1762 for byte in invalid_start_bytes: 1763 self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8') 1764 1765 for sb in invalid_2B_seq_start_bytes: 1766 for cb in continuation_bytes: 1767 
self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8') 1768 1769 for sb in invalid_4B_seq_start_bytes: 1770 for cb1 in continuation_bytes[:3]: 1771 for cb3 in continuation_bytes[:3]: 1772 self.assertRaises(UnicodeDecodeError, 1773 (sb+cb1+b'\x80'+cb3).decode, 'utf-8') 1774 1775 for cb in [bytes([x]) for x in range(0x80, 0xA0)]: 1776 self.assertRaises(UnicodeDecodeError, 1777 (b'\xE0'+cb+b'\x80').decode, 'utf-8') 1778 self.assertRaises(UnicodeDecodeError, 1779 (b'\xE0'+cb+b'\xBF').decode, 'utf-8') 1780 # surrogates 1781 for cb in [bytes([x]) for x in range(0xA0, 0xC0)]: 1782 self.assertRaises(UnicodeDecodeError, 1783 (b'\xED'+cb+b'\x80').decode, 'utf-8') 1784 self.assertRaises(UnicodeDecodeError, 1785 (b'\xED'+cb+b'\xBF').decode, 'utf-8') 1786 for cb in [bytes([x]) for x in range(0x80, 0x90)]: 1787 self.assertRaises(UnicodeDecodeError, 1788 (b'\xF0'+cb+b'\x80\x80').decode, 'utf-8') 1789 self.assertRaises(UnicodeDecodeError, 1790 (b'\xF0'+cb+b'\xBF\xBF').decode, 'utf-8') 1791 for cb in [bytes([x]) for x in range(0x90, 0xC0)]: 1792 self.assertRaises(UnicodeDecodeError, 1793 (b'\xF4'+cb+b'\x80\x80').decode, 'utf-8') 1794 self.assertRaises(UnicodeDecodeError, 1795 (b'\xF4'+cb+b'\xBF\xBF').decode, 'utf-8') 1796 1797 def test_issue8271(self): 1798 # Issue #8271: during the decoding of an invalid UTF-8 byte sequence, 1799 # only the start byte and the continuation byte(s) are now considered 1800 # invalid, instead of the number of bytes specified by the start byte. 1801 # See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95, 1802 # table 3-8, Row 2) for more information about the algorithm used. 
1803 FFFD = '\ufffd' 1804 sequences = [ 1805 # invalid start bytes 1806 (b'\x80', FFFD), # continuation byte 1807 (b'\x80\x80', FFFD*2), # 2 continuation bytes 1808 (b'\xc0', FFFD), 1809 (b'\xc0\xc0', FFFD*2), 1810 (b'\xc1', FFFD), 1811 (b'\xc1\xc0', FFFD*2), 1812 (b'\xc0\xc1', FFFD*2), 1813 # with start byte of a 2-byte sequence 1814 (b'\xc2', FFFD), # only the start byte 1815 (b'\xc2\xc2', FFFD*2), # 2 start bytes 1816 (b'\xc2\xc2\xc2', FFFD*3), # 3 start bytes 1817 (b'\xc2\x41', FFFD+'A'), # invalid continuation byte 1818 # with start byte of a 3-byte sequence 1819 (b'\xe1', FFFD), # only the start byte 1820 (b'\xe1\xe1', FFFD*2), # 2 start bytes 1821 (b'\xe1\xe1\xe1', FFFD*3), # 3 start bytes 1822 (b'\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes 1823 (b'\xe1\x80', FFFD), # only 1 continuation byte 1824 (b'\xe1\x41', FFFD+'A'), # invalid continuation byte 1825 (b'\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb 1826 (b'\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes 1827 (b'\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte 1828 (b'\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid 1829 (b'\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid 1830 # with start byte of a 4-byte sequence 1831 (b'\xf1', FFFD), # only the start byte 1832 (b'\xf1\xf1', FFFD*2), # 2 start bytes 1833 (b'\xf1\xf1\xf1', FFFD*3), # 3 start bytes 1834 (b'\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes 1835 (b'\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes 1836 (b'\xf1\x80', FFFD), # only 1 continuation bytes 1837 (b'\xf1\x80\x80', FFFD), # only 2 continuation bytes 1838 (b'\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid 1839 (b'\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 1 invalid 1840 (b'\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid 1841 (b'\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cv and 1 valid 1842 (b'\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 invalid 1843 (b'\xf1\x41\x80\x41', 
FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 invalid 1844 (b'\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 1 valid cb and 1 invalid 1845 (b'\xf1\x41\xf1\x80', FFFD+'A'+FFFD), 1846 (b'\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2), 1847 (b'\xf1\xf1\x80\x41', FFFD*2+'A'), 1848 (b'\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2), 1849 # with invalid start byte of a 4-byte sequence (rfc2279) 1850 (b'\xf5', FFFD), # only the start byte 1851 (b'\xf5\xf5', FFFD*2), # 2 start bytes 1852 (b'\xf5\x80', FFFD*2), # only 1 continuation byte 1853 (b'\xf5\x80\x80', FFFD*3), # only 2 continuation byte 1854 (b'\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes 1855 (b'\xf5\x80\x41', FFFD*2+'A'), # 1 valid cb and 1 invalid 1856 (b'\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD), 1857 (b'\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'), 1858 # with invalid start byte of a 5-byte sequence (rfc2279) 1859 (b'\xf8', FFFD), # only the start byte 1860 (b'\xf8\xf8', FFFD*2), # 2 start bytes 1861 (b'\xf8\x80', FFFD*2), # only one continuation byte 1862 (b'\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid 1863 (b'\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes 1864 # with invalid start byte of a 6-byte sequence (rfc2279) 1865 (b'\xfc', FFFD), # only the start byte 1866 (b'\xfc\xfc', FFFD*2), # 2 start bytes 1867 (b'\xfc\x80\x80', FFFD*3), # only 2 continuation bytes 1868 (b'\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes 1869 # invalid start byte 1870 (b'\xfe', FFFD), 1871 (b'\xfe\x80\x80', FFFD*3), 1872 # other sequences 1873 (b'\xf1\x80\x41\x42\x43', '\ufffd\x41\x42\x43'), 1874 (b'\xf1\x80\xff\x42\x43', '\ufffd\ufffd\x42\x43'), 1875 (b'\xf1\x80\xc2\x81\x43', '\ufffd\x81\x43'), 1876 (b'\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64', 1877 '\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'), 1878 ] 1879 for n, (seq, res) in enumerate(sequences): 1880 self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict') 1881 self.assertEqual(seq.decode('utf-8', 'replace'), res) 1882 
self.assertEqual((seq+b'b').decode('utf-8', 'replace'), res+'b') 1883 self.assertEqual(seq.decode('utf-8', 'ignore'), 1884 res.replace('\uFFFD', '')) 1885 1886 def assertCorrectUTF8Decoding(self, seq, res, err): 1887 """ 1888 Check that an invalid UTF-8 sequence raises a UnicodeDecodeError when 1889 'strict' is used, returns res when 'replace' is used, and that doesn't 1890 return anything when 'ignore' is used. 1891 """ 1892 with self.assertRaises(UnicodeDecodeError) as cm: 1893 seq.decode('utf-8') 1894 exc = cm.exception 1895 1896 self.assertIn(err, str(exc)) 1897 self.assertEqual(seq.decode('utf-8', 'replace'), res) 1898 self.assertEqual((b'aaaa' + seq + b'bbbb').decode('utf-8', 'replace'), 1899 'aaaa' + res + 'bbbb') 1900 res = res.replace('\ufffd', '') 1901 self.assertEqual(seq.decode('utf-8', 'ignore'), res) 1902 self.assertEqual((b'aaaa' + seq + b'bbbb').decode('utf-8', 'ignore'), 1903 'aaaa' + res + 'bbbb') 1904 1905 def test_invalid_start_byte(self): 1906 """ 1907 Test that an 'invalid start byte' error is raised when the first byte 1908 is not in the ASCII range or is not a valid start byte of a 2-, 3-, or 1909 4-bytes sequence. The invalid start byte is replaced with a single 1910 U+FFFD when errors='replace'. 1911 E.g. <80> is a continuation byte and can appear only after a start byte. 1912 """ 1913 FFFD = '\ufffd' 1914 for byte in b'\x80\xA0\x9F\xBF\xC0\xC1\xF5\xFF': 1915 self.assertCorrectUTF8Decoding(bytes([byte]), '\ufffd', 1916 'invalid start byte') 1917 1918 def test_unexpected_end_of_data(self): 1919 """ 1920 Test that an 'unexpected end of data' error is raised when the string 1921 ends after a start byte of a 2-, 3-, or 4-bytes sequence without having 1922 enough continuation bytes. The incomplete sequence is replaced with a 1923 single U+FFFD when errors='replace'. 1924 E.g. 
in the sequence <F3 80 80>, F3 is the start byte of a 4-bytes 1925 sequence, but it's followed by only 2 valid continuation bytes and the 1926 last continuation bytes is missing. 1927 Note: the continuation bytes must be all valid, if one of them is 1928 invalid another error will be raised. 1929 """ 1930 sequences = [ 1931 'C2', 'DF', 1932 'E0 A0', 'E0 BF', 'E1 80', 'E1 BF', 'EC 80', 'EC BF', 1933 'ED 80', 'ED 9F', 'EE 80', 'EE BF', 'EF 80', 'EF BF', 1934 'F0 90', 'F0 BF', 'F0 90 80', 'F0 90 BF', 'F0 BF 80', 'F0 BF BF', 1935 'F1 80', 'F1 BF', 'F1 80 80', 'F1 80 BF', 'F1 BF 80', 'F1 BF BF', 1936 'F3 80', 'F3 BF', 'F3 80 80', 'F3 80 BF', 'F3 BF 80', 'F3 BF BF', 1937 'F4 80', 'F4 8F', 'F4 80 80', 'F4 80 BF', 'F4 8F 80', 'F4 8F BF' 1938 ] 1939 FFFD = '\ufffd' 1940 for seq in sequences: 1941 self.assertCorrectUTF8Decoding(bytes.fromhex(seq), '\ufffd', 1942 'unexpected end of data') 1943 1944 def test_invalid_cb_for_2bytes_seq(self): 1945 """ 1946 Test that an 'invalid continuation byte' error is raised when the 1947 continuation byte of a 2-bytes sequence is invalid. The start byte 1948 is replaced by a single U+FFFD and the second byte is handled 1949 separately when errors='replace'. 1950 E.g. in the sequence <C2 41>, C2 is the start byte of a 2-bytes 1951 sequence, but 41 is not a valid continuation byte because it's the 1952 ASCII letter 'A'. 1953 """ 1954 FFFD = '\ufffd' 1955 FFFDx2 = FFFD * 2 1956 sequences = [ 1957 ('C2 00', FFFD+'\x00'), ('C2 7F', FFFD+'\x7f'), 1958 ('C2 C0', FFFDx2), ('C2 FF', FFFDx2), 1959 ('DF 00', FFFD+'\x00'), ('DF 7F', FFFD+'\x7f'), 1960 ('DF C0', FFFDx2), ('DF FF', FFFDx2), 1961 ] 1962 for seq, res in sequences: 1963 self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res, 1964 'invalid continuation byte') 1965 1966 def test_invalid_cb_for_3bytes_seq(self): 1967 """ 1968 Test that an 'invalid continuation byte' error is raised when the 1969 continuation byte(s) of a 3-bytes sequence are invalid. 
When 1970 errors='replace', if the first continuation byte is valid, the first 1971 two bytes (start byte + 1st cb) are replaced by a single U+FFFD and the 1972 third byte is handled separately, otherwise only the start byte is 1973 replaced with a U+FFFD and the other continuation bytes are handled 1974 separately. 1975 E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes 1976 sequence, 80 is a valid continuation byte, but 41 is not a valid cb 1977 because it's the ASCII letter 'A'. 1978 Note: when the start byte is E0 or ED, the valid ranges for the first 1979 continuation byte are limited to A0..BF and 80..9F respectively. 1980 Python 2 used to consider all the bytes in range 80..BF valid when the 1981 start byte was ED. This is fixed in Python 3. 1982 """ 1983 FFFD = '\ufffd' 1984 FFFDx2 = FFFD * 2 1985 sequences = [ 1986 ('E0 00', FFFD+'\x00'), ('E0 7F', FFFD+'\x7f'), ('E0 80', FFFDx2), 1987 ('E0 9F', FFFDx2), ('E0 C0', FFFDx2), ('E0 FF', FFFDx2), 1988 ('E0 A0 00', FFFD+'\x00'), ('E0 A0 7F', FFFD+'\x7f'), 1989 ('E0 A0 C0', FFFDx2), ('E0 A0 FF', FFFDx2), 1990 ('E0 BF 00', FFFD+'\x00'), ('E0 BF 7F', FFFD+'\x7f'), 1991 ('E0 BF C0', FFFDx2), ('E0 BF FF', FFFDx2), ('E1 00', FFFD+'\x00'), 1992 ('E1 7F', FFFD+'\x7f'), ('E1 C0', FFFDx2), ('E1 FF', FFFDx2), 1993 ('E1 80 00', FFFD+'\x00'), ('E1 80 7F', FFFD+'\x7f'), 1994 ('E1 80 C0', FFFDx2), ('E1 80 FF', FFFDx2), 1995 ('E1 BF 00', FFFD+'\x00'), ('E1 BF 7F', FFFD+'\x7f'), 1996 ('E1 BF C0', FFFDx2), ('E1 BF FF', FFFDx2), ('EC 00', FFFD+'\x00'), 1997 ('EC 7F', FFFD+'\x7f'), ('EC C0', FFFDx2), ('EC FF', FFFDx2), 1998 ('EC 80 00', FFFD+'\x00'), ('EC 80 7F', FFFD+'\x7f'), 1999 ('EC 80 C0', FFFDx2), ('EC 80 FF', FFFDx2), 2000 ('EC BF 00', FFFD+'\x00'), ('EC BF 7F', FFFD+'\x7f'), 2001 ('EC BF C0', FFFDx2), ('EC BF FF', FFFDx2), ('ED 00', FFFD+'\x00'), 2002 ('ED 7F', FFFD+'\x7f'), 2003 ('ED A0', FFFDx2), ('ED BF', FFFDx2), # see note ^ 2004 ('ED C0', FFFDx2), ('ED FF', FFFDx2), ('ED 80 00', FFFD+'\x00'), 2005 ('ED 
80 7F', FFFD+'\x7f'), ('ED 80 C0', FFFDx2), 2006 ('ED 80 FF', FFFDx2), ('ED 9F 00', FFFD+'\x00'), 2007 ('ED 9F 7F', FFFD+'\x7f'), ('ED 9F C0', FFFDx2), 2008 ('ED 9F FF', FFFDx2), ('EE 00', FFFD+'\x00'), 2009 ('EE 7F', FFFD+'\x7f'), ('EE C0', FFFDx2), ('EE FF', FFFDx2), 2010 ('EE 80 00', FFFD+'\x00'), ('EE 80 7F', FFFD+'\x7f'), 2011 ('EE 80 C0', FFFDx2), ('EE 80 FF', FFFDx2), 2012 ('EE BF 00', FFFD+'\x00'), ('EE BF 7F', FFFD+'\x7f'), 2013 ('EE BF C0', FFFDx2), ('EE BF FF', FFFDx2), ('EF 00', FFFD+'\x00'), 2014 ('EF 7F', FFFD+'\x7f'), ('EF C0', FFFDx2), ('EF FF', FFFDx2), 2015 ('EF 80 00', FFFD+'\x00'), ('EF 80 7F', FFFD+'\x7f'), 2016 ('EF 80 C0', FFFDx2), ('EF 80 FF', FFFDx2), 2017 ('EF BF 00', FFFD+'\x00'), ('EF BF 7F', FFFD+'\x7f'), 2018 ('EF BF C0', FFFDx2), ('EF BF FF', FFFDx2), 2019 ] 2020 for seq, res in sequences: 2021 self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res, 2022 'invalid continuation byte') 2023 2024 def test_invalid_cb_for_4bytes_seq(self): 2025 """ 2026 Test that an 'invalid continuation byte' error is raised when the 2027 continuation byte(s) of a 4-bytes sequence are invalid. When 2028 errors='replace',the start byte and all the following valid 2029 continuation bytes are replaced with a single U+FFFD, and all the bytes 2030 starting from the first invalid continuation bytes (included) are 2031 handled separately. 2032 E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes 2033 sequence, 80 is a valid continuation byte, but 41 is not a valid cb 2034 because it's the ASCII letter 'A'. 2035 Note: when the start byte is E0 or ED, the valid ranges for the first 2036 continuation byte are limited to A0..BF and 80..9F respectively. 2037 However, when the start byte is ED, Python 2 considers all the bytes 2038 in range 80..BF valid. This is fixed in Python 3. 
2039 """ 2040 FFFD = '\ufffd' 2041 FFFDx2 = FFFD * 2 2042 sequences = [ 2043 ('F0 00', FFFD+'\x00'), ('F0 7F', FFFD+'\x7f'), ('F0 80', FFFDx2), 2044 ('F0 8F', FFFDx2), ('F0 C0', FFFDx2), ('F0 FF', FFFDx2), 2045 ('F0 90 00', FFFD+'\x00'), ('F0 90 7F', FFFD+'\x7f'), 2046 ('F0 90 C0', FFFDx2), ('F0 90 FF', FFFDx2), 2047 ('F0 BF 00', FFFD+'\x00'), ('F0 BF 7F', FFFD+'\x7f'), 2048 ('F0 BF C0', FFFDx2), ('F0 BF FF', FFFDx2), 2049 ('F0 90 80 00', FFFD+'\x00'), ('F0 90 80 7F', FFFD+'\x7f'), 2050 ('F0 90 80 C0', FFFDx2), ('F0 90 80 FF', FFFDx2), 2051 ('F0 90 BF 00', FFFD+'\x00'), ('F0 90 BF 7F', FFFD+'\x7f'), 2052 ('F0 90 BF C0', FFFDx2), ('F0 90 BF FF', FFFDx2), 2053 ('F0 BF 80 00', FFFD+'\x00'), ('F0 BF 80 7F', FFFD+'\x7f'), 2054 ('F0 BF 80 C0', FFFDx2), ('F0 BF 80 FF', FFFDx2), 2055 ('F0 BF BF 00', FFFD+'\x00'), ('F0 BF BF 7F', FFFD+'\x7f'), 2056 ('F0 BF BF C0', FFFDx2), ('F0 BF BF FF', FFFDx2), 2057 ('F1 00', FFFD+'\x00'), ('F1 7F', FFFD+'\x7f'), ('F1 C0', FFFDx2), 2058 ('F1 FF', FFFDx2), ('F1 80 00', FFFD+'\x00'), 2059 ('F1 80 7F', FFFD+'\x7f'), ('F1 80 C0', FFFDx2), 2060 ('F1 80 FF', FFFDx2), ('F1 BF 00', FFFD+'\x00'), 2061 ('F1 BF 7F', FFFD+'\x7f'), ('F1 BF C0', FFFDx2), 2062 ('F1 BF FF', FFFDx2), ('F1 80 80 00', FFFD+'\x00'), 2063 ('F1 80 80 7F', FFFD+'\x7f'), ('F1 80 80 C0', FFFDx2), 2064 ('F1 80 80 FF', FFFDx2), ('F1 80 BF 00', FFFD+'\x00'), 2065 ('F1 80 BF 7F', FFFD+'\x7f'), ('F1 80 BF C0', FFFDx2), 2066 ('F1 80 BF FF', FFFDx2), ('F1 BF 80 00', FFFD+'\x00'), 2067 ('F1 BF 80 7F', FFFD+'\x7f'), ('F1 BF 80 C0', FFFDx2), 2068 ('F1 BF 80 FF', FFFDx2), ('F1 BF BF 00', FFFD+'\x00'), 2069 ('F1 BF BF 7F', FFFD+'\x7f'), ('F1 BF BF C0', FFFDx2), 2070 ('F1 BF BF FF', FFFDx2), ('F3 00', FFFD+'\x00'), 2071 ('F3 7F', FFFD+'\x7f'), ('F3 C0', FFFDx2), ('F3 FF', FFFDx2), 2072 ('F3 80 00', FFFD+'\x00'), ('F3 80 7F', FFFD+'\x7f'), 2073 ('F3 80 C0', FFFDx2), ('F3 80 FF', FFFDx2), 2074 ('F3 BF 00', FFFD+'\x00'), ('F3 BF 7F', FFFD+'\x7f'), 2075 ('F3 BF C0', FFFDx2), ('F3 BF FF', 
FFFDx2), 2076 ('F3 80 80 00', FFFD+'\x00'), ('F3 80 80 7F', FFFD+'\x7f'), 2077 ('F3 80 80 C0', FFFDx2), ('F3 80 80 FF', FFFDx2), 2078 ('F3 80 BF 00', FFFD+'\x00'), ('F3 80 BF 7F', FFFD+'\x7f'), 2079 ('F3 80 BF C0', FFFDx2), ('F3 80 BF FF', FFFDx2), 2080 ('F3 BF 80 00', FFFD+'\x00'), ('F3 BF 80 7F', FFFD+'\x7f'), 2081 ('F3 BF 80 C0', FFFDx2), ('F3 BF 80 FF', FFFDx2), 2082 ('F3 BF BF 00', FFFD+'\x00'), ('F3 BF BF 7F', FFFD+'\x7f'), 2083 ('F3 BF BF C0', FFFDx2), ('F3 BF BF FF', FFFDx2), 2084 ('F4 00', FFFD+'\x00'), ('F4 7F', FFFD+'\x7f'), ('F4 90', FFFDx2), 2085 ('F4 BF', FFFDx2), ('F4 C0', FFFDx2), ('F4 FF', FFFDx2), 2086 ('F4 80 00', FFFD+'\x00'), ('F4 80 7F', FFFD+'\x7f'), 2087 ('F4 80 C0', FFFDx2), ('F4 80 FF', FFFDx2), 2088 ('F4 8F 00', FFFD+'\x00'), ('F4 8F 7F', FFFD+'\x7f'), 2089 ('F4 8F C0', FFFDx2), ('F4 8F FF', FFFDx2), 2090 ('F4 80 80 00', FFFD+'\x00'), ('F4 80 80 7F', FFFD+'\x7f'), 2091 ('F4 80 80 C0', FFFDx2), ('F4 80 80 FF', FFFDx2), 2092 ('F4 80 BF 00', FFFD+'\x00'), ('F4 80 BF 7F', FFFD+'\x7f'), 2093 ('F4 80 BF C0', FFFDx2), ('F4 80 BF FF', FFFDx2), 2094 ('F4 8F 80 00', FFFD+'\x00'), ('F4 8F 80 7F', FFFD+'\x7f'), 2095 ('F4 8F 80 C0', FFFDx2), ('F4 8F 80 FF', FFFDx2), 2096 ('F4 8F BF 00', FFFD+'\x00'), ('F4 8F BF 7F', FFFD+'\x7f'), 2097 ('F4 8F BF C0', FFFDx2), ('F4 8F BF FF', FFFDx2) 2098 ] 2099 for seq, res in sequences: 2100 self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res, 2101 'invalid continuation byte') 2102 2103 def test_codecs_idna(self): 2104 # Test whether trailing dot is preserved 2105 self.assertEqual("www.python.org.".encode("idna"), b"www.python.org.") 2106 2107 def test_codecs_errors(self): 2108 # Error handling (encoding) 2109 self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii') 2110 self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii','strict') 2111 self.assertEqual('Andr\202 x'.encode('ascii','ignore'), b"Andr x") 2112 self.assertEqual('Andr\202 x'.encode('ascii','replace'), b"Andr? 
x") 2113 self.assertEqual('Andr\202 x'.encode('ascii', 'replace'), 2114 'Andr\202 x'.encode('ascii', errors='replace')) 2115 self.assertEqual('Andr\202 x'.encode('ascii', 'ignore'), 2116 'Andr\202 x'.encode(encoding='ascii', errors='ignore')) 2117 2118 # Error handling (decoding) 2119 self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii') 2120 self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii', 'strict') 2121 self.assertEqual(str(b'Andr\202 x', 'ascii', 'ignore'), "Andr x") 2122 self.assertEqual(str(b'Andr\202 x', 'ascii', 'replace'), 'Andr\uFFFD x') 2123 self.assertEqual(str(b'\202 x', 'ascii', 'replace'), '\uFFFD x') 2124 2125 # Error handling (unknown character names) 2126 self.assertEqual(b"\\N{foo}xx".decode("unicode-escape", "ignore"), "xx") 2127 2128 # Error handling (truncated escape sequence) 2129 self.assertRaises(UnicodeError, b"\\".decode, "unicode-escape") 2130 2131 self.assertRaises(TypeError, b"hello".decode, "test.unicode1") 2132 self.assertRaises(TypeError, str, b"hello", "test.unicode2") 2133 self.assertRaises(TypeError, "hello".encode, "test.unicode1") 2134 self.assertRaises(TypeError, "hello".encode, "test.unicode2") 2135 2136 # Error handling (wrong arguments) 2137 self.assertRaises(TypeError, "hello".encode, 42, 42, 42) 2138 2139 # Error handling (lone surrogate in 2140 # _PyUnicode_TransformDecimalAndSpaceToASCII()) 2141 self.assertRaises(ValueError, int, "\ud800") 2142 self.assertRaises(ValueError, int, "\udf00") 2143 self.assertRaises(ValueError, float, "\ud800") 2144 self.assertRaises(ValueError, float, "\udf00") 2145 self.assertRaises(ValueError, complex, "\ud800") 2146 self.assertRaises(ValueError, complex, "\udf00") 2147 2148 def test_codecs(self): 2149 # Encoding 2150 self.assertEqual('hello'.encode('ascii'), b'hello') 2151 self.assertEqual('hello'.encode('utf-7'), b'hello') 2152 self.assertEqual('hello'.encode('utf-8'), b'hello') 2153 self.assertEqual('hello'.encode('utf-8'), b'hello') 2154 
self.assertEqual('hello'.encode('utf-16-le'), b'h\000e\000l\000l\000o\000') 2155 self.assertEqual('hello'.encode('utf-16-be'), b'\000h\000e\000l\000l\000o') 2156 self.assertEqual('hello'.encode('latin-1'), b'hello') 2157 2158 # Default encoding is utf-8 2159 self.assertEqual('\u2603'.encode(), b'\xe2\x98\x83') 2160 2161 # Roundtrip safety for BMP (just the first 1024 chars) 2162 for c in range(1024): 2163 u = chr(c) 2164 for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le', 2165 'utf-16-be', 'raw_unicode_escape', 2166 'unicode_escape'): 2167 self.assertEqual(str(u.encode(encoding),encoding), u) 2168 2169 # Roundtrip safety for BMP (just the first 256 chars) 2170 for c in range(256): 2171 u = chr(c) 2172 for encoding in ('latin-1',): 2173 self.assertEqual(str(u.encode(encoding),encoding), u) 2174 2175 # Roundtrip safety for BMP (just the first 128 chars) 2176 for c in range(128): 2177 u = chr(c) 2178 for encoding in ('ascii',): 2179 self.assertEqual(str(u.encode(encoding),encoding), u) 2180 2181 # Roundtrip safety for non-BMP (just a few chars) 2182 with warnings.catch_warnings(): 2183 u = '\U00010001\U00020002\U00030003\U00040004\U00050005' 2184 for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be', 2185 'raw_unicode_escape', 'unicode_escape'): 2186 self.assertEqual(str(u.encode(encoding),encoding), u) 2187 2188 # UTF-8 must be roundtrip safe for all code points 2189 # (except surrogates, which are forbidden). 
2190 u = ''.join(map(chr, list(range(0, 0xd800)) + 2191 list(range(0xe000, 0x110000)))) 2192 for encoding in ('utf-8',): 2193 self.assertEqual(str(u.encode(encoding),encoding), u) 2194 2195 def test_codecs_charmap(self): 2196 # 0-127 2197 s = bytes(range(128)) 2198 for encoding in ( 2199 'cp037', 'cp1026', 'cp273', 2200 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850', 2201 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862', 2202 'cp863', 'cp865', 'cp866', 'cp1125', 2203 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15', 2204 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6', 2205 'iso8859_7', 'iso8859_9', 2206 'koi8_r', 'koi8_t', 'koi8_u', 'kz1048', 'latin_1', 2207 'mac_cyrillic', 'mac_latin2', 2208 2209 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255', 2210 'cp1256', 'cp1257', 'cp1258', 2211 'cp856', 'cp857', 'cp864', 'cp869', 'cp874', 2212 2213 'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish', 2214 'cp1006', 'iso8859_8', 2215 2216 ### These have undefined mappings: 2217 #'cp424', 2218 2219 ### These fail the round-trip: 2220 #'cp875' 2221 2222 ): 2223 self.assertEqual(str(s, encoding).encode(encoding), s) 2224 2225 # 128-255 2226 s = bytes(range(128, 256)) 2227 for encoding in ( 2228 'cp037', 'cp1026', 'cp273', 2229 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850', 2230 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862', 2231 'cp863', 'cp865', 'cp866', 'cp1125', 2232 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15', 2233 'iso8859_2', 'iso8859_4', 'iso8859_5', 2234 'iso8859_9', 'koi8_r', 'koi8_u', 'latin_1', 2235 'mac_cyrillic', 'mac_latin2', 2236 2237 ### These have undefined mappings: 2238 #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255', 2239 #'cp1256', 'cp1257', 'cp1258', 2240 #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874', 2241 #'iso8859_3', 'iso8859_6', 'iso8859_7', 'koi8_t', 'kz1048', 2242 #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish', 2243 2244 ### These fail the round-trip: 
2245 #'cp1006', 'cp875', 'iso8859_8', 2246 2247 ): 2248 self.assertEqual(str(s, encoding).encode(encoding), s) 2249 2250 def test_concatenation(self): 2251 self.assertEqual(("abc" "def"), "abcdef") 2252 self.assertEqual(("abc" "def"), "abcdef") 2253 self.assertEqual(("abc" "def"), "abcdef") 2254 self.assertEqual(("abc" "def" "ghi"), "abcdefghi") 2255 self.assertEqual(("abc" "def" "ghi"), "abcdefghi") 2256 2257 def test_ucs4(self): 2258 x = '\U00100000' 2259 y = x.encode("raw-unicode-escape").decode("raw-unicode-escape") 2260 self.assertEqual(x, y) 2261 2262 y = br'\U00100000' 2263 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape") 2264 self.assertEqual(x, y) 2265 y = br'\U00010000' 2266 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape") 2267 self.assertEqual(x, y) 2268 2269 try: 2270 br'\U11111111'.decode("raw-unicode-escape") 2271 except UnicodeDecodeError as e: 2272 self.assertEqual(e.start, 0) 2273 self.assertEqual(e.end, 10) 2274 else: 2275 self.fail("Should have raised UnicodeDecodeError") 2276 2277 def test_conversion(self): 2278 # Make sure __str__() works properly 2279 class ObjectToStr: 2280 def __str__(self): 2281 return "foo" 2282 2283 class StrSubclassToStr(str): 2284 def __str__(self): 2285 return "foo" 2286 2287 class StrSubclassToStrSubclass(str): 2288 def __new__(cls, content=""): 2289 return str.__new__(cls, 2*content) 2290 def __str__(self): 2291 return self 2292 2293 self.assertEqual(str(ObjectToStr()), "foo") 2294 self.assertEqual(str(StrSubclassToStr("bar")), "foo") 2295 s = str(StrSubclassToStrSubclass("foo")) 2296 self.assertEqual(s, "foofoo") 2297 self.assertIs(type(s), StrSubclassToStrSubclass) 2298 s = StrSubclass(StrSubclassToStrSubclass("foo")) 2299 self.assertEqual(s, "foofoo") 2300 self.assertIs(type(s), StrSubclass) 2301 2302 def test_unicode_repr(self): 2303 class s1: 2304 def __repr__(self): 2305 return '\\n' 2306 2307 class s2: 2308 def __repr__(self): 2309 return '\\n' 2310 2311 
self.assertEqual(repr(s1()), '\\n') 2312 self.assertEqual(repr(s2()), '\\n') 2313 2314 def test_printable_repr(self): 2315 self.assertEqual(repr('\U00010000'), "'%c'" % (0x10000,)) # printable 2316 self.assertEqual(repr('\U00014000'), "'\\U00014000'") # nonprintable 2317 2318 # This test only affects 32-bit platforms because expandtabs can only take 2319 # an int as the max value, not a 64-bit C long. If expandtabs is changed 2320 # to take a 64-bit long, this test should apply to all platforms. 2321 @unittest.skipIf(sys.maxsize > (1 << 32) or struct.calcsize('P') != 4, 2322 'only applies to 32-bit platforms') 2323 def test_expandtabs_overflows_gracefully(self): 2324 self.assertRaises(OverflowError, 't\tt\t'.expandtabs, sys.maxsize) 2325 2326 @support.cpython_only 2327 def test_expandtabs_optimization(self): 2328 s = 'abc' 2329 self.assertIs(s.expandtabs(), s) 2330 2331 def test_raiseMemError(self): 2332 if struct.calcsize('P') == 8: 2333 # 64 bits pointers 2334 ascii_struct_size = 48 2335 compact_struct_size = 72 2336 else: 2337 # 32 bits pointers 2338 ascii_struct_size = 24 2339 compact_struct_size = 36 2340 2341 for char in ('a', '\xe9', '\u20ac', '\U0010ffff'): 2342 code = ord(char) 2343 if code < 0x100: 2344 char_size = 1 # sizeof(Py_UCS1) 2345 struct_size = ascii_struct_size 2346 elif code < 0x10000: 2347 char_size = 2 # sizeof(Py_UCS2) 2348 struct_size = compact_struct_size 2349 else: 2350 char_size = 4 # sizeof(Py_UCS4) 2351 struct_size = compact_struct_size 2352 # Note: sys.maxsize is half of the actual max allocation because of 2353 # the signedness of Py_ssize_t. Strings of maxlen-1 should in principle 2354 # be allocatable, given enough memory. 
def test_compare(self):
    # Issue #17615
    # Compare strings whose characters fall in the ASCII, Latin-1, BMP and
    # astral ranges against each other, in both directions.
    N = 10
    ascii1 = 'a' * N    # not named "ascii": that would shadow the builtin
    ascii2 = 'z' * N
    latin = '\x80' * N
    latin2 = '\xff' * N
    bmp = '\u0100' * N
    bmp2 = '\uffff' * N
    astral = '\U00100000' * N
    astral2 = '\U0010ffff' * N
    strings = (
        ascii1, ascii2,
        latin, latin2,
        bmp, bmp2,
        astral, astral2)
    # combinations_with_replacement() also pairs every string with itself,
    # so the identity branch below is really exercised.
    for text1, text2 in itertools.combinations_with_replacement(strings, 2):
        equal = (text1 is text2)
        self.assertEqual(text1 == text2, equal)
        self.assertEqual(text1 != text2, not equal)

        if equal:
            self.assertTrue(text1 <= text2)
            self.assertTrue(text1 >= text2)

            # text1 is text2: duplicate strings to skip the "str1 == str2"
            # optimization in unicode_compare_eq() and really compare
            # character per character
            copy1 = duplicate_string(text1)
            copy2 = duplicate_string(text2)
            self.assertIsNot(copy1, copy2)

            self.assertTrue(copy1 == copy2)
            self.assertFalse(copy1 != copy2)

            self.assertTrue(copy1 <= copy2)
            self.assertTrue(copy1 >= copy2)

    self.assertTrue(ascii1 < ascii2)
    self.assertTrue(ascii1 < latin)
    self.assertTrue(ascii1 < bmp)
    self.assertTrue(ascii1 < astral)
    self.assertFalse(ascii1 >= ascii2)
    self.assertFalse(ascii1 >= latin)
    self.assertFalse(ascii1 >= bmp)
    self.assertFalse(ascii1 >= astral)

    self.assertFalse(latin < ascii1)
    self.assertTrue(latin < latin2)
    self.assertTrue(latin < bmp)
    self.assertTrue(latin < astral)
    self.assertTrue(latin >= ascii1)
    self.assertFalse(latin >= latin2)
    self.assertFalse(latin >= bmp)
    self.assertFalse(latin >= astral)

    self.assertFalse(bmp < ascii1)
    self.assertFalse(bmp < latin)
    self.assertTrue(bmp < bmp2)
    self.assertTrue(bmp < astral)
    self.assertTrue(bmp >= ascii1)
    self.assertTrue(bmp >= latin)
    self.assertFalse(bmp >= bmp2)
    self.assertFalse(bmp >= astral)

    self.assertFalse(astral < ascii1)
    self.assertFalse(astral < latin)
    self.assertFalse(astral < bmp2)
    self.assertTrue(astral < astral2)
    self.assertTrue(astral >= ascii1)
    self.assertTrue(astral >= latin)
    self.assertTrue(astral >= bmp2)
    self.assertFalse(astral >= astral2)
# Test PyUnicode_FromFormat()
def test_from_format(self):
    import_helper.import_module('ctypes')
    from ctypes import (
        c_char_p,
        pythonapi, py_object, sizeof,
        c_int, c_long, c_longlong, c_ssize_t,
        c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p)
    name = "PyUnicode_FromFormat"
    _PyUnicode_FromFormat = getattr(pythonapi, name)
    # Only the fixed "format" parameter is declared; the variadic arguments
    # are passed through as whatever ctypes objects the caller supplies.
    _PyUnicode_FromFormat.argtypes = (c_char_p,)
    _PyUnicode_FromFormat.restype = py_object

    def PyUnicode_FromFormat(format, *args):
        # str arguments are wrapped in py_object so they reach the C
        # function as object pointers; everything else is passed as given.
        cargs = tuple(
            py_object(arg) if isinstance(arg, str) else arg
            for arg in args)
        return _PyUnicode_FromFormat(format, *cargs)

    def check_format(expected, format, *args):
        # Format through the C function and compare with the expected text.
        text = PyUnicode_FromFormat(format, *args)
        self.assertEqual(expected, text)

    # ascii format, non-ascii argument
    check_format('ascii\x7f=unicode\xe9',
                 b'ascii\x7f=%U', 'unicode\xe9')

    # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
    # raises an error
    self.assertRaisesRegex(ValueError,
        r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
        'string, got a non-ASCII byte: 0xe9$',
        PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')

    # test "%c"
    check_format('\uabcd',
                 b'%c', c_int(0xabcd))
    check_format('\U0010ffff',
                 b'%c', c_int(0x10ffff))
    with self.assertRaises(OverflowError):
        PyUnicode_FromFormat(b'%c', c_int(0x110000))
    # Issue #18183
    check_format('\U00010000\U00100000',
                 b'%c%c', c_int(0x10000), c_int(0x100000))

    # test "%"
    check_format('%',
                 b'%')
    check_format('%',
                 b'%%')
    check_format('%s',
                 b'%%s')
    check_format('[%]',
                 b'[%%]')
    check_format('%abc',
                 b'%%%s', b'abc')

    # truncated string
    check_format('abc',
                 b'%.3s', b'abcdef')
    check_format('abc[\ufffd',
                 b'%.5s', 'abc[\u20ac]'.encode('utf8'))
    check_format("'\\u20acABC'",
                 b'%A', '\u20acABC')
    check_format("'\\u20",
                 b'%.5A', '\u20acABCDEF')
    check_format("'\u20acABC'",
                 b'%R', '\u20acABC')
    check_format("'\u20acA",
                 b'%.3R', '\u20acABCDEF')
    check_format('\u20acAB',
                 b'%.3S', '\u20acABCDEF')
    check_format('\u20acAB',
                 b'%.3U', '\u20acABCDEF')
    check_format('\u20acAB',
                 b'%.3V', '\u20acABCDEF', None)
    check_format('abc[\ufffd',
                 b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))

    # following tests comes from #7330
    # In the width checks below the expected text is right-aligned: a
    # result shorter than the width gets (width - length) leading spaces.

    # test width modifier and precision modifier with %S
    check_format("repr=  abc",
                 b'repr=%5S', 'abc')
    check_format("repr=ab",
                 b'repr=%.2S', 'abc')
    check_format("repr=   ab",
                 b'repr=%5.2S', 'abc')

    # test width modifier and precision modifier with %R
    # (the repr of 'abc' is 5 characters long, quotes included)
    check_format("repr=   'abc'",
                 b'repr=%8R', 'abc')
    check_format("repr='ab",
                 b'repr=%.3R', 'abc')
    check_format("repr=  'ab",
                 b'repr=%5.3R', 'abc')

    # test width modifier and precision modifier with %A
    check_format("repr=   'abc'",
                 b'repr=%8A', 'abc')
    check_format("repr='ab",
                 b'repr=%.3A', 'abc')
    check_format("repr=  'ab",
                 b'repr=%5.3A', 'abc')

    # test width modifier and precision modifier with %s
    check_format("repr=  abc",
                 b'repr=%5s', b'abc')
    check_format("repr=ab",
                 b'repr=%.2s', b'abc')
    check_format("repr=   ab",
                 b'repr=%5.2s', b'abc')

    # test width modifier and precision modifier with %U
    check_format("repr=  abc",
                 b'repr=%5U', 'abc')
    check_format("repr=ab",
                 b'repr=%.2U', 'abc')
    check_format("repr=   ab",
                 b'repr=%5.2U', 'abc')

    # test width modifier and precision modifier with %V
    check_format("repr=  abc",
                 b'repr=%5V', 'abc', b'123')
    check_format("repr=ab",
                 b'repr=%.2V', 'abc', b'123')
    check_format("repr=   ab",
                 b'repr=%5.2V', 'abc', b'123')
    check_format("repr=  123",
                 b'repr=%5V', None, b'123')
    check_format("repr=12",
                 b'repr=%.2V', None, b'123')
    check_format("repr=   12",
                 b'repr=%5.2V', None, b'123')

    # test integer formats (%i, %d, %u)
    check_format('010',
                 b'%03i', c_int(10))
    check_format('0010',
                 b'%0.4i', c_int(10))
    check_format('-123',
                 b'%i', c_int(-123))
    check_format('-123',
                 b'%li', c_long(-123))
    check_format('-123',
                 b'%lli', c_longlong(-123))
    check_format('-123',
                 b'%zi', c_ssize_t(-123))

    check_format('-123',
                 b'%d', c_int(-123))
    check_format('-123',
                 b'%ld', c_long(-123))
    check_format('-123',
                 b'%lld', c_longlong(-123))
    check_format('-123',
                 b'%zd', c_ssize_t(-123))

    check_format('123',
                 b'%u', c_uint(123))
    check_format('123',
                 b'%lu', c_ulong(123))
    check_format('123',
                 b'%llu', c_ulonglong(123))
    check_format('123',
                 b'%zu', c_size_t(123))

    # test long output
    min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1))
    max_longlong = -min_longlong - 1
    check_format(str(min_longlong),
                 b'%lld', c_longlong(min_longlong))
    check_format(str(max_longlong),
                 b'%lld', c_longlong(max_longlong))
    max_ulonglong = 2 ** (8 * sizeof(c_ulonglong)) - 1
    check_format(str(max_ulonglong),
                 b'%llu', c_ulonglong(max_ulonglong))
    # The result is not compared: this call only has to complete.
    PyUnicode_FromFormat(b'%p', c_void_p(-1))

    # test padding (width and/or precision)
    check_format('123'.rjust(10, '0'),
                 b'%010i', c_int(123))
    check_format('123'.rjust(100),
                 b'%100i', c_int(123))
    check_format('123'.rjust(100, '0'),
                 b'%.100i', c_int(123))
    check_format('123'.rjust(80, '0').rjust(100),
                 b'%100.80i', c_int(123))

    check_format('123'.rjust(10, '0'),
                 b'%010u', c_uint(123))
    check_format('123'.rjust(100),
                 b'%100u', c_uint(123))
    check_format('123'.rjust(100, '0'),
                 b'%.100u', c_uint(123))
    check_format('123'.rjust(80, '0').rjust(100),
                 b'%100.80u', c_uint(123))

    check_format('123'.rjust(10, '0'),
                 b'%010x', c_int(0x123))
    check_format('123'.rjust(100),
                 b'%100x', c_int(0x123))
    check_format('123'.rjust(100, '0'),
                 b'%.100x', c_int(0x123))
    check_format('123'.rjust(80, '0').rjust(100),
                 b'%100.80x', c_int(0x123))

    # test %A
    check_format(r"%A:'abc\xe9\uabcd\U0010ffff'",
                 b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')

    # test %V
    check_format('repr=abc',
                 b'repr=%V', 'abc', b'xyz')

    # Test string decode from parameter of %s using utf-8.
    # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of
    # '\u4eba\u6c11'
    check_format('repr=\u4eba\u6c11',
                 b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')

    # Test replace error handler.
    check_format('repr=abc\ufffd',
                 b'repr=%V', None, b'abc\xff')

    # not supported: copy the raw format string. these tests are just here
    # to check for crashes and should not be considered as specifications
    check_format('%s',
                 b'%1%s', b'abc')
    check_format('%1abc',
                 b'%1abc')
    check_format('%+i',
                 b'%+i', c_int(10))
    check_format('%.%s',
                 b'%.%s', b'abc')

    # Issue #33817: empty strings
    check_format('',
                 b'')
    check_format('',
                 b'%s', b'')
from _testcapi import unicode_aswidecharstring 2831 import_helper.import_module('ctypes') 2832 from ctypes import c_wchar, sizeof 2833 2834 wchar, size = unicode_aswidecharstring('abc') 2835 self.assertEqual(size, 3) 2836 self.assertEqual(wchar, 'abc\0') 2837 2838 wchar, size = unicode_aswidecharstring('abc\0def') 2839 self.assertEqual(size, 7) 2840 self.assertEqual(wchar, 'abc\0def\0') 2841 2842 nonbmp = chr(0x10ffff) 2843 if sizeof(c_wchar) == 2: 2844 nchar = 2 2845 else: # sizeof(c_wchar) == 4 2846 nchar = 1 2847 wchar, size = unicode_aswidecharstring(nonbmp) 2848 self.assertEqual(size, nchar) 2849 self.assertEqual(wchar, nonbmp + '\0') 2850 2851 # Test PyUnicode_AsUCS4() 2852 @support.cpython_only 2853 def test_asucs4(self): 2854 from _testcapi import unicode_asucs4 2855 for s in ['abc', '\xa1\xa2', '\u4f60\u597d', 'a\U0001f600', 2856 'a\ud800b\udfffc', '\ud834\udd1e']: 2857 l = len(s) 2858 self.assertEqual(unicode_asucs4(s, l, True), s+'\0') 2859 self.assertEqual(unicode_asucs4(s, l, False), s+'\uffff') 2860 self.assertEqual(unicode_asucs4(s, l+1, True), s+'\0\uffff') 2861 self.assertEqual(unicode_asucs4(s, l+1, False), s+'\0\uffff') 2862 self.assertRaises(SystemError, unicode_asucs4, s, l-1, True) 2863 self.assertRaises(SystemError, unicode_asucs4, s, l-2, False) 2864 s = '\0'.join([s, s]) 2865 self.assertEqual(unicode_asucs4(s, len(s), True), s+'\0') 2866 self.assertEqual(unicode_asucs4(s, len(s), False), s+'\uffff') 2867 2868 # Test PyUnicode_AsUTF8() 2869 @support.cpython_only 2870 def test_asutf8(self): 2871 from _testcapi import unicode_asutf8 2872 2873 bmp = '\u0100' 2874 bmp2 = '\uffff' 2875 nonbmp = chr(0x10ffff) 2876 2877 self.assertEqual(unicode_asutf8(bmp), b'\xc4\x80') 2878 self.assertEqual(unicode_asutf8(bmp2), b'\xef\xbf\xbf') 2879 self.assertEqual(unicode_asutf8(nonbmp), b'\xf4\x8f\xbf\xbf') 2880 self.assertRaises(UnicodeEncodeError, unicode_asutf8, 'a\ud800b\udfffc') 2881 2882 # Test PyUnicode_AsUTF8AndSize() 2883 @support.cpython_only 2884 
def test_asutf8andsize(self): 2885 from _testcapi import unicode_asutf8andsize 2886 2887 bmp = '\u0100' 2888 bmp2 = '\uffff' 2889 nonbmp = chr(0x10ffff) 2890 2891 self.assertEqual(unicode_asutf8andsize(bmp), (b'\xc4\x80', 2)) 2892 self.assertEqual(unicode_asutf8andsize(bmp2), (b'\xef\xbf\xbf', 3)) 2893 self.assertEqual(unicode_asutf8andsize(nonbmp), (b'\xf4\x8f\xbf\xbf', 4)) 2894 self.assertRaises(UnicodeEncodeError, unicode_asutf8andsize, 'a\ud800b\udfffc') 2895 2896 # Test PyUnicode_FindChar() 2897 @support.cpython_only 2898 def test_findchar(self): 2899 from _testcapi import unicode_findchar 2900 2901 for str in "\xa1", "\u8000\u8080", "\ud800\udc02", "\U0001f100\U0001f1f1": 2902 for i, ch in enumerate(str): 2903 self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), 1), i) 2904 self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), -1), i) 2905 2906 str = "!>_<!" 2907 self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), 1), -1) 2908 self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), -1), -1) 2909 # start < end 2910 self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, 1), 4) 2911 self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, -1), 4) 2912 # start >= end 2913 self.assertEqual(unicode_findchar(str, ord('!'), 0, 0, 1), -1) 2914 self.assertEqual(unicode_findchar(str, ord('!'), len(str), 0, 1), -1) 2915 # negative 2916 self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, 1), 0) 2917 self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, -1), 0) 2918 2919 # Test PyUnicode_CopyCharacters() 2920 @support.cpython_only 2921 def test_copycharacters(self): 2922 from _testcapi import unicode_copycharacters 2923 2924 strings = [ 2925 'abcde', '\xa1\xa2\xa3\xa4\xa5', 2926 '\u4f60\u597d\u4e16\u754c\uff01', 2927 '\U0001f600\U0001f601\U0001f602\U0001f603\U0001f604' 2928 ] 2929 2930 for idx, from_ in enumerate(strings): 2931 # wide -> narrow: exceed maxchar limitation 2932 for to in strings[:idx]: 2933 
@support.cpython_only
@support.requires_legacy_unicode_capi
def test_transform_decimal(self):
    from _testcapi import unicode_transformdecimaltoascii as transform_decimal
    # (input, expected) pairs: the Arabic-Indic digits come back as ASCII
    # digits, while the spaces and the euro sign come back unchanged.
    cases = (
        ('123', '123'),
        ('\u0663.\u0661\u0664', '3.14'),
        ("\N{EM SPACE}3.14\N{EN SPACE}", "\N{EM SPACE}3.14\N{EN SPACE}"),
        ('123\u20ac', '123\u20ac'),
    )
    with warnings_helper.check_warnings():
        # DeprecationWarning is silenced for the calls below
        warnings.simplefilter('ignore', DeprecationWarning)
        for text, expected in cases:
            self.assertEqual(transform_decimal(text), expected)
3009 self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1)) 3010 # Check that the second call returns the same result 3011 self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1)) 3012 3013class StringModuleTest(unittest.TestCase): 3014 def test_formatter_parser(self): 3015 def parse(format): 3016 return list(_string.formatter_parser(format)) 3017 3018 formatter = parse("prefix {2!s}xxx{0:^+10.3f}{obj.attr!s} {z[0]!s:10}") 3019 self.assertEqual(formatter, [ 3020 ('prefix ', '2', '', 's'), 3021 ('xxx', '0', '^+10.3f', None), 3022 ('', 'obj.attr', '', 's'), 3023 (' ', 'z[0]', '10', 's'), 3024 ]) 3025 3026 formatter = parse("prefix {} suffix") 3027 self.assertEqual(formatter, [ 3028 ('prefix ', '', '', None), 3029 (' suffix', None, None, None), 3030 ]) 3031 3032 formatter = parse("str") 3033 self.assertEqual(formatter, [ 3034 ('str', None, None, None), 3035 ]) 3036 3037 formatter = parse("") 3038 self.assertEqual(formatter, []) 3039 3040 formatter = parse("{0}") 3041 self.assertEqual(formatter, [ 3042 ('', '0', '', None), 3043 ]) 3044 3045 self.assertRaises(TypeError, _string.formatter_parser, 1) 3046 3047 def test_formatter_field_name_split(self): 3048 def split(name): 3049 items = list(_string.formatter_field_name_split(name)) 3050 items[1] = list(items[1]) 3051 return items 3052 self.assertEqual(split("obj"), ["obj", []]) 3053 self.assertEqual(split("obj.arg"), ["obj", [(True, 'arg')]]) 3054 self.assertEqual(split("obj[key]"), ["obj", [(False, 'key')]]) 3055 self.assertEqual(split("obj.arg[key1][key2]"), [ 3056 "obj", 3057 [(True, 'arg'), 3058 (False, 'key1'), 3059 (False, 'key2'), 3060 ]]) 3061 self.assertRaises(TypeError, _string.formatter_field_name_split, 1) 3062 3063 3064if __name__ == "__main__": 3065 unittest.main() 3066