# Error handling (bad decoder return)
def search_function(encoding):
    """Codec search function serving two deliberately broken test codecs.

    "test.unicode1" maps to an encoder/decoder pair that returns a bare
    int (not a tuple); "test.unicode2" maps to a pair that returns a
    tuple whose first item is not a string.  Any other encoding name
    yields None.
    """
    def _always(value):
        # Build a codec function that ignores its arguments and returns
        # the fixed (invalid) *value*.
        def codec(input, errors="strict"):
            return value
        return codec

    if encoding == "test.unicode1":
        # not a tuple
        return (_always(42), _always(42), None, None)
    if encoding == "test.unicode2":
        # no unicode
        return (_always((42, 42)), _always((42, 42)), None, None)
    return None
46 """ 47 return text.encode().decode() 48 49class StrSubclass(str): 50 pass 51 52class UnicodeTest(string_tests.CommonTest, 53 string_tests.MixinStrUnicodeUserStringTest, 54 string_tests.MixinStrUnicodeTest, 55 unittest.TestCase): 56 57 type2test = str 58 59 def checkequalnofix(self, result, object, methodname, *args): 60 method = getattr(object, methodname) 61 realresult = method(*args) 62 self.assertEqual(realresult, result) 63 self.assertTrue(type(realresult) is type(result)) 64 65 # if the original is returned make sure that 66 # this doesn't happen with subclasses 67 if realresult is object: 68 class usub(str): 69 def __repr__(self): 70 return 'usub(%r)' % str.__repr__(self) 71 object = usub(object) 72 method = getattr(object, methodname) 73 realresult = method(*args) 74 self.assertEqual(realresult, result) 75 self.assertTrue(object is not realresult) 76 77 def test_literals(self): 78 self.assertEqual('\xff', '\u00ff') 79 self.assertEqual('\uffff', '\U0000ffff') 80 self.assertRaises(SyntaxError, eval, '\'\\Ufffffffe\'') 81 self.assertRaises(SyntaxError, eval, '\'\\Uffffffff\'') 82 self.assertRaises(SyntaxError, eval, '\'\\U%08x\'' % 0x110000) 83 # raw strings should not have unicode escapes 84 self.assertNotEqual(r"\u0020", " ") 85 86 def test_ascii(self): 87 if not sys.platform.startswith('java'): 88 # Test basic sanity of repr() 89 self.assertEqual(ascii('abc'), "'abc'") 90 self.assertEqual(ascii('ab\\c'), "'ab\\\\c'") 91 self.assertEqual(ascii('ab\\'), "'ab\\\\'") 92 self.assertEqual(ascii('\\c'), "'\\\\c'") 93 self.assertEqual(ascii('\\'), "'\\\\'") 94 self.assertEqual(ascii('\n'), "'\\n'") 95 self.assertEqual(ascii('\r'), "'\\r'") 96 self.assertEqual(ascii('\t'), "'\\t'") 97 self.assertEqual(ascii('\b'), "'\\x08'") 98 self.assertEqual(ascii("'\""), """'\\'"'""") 99 self.assertEqual(ascii("'\""), """'\\'"'""") 100 self.assertEqual(ascii("'"), '''"'"''') 101 self.assertEqual(ascii('"'), """'"'""") 102 latin1repr = ( 103 
"'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r" 104 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a" 105 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI" 106 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f" 107 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d" 108 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b" 109 "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9" 110 "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7" 111 "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5" 112 "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3" 113 "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1" 114 "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef" 115 "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd" 116 "\\xfe\\xff'") 117 testrepr = ascii(''.join(map(chr, range(256)))) 118 self.assertEqual(testrepr, latin1repr) 119 # Test ascii works on wide unicode escapes without overflow. 
120 self.assertEqual(ascii("\U00010000" * 39 + "\uffff" * 4096), 121 ascii("\U00010000" * 39 + "\uffff" * 4096)) 122 123 class WrongRepr: 124 def __repr__(self): 125 return b'byte-repr' 126 self.assertRaises(TypeError, ascii, WrongRepr()) 127 128 def test_repr(self): 129 if not sys.platform.startswith('java'): 130 # Test basic sanity of repr() 131 self.assertEqual(repr('abc'), "'abc'") 132 self.assertEqual(repr('ab\\c'), "'ab\\\\c'") 133 self.assertEqual(repr('ab\\'), "'ab\\\\'") 134 self.assertEqual(repr('\\c'), "'\\\\c'") 135 self.assertEqual(repr('\\'), "'\\\\'") 136 self.assertEqual(repr('\n'), "'\\n'") 137 self.assertEqual(repr('\r'), "'\\r'") 138 self.assertEqual(repr('\t'), "'\\t'") 139 self.assertEqual(repr('\b'), "'\\x08'") 140 self.assertEqual(repr("'\""), """'\\'"'""") 141 self.assertEqual(repr("'\""), """'\\'"'""") 142 self.assertEqual(repr("'"), '''"'"''') 143 self.assertEqual(repr('"'), """'"'""") 144 latin1repr = ( 145 "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r" 146 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a" 147 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI" 148 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f" 149 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d" 150 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b" 151 "\\x9c\\x9d\\x9e\\x9f\\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9" 152 "\xaa\xab\xac\\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7" 153 "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5" 154 "\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3" 155 "\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1" 156 "\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef" 157 "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd" 158 "\xfe\xff'") 159 testrepr = repr(''.join(map(chr, range(256)))) 160 self.assertEqual(testrepr, latin1repr) 161 # Test repr works on wide unicode escapes without 
overflow. 162 self.assertEqual(repr("\U00010000" * 39 + "\uffff" * 4096), 163 repr("\U00010000" * 39 + "\uffff" * 4096)) 164 165 class WrongRepr: 166 def __repr__(self): 167 return b'byte-repr' 168 self.assertRaises(TypeError, repr, WrongRepr()) 169 170 def test_iterators(self): 171 # Make sure unicode objects have an __iter__ method 172 it = "\u1111\u2222\u3333".__iter__() 173 self.assertEqual(next(it), "\u1111") 174 self.assertEqual(next(it), "\u2222") 175 self.assertEqual(next(it), "\u3333") 176 self.assertRaises(StopIteration, next, it) 177 178 def test_count(self): 179 string_tests.CommonTest.test_count(self) 180 # check mixed argument types 181 self.checkequalnofix(3, 'aaa', 'count', 'a') 182 self.checkequalnofix(0, 'aaa', 'count', 'b') 183 self.checkequalnofix(3, 'aaa', 'count', 'a') 184 self.checkequalnofix(0, 'aaa', 'count', 'b') 185 self.checkequalnofix(0, 'aaa', 'count', 'b') 186 self.checkequalnofix(1, 'aaa', 'count', 'a', -1) 187 self.checkequalnofix(3, 'aaa', 'count', 'a', -10) 188 self.checkequalnofix(2, 'aaa', 'count', 'a', 0, -1) 189 self.checkequalnofix(0, 'aaa', 'count', 'a', 0, -10) 190 # test mixed kinds 191 self.checkequal(10, '\u0102' + 'a' * 10, 'count', 'a') 192 self.checkequal(10, '\U00100304' + 'a' * 10, 'count', 'a') 193 self.checkequal(10, '\U00100304' + '\u0102' * 10, 'count', '\u0102') 194 self.checkequal(0, 'a' * 10, 'count', '\u0102') 195 self.checkequal(0, 'a' * 10, 'count', '\U00100304') 196 self.checkequal(0, '\u0102' * 10, 'count', '\U00100304') 197 self.checkequal(10, '\u0102' + 'a_' * 10, 'count', 'a_') 198 self.checkequal(10, '\U00100304' + 'a_' * 10, 'count', 'a_') 199 self.checkequal(10, '\U00100304' + '\u0102_' * 10, 'count', '\u0102_') 200 self.checkequal(0, 'a' * 10, 'count', 'a\u0102') 201 self.checkequal(0, 'a' * 10, 'count', 'a\U00100304') 202 self.checkequal(0, '\u0102' * 10, 'count', '\u0102\U00100304') 203 204 def test_find(self): 205 string_tests.CommonTest.test_find(self) 206 # test implementation details of the 
memchr fast path 207 self.checkequal(100, 'a' * 100 + '\u0102', 'find', '\u0102') 208 self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0201') 209 self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0120') 210 self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0220') 211 self.checkequal(100, 'a' * 100 + '\U00100304', 'find', '\U00100304') 212 self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00100204') 213 self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00102004') 214 # check mixed argument types 215 self.checkequalnofix(0, 'abcdefghiabc', 'find', 'abc') 216 self.checkequalnofix(9, 'abcdefghiabc', 'find', 'abc', 1) 217 self.checkequalnofix(-1, 'abcdefghiabc', 'find', 'def', 4) 218 219 self.assertRaises(TypeError, 'hello'.find) 220 self.assertRaises(TypeError, 'hello'.find, 42) 221 # test mixed kinds 222 self.checkequal(100, '\u0102' * 100 + 'a', 'find', 'a') 223 self.checkequal(100, '\U00100304' * 100 + 'a', 'find', 'a') 224 self.checkequal(100, '\U00100304' * 100 + '\u0102', 'find', '\u0102') 225 self.checkequal(-1, 'a' * 100, 'find', '\u0102') 226 self.checkequal(-1, 'a' * 100, 'find', '\U00100304') 227 self.checkequal(-1, '\u0102' * 100, 'find', '\U00100304') 228 self.checkequal(100, '\u0102' * 100 + 'a_', 'find', 'a_') 229 self.checkequal(100, '\U00100304' * 100 + 'a_', 'find', 'a_') 230 self.checkequal(100, '\U00100304' * 100 + '\u0102_', 'find', '\u0102_') 231 self.checkequal(-1, 'a' * 100, 'find', 'a\u0102') 232 self.checkequal(-1, 'a' * 100, 'find', 'a\U00100304') 233 self.checkequal(-1, '\u0102' * 100, 'find', '\u0102\U00100304') 234 235 def test_rfind(self): 236 string_tests.CommonTest.test_rfind(self) 237 # test implementation details of the memrchr fast path 238 self.checkequal(0, '\u0102' + 'a' * 100 , 'rfind', '\u0102') 239 self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0201') 240 self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0120') 241 self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0220') 242 
self.checkequal(0, '\U00100304' + 'a' * 100, 'rfind', '\U00100304') 243 self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00100204') 244 self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00102004') 245 # check mixed argument types 246 self.checkequalnofix(9, 'abcdefghiabc', 'rfind', 'abc') 247 self.checkequalnofix(12, 'abcdefghiabc', 'rfind', '') 248 self.checkequalnofix(12, 'abcdefghiabc', 'rfind', '') 249 # test mixed kinds 250 self.checkequal(0, 'a' + '\u0102' * 100, 'rfind', 'a') 251 self.checkequal(0, 'a' + '\U00100304' * 100, 'rfind', 'a') 252 self.checkequal(0, '\u0102' + '\U00100304' * 100, 'rfind', '\u0102') 253 self.checkequal(-1, 'a' * 100, 'rfind', '\u0102') 254 self.checkequal(-1, 'a' * 100, 'rfind', '\U00100304') 255 self.checkequal(-1, '\u0102' * 100, 'rfind', '\U00100304') 256 self.checkequal(0, '_a' + '\u0102' * 100, 'rfind', '_a') 257 self.checkequal(0, '_a' + '\U00100304' * 100, 'rfind', '_a') 258 self.checkequal(0, '_\u0102' + '\U00100304' * 100, 'rfind', '_\u0102') 259 self.checkequal(-1, 'a' * 100, 'rfind', '\u0102a') 260 self.checkequal(-1, 'a' * 100, 'rfind', '\U00100304a') 261 self.checkequal(-1, '\u0102' * 100, 'rfind', '\U00100304\u0102') 262 263 def test_index(self): 264 string_tests.CommonTest.test_index(self) 265 self.checkequalnofix(0, 'abcdefghiabc', 'index', '') 266 self.checkequalnofix(3, 'abcdefghiabc', 'index', 'def') 267 self.checkequalnofix(0, 'abcdefghiabc', 'index', 'abc') 268 self.checkequalnofix(9, 'abcdefghiabc', 'index', 'abc', 1) 269 self.assertRaises(ValueError, 'abcdefghiabc'.index, 'hib') 270 self.assertRaises(ValueError, 'abcdefghiab'.index, 'abc', 1) 271 self.assertRaises(ValueError, 'abcdefghi'.index, 'ghi', 8) 272 self.assertRaises(ValueError, 'abcdefghi'.index, 'ghi', -1) 273 # test mixed kinds 274 self.checkequal(100, '\u0102' * 100 + 'a', 'index', 'a') 275 self.checkequal(100, '\U00100304' * 100 + 'a', 'index', 'a') 276 self.checkequal(100, '\U00100304' * 100 + '\u0102', 'index', '\u0102') 277 
self.assertRaises(ValueError, ('a' * 100).index, '\u0102') 278 self.assertRaises(ValueError, ('a' * 100).index, '\U00100304') 279 self.assertRaises(ValueError, ('\u0102' * 100).index, '\U00100304') 280 self.checkequal(100, '\u0102' * 100 + 'a_', 'index', 'a_') 281 self.checkequal(100, '\U00100304' * 100 + 'a_', 'index', 'a_') 282 self.checkequal(100, '\U00100304' * 100 + '\u0102_', 'index', '\u0102_') 283 self.assertRaises(ValueError, ('a' * 100).index, 'a\u0102') 284 self.assertRaises(ValueError, ('a' * 100).index, 'a\U00100304') 285 self.assertRaises(ValueError, ('\u0102' * 100).index, '\u0102\U00100304') 286 287 def test_rindex(self): 288 string_tests.CommonTest.test_rindex(self) 289 self.checkequalnofix(12, 'abcdefghiabc', 'rindex', '') 290 self.checkequalnofix(3, 'abcdefghiabc', 'rindex', 'def') 291 self.checkequalnofix(9, 'abcdefghiabc', 'rindex', 'abc') 292 self.checkequalnofix(0, 'abcdefghiabc', 'rindex', 'abc', 0, -1) 293 294 self.assertRaises(ValueError, 'abcdefghiabc'.rindex, 'hib') 295 self.assertRaises(ValueError, 'defghiabc'.rindex, 'def', 1) 296 self.assertRaises(ValueError, 'defghiabc'.rindex, 'abc', 0, -1) 297 self.assertRaises(ValueError, 'abcdefghi'.rindex, 'ghi', 0, 8) 298 self.assertRaises(ValueError, 'abcdefghi'.rindex, 'ghi', 0, -1) 299 # test mixed kinds 300 self.checkequal(0, 'a' + '\u0102' * 100, 'rindex', 'a') 301 self.checkequal(0, 'a' + '\U00100304' * 100, 'rindex', 'a') 302 self.checkequal(0, '\u0102' + '\U00100304' * 100, 'rindex', '\u0102') 303 self.assertRaises(ValueError, ('a' * 100).rindex, '\u0102') 304 self.assertRaises(ValueError, ('a' * 100).rindex, '\U00100304') 305 self.assertRaises(ValueError, ('\u0102' * 100).rindex, '\U00100304') 306 self.checkequal(0, '_a' + '\u0102' * 100, 'rindex', '_a') 307 self.checkequal(0, '_a' + '\U00100304' * 100, 'rindex', '_a') 308 self.checkequal(0, '_\u0102' + '\U00100304' * 100, 'rindex', '_\u0102') 309 self.assertRaises(ValueError, ('a' * 100).rindex, '\u0102a') 310 
self.assertRaises(ValueError, ('a' * 100).rindex, '\U00100304a') 311 self.assertRaises(ValueError, ('\u0102' * 100).rindex, '\U00100304\u0102') 312 313 def test_maketrans_translate(self): 314 # these work with plain translate() 315 self.checkequalnofix('bbbc', 'abababc', 'translate', 316 {ord('a'): None}) 317 self.checkequalnofix('iiic', 'abababc', 'translate', 318 {ord('a'): None, ord('b'): ord('i')}) 319 self.checkequalnofix('iiix', 'abababc', 'translate', 320 {ord('a'): None, ord('b'): ord('i'), ord('c'): 'x'}) 321 self.checkequalnofix('c', 'abababc', 'translate', 322 {ord('a'): None, ord('b'): ''}) 323 self.checkequalnofix('xyyx', 'xzx', 'translate', 324 {ord('z'): 'yy'}) 325 326 # this needs maketrans() 327 self.checkequalnofix('abababc', 'abababc', 'translate', 328 {'b': '<i>'}) 329 tbl = self.type2test.maketrans({'a': None, 'b': '<i>'}) 330 self.checkequalnofix('<i><i><i>c', 'abababc', 'translate', tbl) 331 # test alternative way of calling maketrans() 332 tbl = self.type2test.maketrans('abc', 'xyz', 'd') 333 self.checkequalnofix('xyzzy', 'abdcdcbdddd', 'translate', tbl) 334 335 # various tests switching from ASCII to latin1 or the opposite; 336 # same length, remove a letter, or replace with a longer string. 
337 self.assertEqual("[a]".translate(str.maketrans('a', 'X')), 338 "[X]") 339 self.assertEqual("[a]".translate(str.maketrans({'a': 'X'})), 340 "[X]") 341 self.assertEqual("[a]".translate(str.maketrans({'a': None})), 342 "[]") 343 self.assertEqual("[a]".translate(str.maketrans({'a': 'XXX'})), 344 "[XXX]") 345 self.assertEqual("[a]".translate(str.maketrans({'a': '\xe9'})), 346 "[\xe9]") 347 self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '123'})), 348 "x123") 349 self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '\xe9'})), 350 "x\xe9") 351 352 # test non-ASCII (don't take the fast-path) 353 self.assertEqual("[a]".translate(str.maketrans({'a': '<\xe9>'})), 354 "[<\xe9>]") 355 self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': 'a'})), 356 "[a]") 357 self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': None})), 358 "[]") 359 self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': '123'})), 360 "[123]") 361 self.assertEqual("[a\xe9]".translate(str.maketrans({'a': '<\u20ac>'})), 362 "[<\u20ac>\xe9]") 363 364 # invalid Unicode characters 365 invalid_char = 0x10ffff+1 366 for before in "a\xe9\u20ac\U0010ffff": 367 mapping = str.maketrans({before: invalid_char}) 368 text = "[%s]" % before 369 self.assertRaises(ValueError, text.translate, mapping) 370 371 # errors 372 self.assertRaises(TypeError, self.type2test.maketrans) 373 self.assertRaises(ValueError, self.type2test.maketrans, 'abc', 'defg') 374 self.assertRaises(TypeError, self.type2test.maketrans, 2, 'def') 375 self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 2) 376 self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 'def', 2) 377 self.assertRaises(ValueError, self.type2test.maketrans, {'xy': 2}) 378 self.assertRaises(TypeError, self.type2test.maketrans, {(1,): 2}) 379 380 self.assertRaises(TypeError, 'hello'.translate) 381 self.assertRaises(TypeError, 'abababc'.translate, 'abc', 'xyz') 382 383 def test_split(self): 384 
string_tests.CommonTest.test_split(self) 385 386 # test mixed kinds 387 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'): 388 left *= 9 389 right *= 9 390 for delim in ('c', '\u0102', '\U00010302'): 391 self.checkequal([left + right], 392 left + right, 'split', delim) 393 self.checkequal([left, right], 394 left + delim + right, 'split', delim) 395 self.checkequal([left + right], 396 left + right, 'split', delim * 2) 397 self.checkequal([left, right], 398 left + delim * 2 + right, 'split', delim *2) 399 400 def test_rsplit(self): 401 string_tests.CommonTest.test_rsplit(self) 402 # test mixed kinds 403 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'): 404 left *= 9 405 right *= 9 406 for delim in ('c', '\u0102', '\U00010302'): 407 self.checkequal([left + right], 408 left + right, 'rsplit', delim) 409 self.checkequal([left, right], 410 left + delim + right, 'rsplit', delim) 411 self.checkequal([left + right], 412 left + right, 'rsplit', delim * 2) 413 self.checkequal([left, right], 414 left + delim * 2 + right, 'rsplit', delim *2) 415 416 def test_partition(self): 417 string_tests.MixinStrUnicodeUserStringTest.test_partition(self) 418 # test mixed kinds 419 self.checkequal(('ABCDEFGH', '', ''), 'ABCDEFGH', 'partition', '\u4200') 420 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'): 421 left *= 9 422 right *= 9 423 for delim in ('c', '\u0102', '\U00010302'): 424 self.checkequal((left + right, '', ''), 425 left + right, 'partition', delim) 426 self.checkequal((left, delim, right), 427 left + delim + right, 'partition', delim) 428 self.checkequal((left + right, '', ''), 429 left + right, 'partition', delim * 2) 430 self.checkequal((left, delim * 2, right), 431 left + delim * 2 + right, 'partition', delim * 2) 432 433 def test_rpartition(self): 434 string_tests.MixinStrUnicodeUserStringTest.test_rpartition(self) 435 # test mixed kinds 436 self.checkequal(('', '', 'ABCDEFGH'), 'ABCDEFGH', 'rpartition', '\u4200') 437 for 
left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'): 438 left *= 9 439 right *= 9 440 for delim in ('c', '\u0102', '\U00010302'): 441 self.checkequal(('', '', left + right), 442 left + right, 'rpartition', delim) 443 self.checkequal((left, delim, right), 444 left + delim + right, 'rpartition', delim) 445 self.checkequal(('', '', left + right), 446 left + right, 'rpartition', delim * 2) 447 self.checkequal((left, delim * 2, right), 448 left + delim * 2 + right, 'rpartition', delim * 2) 449 450 def test_join(self): 451 string_tests.MixinStrUnicodeUserStringTest.test_join(self) 452 453 class MyWrapper: 454 def __init__(self, sval): self.sval = sval 455 def __str__(self): return self.sval 456 457 # mixed arguments 458 self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd']) 459 self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd')) 460 self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz')) 461 self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd']) 462 self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd']) 463 self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd')) 464 self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz')) 465 self.checkraises(TypeError, ' ', 'join', ['1', '2', MyWrapper('foo')]) 466 self.checkraises(TypeError, ' ', 'join', ['1', '2', '3', bytes()]) 467 self.checkraises(TypeError, ' ', 'join', [1, 2, 3]) 468 self.checkraises(TypeError, ' ', 'join', ['1', '2', 3]) 469 470 @unittest.skipIf(sys.maxsize > 2**32, 471 'needs too much memory on a 64-bit platform') 472 def test_join_overflow(self): 473 size = int(sys.maxsize**0.5) + 1 474 seq = ('A' * size,) * size 475 self.assertRaises(OverflowError, ''.join, seq) 476 477 def test_replace(self): 478 string_tests.CommonTest.test_replace(self) 479 480 # method call forwarded from str implementation because of unicode argument 481 self.checkequalnofix('one@two!three!', 'one!two!three!', 'replace', '!', '@', 1) 482 
self.assertRaises(TypeError, 'replace'.replace, "r", 42) 483 # test mixed kinds 484 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'): 485 left *= 9 486 right *= 9 487 for delim in ('c', '\u0102', '\U00010302'): 488 for repl in ('d', '\u0103', '\U00010303'): 489 self.checkequal(left + right, 490 left + right, 'replace', delim, repl) 491 self.checkequal(left + repl + right, 492 left + delim + right, 493 'replace', delim, repl) 494 self.checkequal(left + right, 495 left + right, 'replace', delim * 2, repl) 496 self.checkequal(left + repl + right, 497 left + delim * 2 + right, 498 'replace', delim * 2, repl) 499 500 @support.cpython_only 501 def test_replace_id(self): 502 pattern = 'abc' 503 text = 'abc def' 504 self.assertIs(text.replace(pattern, pattern), text) 505 506 def test_bytes_comparison(self): 507 with support.check_warnings(): 508 warnings.simplefilter('ignore', BytesWarning) 509 self.assertEqual('abc' == b'abc', False) 510 self.assertEqual('abc' != b'abc', True) 511 self.assertEqual('abc' == bytearray(b'abc'), False) 512 self.assertEqual('abc' != bytearray(b'abc'), True) 513 514 def test_comparison(self): 515 # Comparisons: 516 self.assertEqual('abc', 'abc') 517 self.assertTrue('abcd' > 'abc') 518 self.assertTrue('abc' < 'abcd') 519 520 if 0: 521 # Move these tests to a Unicode collation module test... 522 # Testing UTF-16 code point order comparisons... 523 524 # No surrogates, no fixup required. 
525 self.assertTrue('\u0061' < '\u20ac') 526 # Non surrogate below surrogate value, no fixup required 527 self.assertTrue('\u0061' < '\ud800\udc02') 528 529 # Non surrogate above surrogate value, fixup required 530 def test_lecmp(s, s2): 531 self.assertTrue(s < s2) 532 533 def test_fixup(s): 534 s2 = '\ud800\udc01' 535 test_lecmp(s, s2) 536 s2 = '\ud900\udc01' 537 test_lecmp(s, s2) 538 s2 = '\uda00\udc01' 539 test_lecmp(s, s2) 540 s2 = '\udb00\udc01' 541 test_lecmp(s, s2) 542 s2 = '\ud800\udd01' 543 test_lecmp(s, s2) 544 s2 = '\ud900\udd01' 545 test_lecmp(s, s2) 546 s2 = '\uda00\udd01' 547 test_lecmp(s, s2) 548 s2 = '\udb00\udd01' 549 test_lecmp(s, s2) 550 s2 = '\ud800\ude01' 551 test_lecmp(s, s2) 552 s2 = '\ud900\ude01' 553 test_lecmp(s, s2) 554 s2 = '\uda00\ude01' 555 test_lecmp(s, s2) 556 s2 = '\udb00\ude01' 557 test_lecmp(s, s2) 558 s2 = '\ud800\udfff' 559 test_lecmp(s, s2) 560 s2 = '\ud900\udfff' 561 test_lecmp(s, s2) 562 s2 = '\uda00\udfff' 563 test_lecmp(s, s2) 564 s2 = '\udb00\udfff' 565 test_lecmp(s, s2) 566 567 test_fixup('\ue000') 568 test_fixup('\uff61') 569 570 # Surrogates on both sides, no fixup required 571 self.assertTrue('\ud800\udc02' < '\ud84d\udc56') 572 573 def test_islower(self): 574 super().test_islower() 575 self.checkequalnofix(False, '\u1FFc', 'islower') 576 self.assertFalse('\u2167'.islower()) 577 self.assertTrue('\u2177'.islower()) 578 # non-BMP, uppercase 579 self.assertFalse('\U00010401'.islower()) 580 self.assertFalse('\U00010427'.islower()) 581 # non-BMP, lowercase 582 self.assertTrue('\U00010429'.islower()) 583 self.assertTrue('\U0001044E'.islower()) 584 # non-BMP, non-cased 585 self.assertFalse('\U0001F40D'.islower()) 586 self.assertFalse('\U0001F46F'.islower()) 587 588 def test_isupper(self): 589 super().test_isupper() 590 if not sys.platform.startswith('java'): 591 self.checkequalnofix(False, '\u1FFc', 'isupper') 592 self.assertTrue('\u2167'.isupper()) 593 self.assertFalse('\u2177'.isupper()) 594 # non-BMP, uppercase 595 
self.assertTrue('\U00010401'.isupper()) 596 self.assertTrue('\U00010427'.isupper()) 597 # non-BMP, lowercase 598 self.assertFalse('\U00010429'.isupper()) 599 self.assertFalse('\U0001044E'.isupper()) 600 # non-BMP, non-cased 601 self.assertFalse('\U0001F40D'.isupper()) 602 self.assertFalse('\U0001F46F'.isupper()) 603 604 def test_istitle(self): 605 super().test_istitle() 606 self.checkequalnofix(True, '\u1FFc', 'istitle') 607 self.checkequalnofix(True, 'Greek \u1FFcitlecases ...', 'istitle') 608 609 # non-BMP, uppercase + lowercase 610 self.assertTrue('\U00010401\U00010429'.istitle()) 611 self.assertTrue('\U00010427\U0001044E'.istitle()) 612 # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6 613 for ch in ['\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F']: 614 self.assertFalse(ch.istitle(), '{!a} is not title'.format(ch)) 615 616 def test_isspace(self): 617 super().test_isspace() 618 self.checkequalnofix(True, '\u2000', 'isspace') 619 self.checkequalnofix(True, '\u200a', 'isspace') 620 self.checkequalnofix(False, '\u2014', 'isspace') 621 # There are no non-BMP whitespace chars as of Unicode 12. 
622 for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E', 623 '\U0001F40D', '\U0001F46F']: 624 self.assertFalse(ch.isspace(), '{!a} is not space.'.format(ch)) 625 626 @support.requires_resource('cpu') 627 def test_isspace_invariant(self): 628 for codepoint in range(sys.maxunicode + 1): 629 char = chr(codepoint) 630 bidirectional = unicodedata.bidirectional(char) 631 category = unicodedata.category(char) 632 self.assertEqual(char.isspace(), 633 (bidirectional in ('WS', 'B', 'S') 634 or category == 'Zs')) 635 636 def test_isalnum(self): 637 super().test_isalnum() 638 for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E', 639 '\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']: 640 self.assertTrue(ch.isalnum(), '{!a} is alnum.'.format(ch)) 641 642 def test_isalpha(self): 643 super().test_isalpha() 644 self.checkequalnofix(True, '\u1FFc', 'isalpha') 645 # non-BMP, cased 646 self.assertTrue('\U00010401'.isalpha()) 647 self.assertTrue('\U00010427'.isalpha()) 648 self.assertTrue('\U00010429'.isalpha()) 649 self.assertTrue('\U0001044E'.isalpha()) 650 # non-BMP, non-cased 651 self.assertFalse('\U0001F40D'.isalpha()) 652 self.assertFalse('\U0001F46F'.isalpha()) 653 654 def test_isascii(self): 655 super().test_isascii() 656 self.assertFalse("\u20ac".isascii()) 657 self.assertFalse("\U0010ffff".isascii()) 658 659 def test_isdecimal(self): 660 self.checkequalnofix(False, '', 'isdecimal') 661 self.checkequalnofix(False, 'a', 'isdecimal') 662 self.checkequalnofix(True, '0', 'isdecimal') 663 self.checkequalnofix(False, '\u2460', 'isdecimal') # CIRCLED DIGIT ONE 664 self.checkequalnofix(False, '\xbc', 'isdecimal') # VULGAR FRACTION ONE QUARTER 665 self.checkequalnofix(True, '\u0660', 'isdecimal') # ARABIC-INDIC DIGIT ZERO 666 self.checkequalnofix(True, '0123456789', 'isdecimal') 667 self.checkequalnofix(False, '0123456789a', 'isdecimal') 668 669 self.checkraises(TypeError, 'abc', 'isdecimal', 42) 670 671 for ch in ['\U00010401', '\U00010427', 
'\U00010429', '\U0001044E', 672 '\U0001F40D', '\U0001F46F', '\U00011065', '\U0001F107']: 673 self.assertFalse(ch.isdecimal(), '{!a} is not decimal.'.format(ch)) 674 for ch in ['\U0001D7F6', '\U00011066', '\U000104A0']: 675 self.assertTrue(ch.isdecimal(), '{!a} is decimal.'.format(ch)) 676 677 def test_isdigit(self): 678 super().test_isdigit() 679 self.checkequalnofix(True, '\u2460', 'isdigit') 680 self.checkequalnofix(False, '\xbc', 'isdigit') 681 self.checkequalnofix(True, '\u0660', 'isdigit') 682 683 for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E', 684 '\U0001F40D', '\U0001F46F', '\U00011065']: 685 self.assertFalse(ch.isdigit(), '{!a} is not a digit.'.format(ch)) 686 for ch in ['\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']: 687 self.assertTrue(ch.isdigit(), '{!a} is a digit.'.format(ch)) 688 689 def test_isnumeric(self): 690 self.checkequalnofix(False, '', 'isnumeric') 691 self.checkequalnofix(False, 'a', 'isnumeric') 692 self.checkequalnofix(True, '0', 'isnumeric') 693 self.checkequalnofix(True, '\u2460', 'isnumeric') 694 self.checkequalnofix(True, '\xbc', 'isnumeric') 695 self.checkequalnofix(True, '\u0660', 'isnumeric') 696 self.checkequalnofix(True, '0123456789', 'isnumeric') 697 self.checkequalnofix(False, '0123456789a', 'isnumeric') 698 699 self.assertRaises(TypeError, "abc".isnumeric, 42) 700 701 for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E', 702 '\U0001F40D', '\U0001F46F']: 703 self.assertFalse(ch.isnumeric(), '{!a} is not numeric.'.format(ch)) 704 for ch in ['\U00011065', '\U0001D7F6', '\U00011066', 705 '\U000104A0', '\U0001F107']: 706 self.assertTrue(ch.isnumeric(), '{!a} is numeric.'.format(ch)) 707 708 def test_isidentifier(self): 709 self.assertTrue("a".isidentifier()) 710 self.assertTrue("Z".isidentifier()) 711 self.assertTrue("_".isidentifier()) 712 self.assertTrue("b0".isidentifier()) 713 self.assertTrue("bc".isidentifier()) 714 self.assertTrue("b_".isidentifier()) 715 
self.assertTrue("µ".isidentifier()) 716 self.assertTrue("".isidentifier()) 717 718 self.assertFalse(" ".isidentifier()) 719 self.assertFalse("[".isidentifier()) 720 self.assertFalse("©".isidentifier()) 721 self.assertFalse("0".isidentifier()) 722 723 @support.cpython_only 724 def test_isidentifier_legacy(self): 725 import _testcapi 726 u = '' 727 self.assertTrue(u.isidentifier()) 728 self.assertTrue(_testcapi.unicode_legacy_string(u).isidentifier()) 729 730 def test_isprintable(self): 731 self.assertTrue("".isprintable()) 732 self.assertTrue(" ".isprintable()) 733 self.assertTrue("abcdefg".isprintable()) 734 self.assertFalse("abcdefg\n".isprintable()) 735 # some defined Unicode character 736 self.assertTrue("\u0374".isprintable()) 737 # undefined character 738 self.assertFalse("\u0378".isprintable()) 739 # single surrogate character 740 self.assertFalse("\ud800".isprintable()) 741 742 self.assertTrue('\U0001F46F'.isprintable()) 743 self.assertFalse('\U000E0020'.isprintable()) 744 745 def test_surrogates(self): 746 for s in ('a\uD800b\uDFFF', 'a\uDFFFb\uD800', 747 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'): 748 self.assertTrue(s.islower()) 749 self.assertFalse(s.isupper()) 750 self.assertFalse(s.istitle()) 751 for s in ('A\uD800B\uDFFF', 'A\uDFFFB\uD800', 752 'A\uD800B\uDFFFA', 'A\uDFFFB\uD800A'): 753 self.assertFalse(s.islower()) 754 self.assertTrue(s.isupper()) 755 self.assertTrue(s.istitle()) 756 757 for meth_name in ('islower', 'isupper', 'istitle'): 758 meth = getattr(str, meth_name) 759 for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF'): 760 self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name)) 761 762 for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace', 763 'isdecimal', 'isnumeric', 764 'isidentifier', 'isprintable'): 765 meth = getattr(str, meth_name) 766 for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF', 767 'a\uD800b\uDFFF', 'a\uDFFFb\uD800', 768 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'): 769 self.assertFalse(meth(s), 
'%a.%s() is False' % (s, meth_name)) 770 771 772 def test_lower(self): 773 string_tests.CommonTest.test_lower(self) 774 self.assertEqual('\U00010427'.lower(), '\U0001044F') 775 self.assertEqual('\U00010427\U00010427'.lower(), 776 '\U0001044F\U0001044F') 777 self.assertEqual('\U00010427\U0001044F'.lower(), 778 '\U0001044F\U0001044F') 779 self.assertEqual('X\U00010427x\U0001044F'.lower(), 780 'x\U0001044Fx\U0001044F') 781 self.assertEqual('fi'.lower(), 'fi') 782 self.assertEqual('\u0130'.lower(), '\u0069\u0307') 783 # Special case for GREEK CAPITAL LETTER SIGMA U+03A3 784 self.assertEqual('\u03a3'.lower(), '\u03c3') 785 self.assertEqual('\u0345\u03a3'.lower(), '\u0345\u03c3') 786 self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2') 787 self.assertEqual('A\u0345\u03a3a'.lower(), 'a\u0345\u03c3a') 788 self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2') 789 self.assertEqual('A\u03a3\u0345'.lower(), 'a\u03c2\u0345') 790 self.assertEqual('\u03a3\u0345 '.lower(), '\u03c3\u0345 ') 791 self.assertEqual('\U0008fffe'.lower(), '\U0008fffe') 792 self.assertEqual('\u2177'.lower(), '\u2177') 793 794 def test_casefold(self): 795 self.assertEqual('hello'.casefold(), 'hello') 796 self.assertEqual('hELlo'.casefold(), 'hello') 797 self.assertEqual('ß'.casefold(), 'ss') 798 self.assertEqual('fi'.casefold(), 'fi') 799 self.assertEqual('\u03a3'.casefold(), '\u03c3') 800 self.assertEqual('A\u0345\u03a3'.casefold(), 'a\u03b9\u03c3') 801 self.assertEqual('\u00b5'.casefold(), '\u03bc') 802 803 def test_upper(self): 804 string_tests.CommonTest.test_upper(self) 805 self.assertEqual('\U0001044F'.upper(), '\U00010427') 806 self.assertEqual('\U0001044F\U0001044F'.upper(), 807 '\U00010427\U00010427') 808 self.assertEqual('\U00010427\U0001044F'.upper(), 809 '\U00010427\U00010427') 810 self.assertEqual('X\U00010427x\U0001044F'.upper(), 811 'X\U00010427X\U00010427') 812 self.assertEqual('fi'.upper(), 'FI') 813 self.assertEqual('\u0130'.upper(), '\u0130') 814 
self.assertEqual('\u03a3'.upper(), '\u03a3') 815 self.assertEqual('ß'.upper(), 'SS') 816 self.assertEqual('\u1fd2'.upper(), '\u0399\u0308\u0300') 817 self.assertEqual('\U0008fffe'.upper(), '\U0008fffe') 818 self.assertEqual('\u2177'.upper(), '\u2167') 819 820 def test_capitalize(self): 821 string_tests.CommonTest.test_capitalize(self) 822 self.assertEqual('\U0001044F'.capitalize(), '\U00010427') 823 self.assertEqual('\U0001044F\U0001044F'.capitalize(), 824 '\U00010427\U0001044F') 825 self.assertEqual('\U00010427\U0001044F'.capitalize(), 826 '\U00010427\U0001044F') 827 self.assertEqual('\U0001044F\U00010427'.capitalize(), 828 '\U00010427\U0001044F') 829 self.assertEqual('X\U00010427x\U0001044F'.capitalize(), 830 'X\U0001044Fx\U0001044F') 831 self.assertEqual('h\u0130'.capitalize(), 'H\u0069\u0307') 832 exp = '\u0399\u0308\u0300\u0069\u0307' 833 self.assertEqual('\u1fd2\u0130'.capitalize(), exp) 834 self.assertEqual('finnish'.capitalize(), 'Finnish') 835 self.assertEqual('A\u0345\u03a3'.capitalize(), 'A\u0345\u03c2') 836 837 def test_title(self): 838 super().test_title() 839 self.assertEqual('\U0001044F'.title(), '\U00010427') 840 self.assertEqual('\U0001044F\U0001044F'.title(), 841 '\U00010427\U0001044F') 842 self.assertEqual('\U0001044F\U0001044F \U0001044F\U0001044F'.title(), 843 '\U00010427\U0001044F \U00010427\U0001044F') 844 self.assertEqual('\U00010427\U0001044F \U00010427\U0001044F'.title(), 845 '\U00010427\U0001044F \U00010427\U0001044F') 846 self.assertEqual('\U0001044F\U00010427 \U0001044F\U00010427'.title(), 847 '\U00010427\U0001044F \U00010427\U0001044F') 848 self.assertEqual('X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(), 849 'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F') 850 self.assertEqual('fiNNISH'.title(), 'Finnish') 851 self.assertEqual('A\u03a3 \u1fa1xy'.title(), 'A\u03c2 \u1fa9xy') 852 self.assertEqual('A\u03a3A'.title(), 'A\u03c3a') 853 854 def test_swapcase(self): 855 string_tests.CommonTest.test_swapcase(self) 856 
self.assertEqual('\U0001044F'.swapcase(), '\U00010427') 857 self.assertEqual('\U00010427'.swapcase(), '\U0001044F') 858 self.assertEqual('\U0001044F\U0001044F'.swapcase(), 859 '\U00010427\U00010427') 860 self.assertEqual('\U00010427\U0001044F'.swapcase(), 861 '\U0001044F\U00010427') 862 self.assertEqual('\U0001044F\U00010427'.swapcase(), 863 '\U00010427\U0001044F') 864 self.assertEqual('X\U00010427x\U0001044F'.swapcase(), 865 'x\U0001044FX\U00010427') 866 self.assertEqual('fi'.swapcase(), 'FI') 867 self.assertEqual('\u0130'.swapcase(), '\u0069\u0307') 868 # Special case for GREEK CAPITAL LETTER SIGMA U+03A3 869 self.assertEqual('\u03a3'.swapcase(), '\u03c3') 870 self.assertEqual('\u0345\u03a3'.swapcase(), '\u0399\u03c3') 871 self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2') 872 self.assertEqual('A\u0345\u03a3a'.swapcase(), 'a\u0399\u03c3A') 873 self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2') 874 self.assertEqual('A\u03a3\u0345'.swapcase(), 'a\u03c2\u0399') 875 self.assertEqual('\u03a3\u0345 '.swapcase(), '\u03c3\u0399 ') 876 self.assertEqual('\u03a3'.swapcase(), '\u03c3') 877 self.assertEqual('ß'.swapcase(), 'SS') 878 self.assertEqual('\u1fd2'.swapcase(), '\u0399\u0308\u0300') 879 880 def test_center(self): 881 string_tests.CommonTest.test_center(self) 882 self.assertEqual('x'.center(2, '\U0010FFFF'), 883 'x\U0010FFFF') 884 self.assertEqual('x'.center(3, '\U0010FFFF'), 885 '\U0010FFFFx\U0010FFFF') 886 self.assertEqual('x'.center(4, '\U0010FFFF'), 887 '\U0010FFFFx\U0010FFFF\U0010FFFF') 888 889 @unittest.skipUnless(sys.maxsize == 2**31 - 1, "requires 32-bit system") 890 @support.cpython_only 891 def test_case_operation_overflow(self): 892 # Issue #22643 893 size = 2**32//12 + 1 894 try: 895 s = "ü" * size 896 except MemoryError: 897 self.skipTest('no enough memory (%.0f MiB required)' % (size / 2**20)) 898 try: 899 self.assertRaises(OverflowError, s.upper) 900 finally: 901 del s 902 903 def test_contains(self): 904 # Testing Unicode contains 
method 905 self.assertIn('a', 'abdb') 906 self.assertIn('a', 'bdab') 907 self.assertIn('a', 'bdaba') 908 self.assertIn('a', 'bdba') 909 self.assertNotIn('a', 'bdb') 910 self.assertIn('a', 'bdba') 911 self.assertIn('a', ('a',1,None)) 912 self.assertIn('a', (1,None,'a')) 913 self.assertIn('a', ('a',1,None)) 914 self.assertIn('a', (1,None,'a')) 915 self.assertNotIn('a', ('x',1,'y')) 916 self.assertNotIn('a', ('x',1,None)) 917 self.assertNotIn('abcd', 'abcxxxx') 918 self.assertIn('ab', 'abcd') 919 self.assertIn('ab', 'abc') 920 self.assertIn('ab', (1,None,'ab')) 921 self.assertIn('', 'abc') 922 self.assertIn('', '') 923 self.assertIn('', 'abc') 924 self.assertNotIn('\0', 'abc') 925 self.assertIn('\0', '\0abc') 926 self.assertIn('\0', 'abc\0') 927 self.assertIn('a', '\0abc') 928 self.assertIn('asdf', 'asdf') 929 self.assertNotIn('asdf', 'asd') 930 self.assertNotIn('asdf', '') 931 932 self.assertRaises(TypeError, "abc".__contains__) 933 # test mixed kinds 934 for fill in ('a', '\u0100', '\U00010300'): 935 fill *= 9 936 for delim in ('c', '\u0102', '\U00010302'): 937 self.assertNotIn(delim, fill) 938 self.assertIn(delim, fill + delim) 939 self.assertNotIn(delim * 2, fill) 940 self.assertIn(delim * 2, fill + delim * 2) 941 942 def test_issue18183(self): 943 '\U00010000\U00100000'.lower() 944 '\U00010000\U00100000'.casefold() 945 '\U00010000\U00100000'.upper() 946 '\U00010000\U00100000'.capitalize() 947 '\U00010000\U00100000'.title() 948 '\U00010000\U00100000'.swapcase() 949 '\U00100000'.center(3, '\U00010000') 950 '\U00100000'.ljust(3, '\U00010000') 951 '\U00100000'.rjust(3, '\U00010000') 952 953 def test_format(self): 954 self.assertEqual(''.format(), '') 955 self.assertEqual('a'.format(), 'a') 956 self.assertEqual('ab'.format(), 'ab') 957 self.assertEqual('a{{'.format(), 'a{') 958 self.assertEqual('a}}'.format(), 'a}') 959 self.assertEqual('{{b'.format(), '{b') 960 self.assertEqual('}}b'.format(), '}b') 961 self.assertEqual('a{{b'.format(), 'a{b') 962 963 # examples from 
the PEP: 964 import datetime 965 self.assertEqual("My name is {0}".format('Fred'), "My name is Fred") 966 self.assertEqual("My name is {0[name]}".format(dict(name='Fred')), 967 "My name is Fred") 968 self.assertEqual("My name is {0} :-{{}}".format('Fred'), 969 "My name is Fred :-{}") 970 971 d = datetime.date(2007, 8, 18) 972 self.assertEqual("The year is {0.year}".format(d), 973 "The year is 2007") 974 975 # classes we'll use for testing 976 class C: 977 def __init__(self, x=100): 978 self._x = x 979 def __format__(self, spec): 980 return spec 981 982 class D: 983 def __init__(self, x): 984 self.x = x 985 def __format__(self, spec): 986 return str(self.x) 987 988 # class with __str__, but no __format__ 989 class E: 990 def __init__(self, x): 991 self.x = x 992 def __str__(self): 993 return 'E(' + self.x + ')' 994 995 # class with __repr__, but no __format__ or __str__ 996 class F: 997 def __init__(self, x): 998 self.x = x 999 def __repr__(self): 1000 return 'F(' + self.x + ')' 1001 1002 # class with __format__ that forwards to string, for some format_spec's 1003 class G: 1004 def __init__(self, x): 1005 self.x = x 1006 def __str__(self): 1007 return "string is " + self.x 1008 def __format__(self, format_spec): 1009 if format_spec == 'd': 1010 return 'G(' + self.x + ')' 1011 return object.__format__(self, format_spec) 1012 1013 class I(datetime.date): 1014 def __format__(self, format_spec): 1015 return self.strftime(format_spec) 1016 1017 class J(int): 1018 def __format__(self, format_spec): 1019 return int.__format__(self * 2, format_spec) 1020 1021 class M: 1022 def __init__(self, x): 1023 self.x = x 1024 def __repr__(self): 1025 return 'M(' + self.x + ')' 1026 __str__ = None 1027 1028 class N: 1029 def __init__(self, x): 1030 self.x = x 1031 def __repr__(self): 1032 return 'N(' + self.x + ')' 1033 __format__ = None 1034 1035 self.assertEqual(''.format(), '') 1036 self.assertEqual('abc'.format(), 'abc') 1037 self.assertEqual('{0}'.format('abc'), 'abc') 1038 
self.assertEqual('{0:}'.format('abc'), 'abc') 1039# self.assertEqual('{ 0 }'.format('abc'), 'abc') 1040 self.assertEqual('X{0}'.format('abc'), 'Xabc') 1041 self.assertEqual('{0}X'.format('abc'), 'abcX') 1042 self.assertEqual('X{0}Y'.format('abc'), 'XabcY') 1043 self.assertEqual('{1}'.format(1, 'abc'), 'abc') 1044 self.assertEqual('X{1}'.format(1, 'abc'), 'Xabc') 1045 self.assertEqual('{1}X'.format(1, 'abc'), 'abcX') 1046 self.assertEqual('X{1}Y'.format(1, 'abc'), 'XabcY') 1047 self.assertEqual('{0}'.format(-15), '-15') 1048 self.assertEqual('{0}{1}'.format(-15, 'abc'), '-15abc') 1049 self.assertEqual('{0}X{1}'.format(-15, 'abc'), '-15Xabc') 1050 self.assertEqual('{{'.format(), '{') 1051 self.assertEqual('}}'.format(), '}') 1052 self.assertEqual('{{}}'.format(), '{}') 1053 self.assertEqual('{{x}}'.format(), '{x}') 1054 self.assertEqual('{{{0}}}'.format(123), '{123}') 1055 self.assertEqual('{{{{0}}}}'.format(), '{{0}}') 1056 self.assertEqual('}}{{'.format(), '}{') 1057 self.assertEqual('}}x{{'.format(), '}x{') 1058 1059 # weird field names 1060 self.assertEqual("{0[foo-bar]}".format({'foo-bar':'baz'}), 'baz') 1061 self.assertEqual("{0[foo bar]}".format({'foo bar':'baz'}), 'baz') 1062 self.assertEqual("{0[ ]}".format({' ':3}), '3') 1063 1064 self.assertEqual('{foo._x}'.format(foo=C(20)), '20') 1065 self.assertEqual('{1}{0}'.format(D(10), D(20)), '2010') 1066 self.assertEqual('{0._x.x}'.format(C(D('abc'))), 'abc') 1067 self.assertEqual('{0[0]}'.format(['abc', 'def']), 'abc') 1068 self.assertEqual('{0[1]}'.format(['abc', 'def']), 'def') 1069 self.assertEqual('{0[1][0]}'.format(['abc', ['def']]), 'def') 1070 self.assertEqual('{0[1][0].x}'.format(['abc', [D('def')]]), 'def') 1071 1072 # strings 1073 self.assertEqual('{0:.3s}'.format('abc'), 'abc') 1074 self.assertEqual('{0:.3s}'.format('ab'), 'ab') 1075 self.assertEqual('{0:.3s}'.format('abcdef'), 'abc') 1076 self.assertEqual('{0:.0s}'.format('abcdef'), '') 1077 self.assertEqual('{0:3.3s}'.format('abc'), 'abc') 1078 
self.assertEqual('{0:2.3s}'.format('abc'), 'abc') 1079 self.assertEqual('{0:2.2s}'.format('abc'), 'ab') 1080 self.assertEqual('{0:3.2s}'.format('abc'), 'ab ') 1081 self.assertEqual('{0:x<0s}'.format('result'), 'result') 1082 self.assertEqual('{0:x<5s}'.format('result'), 'result') 1083 self.assertEqual('{0:x<6s}'.format('result'), 'result') 1084 self.assertEqual('{0:x<7s}'.format('result'), 'resultx') 1085 self.assertEqual('{0:x<8s}'.format('result'), 'resultxx') 1086 self.assertEqual('{0: <7s}'.format('result'), 'result ') 1087 self.assertEqual('{0:<7s}'.format('result'), 'result ') 1088 self.assertEqual('{0:>7s}'.format('result'), ' result') 1089 self.assertEqual('{0:>8s}'.format('result'), ' result') 1090 self.assertEqual('{0:^8s}'.format('result'), ' result ') 1091 self.assertEqual('{0:^9s}'.format('result'), ' result ') 1092 self.assertEqual('{0:^10s}'.format('result'), ' result ') 1093 self.assertEqual('{0:10000}'.format('a'), 'a' + ' ' * 9999) 1094 self.assertEqual('{0:10000}'.format(''), ' ' * 10000) 1095 self.assertEqual('{0:10000000}'.format(''), ' ' * 10000000) 1096 1097 # issue 12546: use \x00 as a fill character 1098 self.assertEqual('{0:\x00<6s}'.format('foo'), 'foo\x00\x00\x00') 1099 self.assertEqual('{0:\x01<6s}'.format('foo'), 'foo\x01\x01\x01') 1100 self.assertEqual('{0:\x00^6s}'.format('foo'), '\x00foo\x00\x00') 1101 self.assertEqual('{0:^6s}'.format('foo'), ' foo ') 1102 1103 self.assertEqual('{0:\x00<6}'.format(3), '3\x00\x00\x00\x00\x00') 1104 self.assertEqual('{0:\x01<6}'.format(3), '3\x01\x01\x01\x01\x01') 1105 self.assertEqual('{0:\x00^6}'.format(3), '\x00\x003\x00\x00\x00') 1106 self.assertEqual('{0:<6}'.format(3), '3 ') 1107 1108 self.assertEqual('{0:\x00<6}'.format(3.14), '3.14\x00\x00') 1109 self.assertEqual('{0:\x01<6}'.format(3.14), '3.14\x01\x01') 1110 self.assertEqual('{0:\x00^6}'.format(3.14), '\x003.14\x00') 1111 self.assertEqual('{0:^6}'.format(3.14), ' 3.14 ') 1112 1113 self.assertEqual('{0:\x00<12}'.format(3+2.0j), 
'(3+2j)\x00\x00\x00\x00\x00\x00') 1114 self.assertEqual('{0:\x01<12}'.format(3+2.0j), '(3+2j)\x01\x01\x01\x01\x01\x01') 1115 self.assertEqual('{0:\x00^12}'.format(3+2.0j), '\x00\x00\x00(3+2j)\x00\x00\x00') 1116 self.assertEqual('{0:^12}'.format(3+2.0j), ' (3+2j) ') 1117 1118 # format specifiers for user defined type 1119 self.assertEqual('{0:abc}'.format(C()), 'abc') 1120 1121 # !r, !s and !a coercions 1122 self.assertEqual('{0!s}'.format('Hello'), 'Hello') 1123 self.assertEqual('{0!s:}'.format('Hello'), 'Hello') 1124 self.assertEqual('{0!s:15}'.format('Hello'), 'Hello ') 1125 self.assertEqual('{0!s:15s}'.format('Hello'), 'Hello ') 1126 self.assertEqual('{0!r}'.format('Hello'), "'Hello'") 1127 self.assertEqual('{0!r:}'.format('Hello'), "'Hello'") 1128 self.assertEqual('{0!r}'.format(F('Hello')), 'F(Hello)') 1129 self.assertEqual('{0!r}'.format('\u0378'), "'\\u0378'") # nonprintable 1130 self.assertEqual('{0!r}'.format('\u0374'), "'\u0374'") # printable 1131 self.assertEqual('{0!r}'.format(F('\u0374')), 'F(\u0374)') 1132 self.assertEqual('{0!a}'.format('Hello'), "'Hello'") 1133 self.assertEqual('{0!a}'.format('\u0378'), "'\\u0378'") # nonprintable 1134 self.assertEqual('{0!a}'.format('\u0374'), "'\\u0374'") # printable 1135 self.assertEqual('{0!a:}'.format('Hello'), "'Hello'") 1136 self.assertEqual('{0!a}'.format(F('Hello')), 'F(Hello)') 1137 self.assertEqual('{0!a}'.format(F('\u0374')), 'F(\\u0374)') 1138 1139 # test fallback to object.__format__ 1140 self.assertEqual('{0}'.format({}), '{}') 1141 self.assertEqual('{0}'.format([]), '[]') 1142 self.assertEqual('{0}'.format([1]), '[1]') 1143 1144 self.assertEqual('{0:d}'.format(G('data')), 'G(data)') 1145 self.assertEqual('{0!s}'.format(G('data')), 'string is data') 1146 1147 self.assertRaises(TypeError, '{0:^10}'.format, E('data')) 1148 self.assertRaises(TypeError, '{0:^10s}'.format, E('data')) 1149 self.assertRaises(TypeError, '{0:>15s}'.format, G('data')) 1150 1151 self.assertEqual("{0:date: 
%Y-%m-%d}".format(I(year=2007, 1152 month=8, 1153 day=27)), 1154 "date: 2007-08-27") 1155 1156 # test deriving from a builtin type and overriding __format__ 1157 self.assertEqual("{0}".format(J(10)), "20") 1158 1159 1160 # string format specifiers 1161 self.assertEqual('{0:}'.format('a'), 'a') 1162 1163 # computed format specifiers 1164 self.assertEqual("{0:.{1}}".format('hello world', 5), 'hello') 1165 self.assertEqual("{0:.{1}s}".format('hello world', 5), 'hello') 1166 self.assertEqual("{0:.{precision}s}".format('hello world', precision=5), 'hello') 1167 self.assertEqual("{0:{width}.{precision}s}".format('hello world', width=10, precision=5), 'hello ') 1168 self.assertEqual("{0:{width}.{precision}s}".format('hello world', width='10', precision='5'), 'hello ') 1169 1170 # test various errors 1171 self.assertRaises(ValueError, '{'.format) 1172 self.assertRaises(ValueError, '}'.format) 1173 self.assertRaises(ValueError, 'a{'.format) 1174 self.assertRaises(ValueError, 'a}'.format) 1175 self.assertRaises(ValueError, '{a'.format) 1176 self.assertRaises(ValueError, '}a'.format) 1177 self.assertRaises(IndexError, '{0}'.format) 1178 self.assertRaises(IndexError, '{1}'.format, 'abc') 1179 self.assertRaises(KeyError, '{x}'.format) 1180 self.assertRaises(ValueError, "}{".format) 1181 self.assertRaises(ValueError, "abc{0:{}".format) 1182 self.assertRaises(ValueError, "{0".format) 1183 self.assertRaises(IndexError, "{0.}".format) 1184 self.assertRaises(ValueError, "{0.}".format, 0) 1185 self.assertRaises(ValueError, "{0[}".format) 1186 self.assertRaises(ValueError, "{0[}".format, []) 1187 self.assertRaises(KeyError, "{0]}".format) 1188 self.assertRaises(ValueError, "{0.[]}".format, 0) 1189 self.assertRaises(ValueError, "{0..foo}".format, 0) 1190 self.assertRaises(ValueError, "{0[0}".format, 0) 1191 self.assertRaises(ValueError, "{0[0:foo}".format, 0) 1192 self.assertRaises(KeyError, "{c]}".format) 1193 self.assertRaises(ValueError, "{{ {{{0}}".format, 0) 1194 
self.assertRaises(ValueError, "{0}}".format, 0) 1195 self.assertRaises(KeyError, "{foo}".format, bar=3) 1196 self.assertRaises(ValueError, "{0!x}".format, 3) 1197 self.assertRaises(ValueError, "{0!}".format, 0) 1198 self.assertRaises(ValueError, "{0!rs}".format, 0) 1199 self.assertRaises(ValueError, "{!}".format) 1200 self.assertRaises(IndexError, "{:}".format) 1201 self.assertRaises(IndexError, "{:s}".format) 1202 self.assertRaises(IndexError, "{}".format) 1203 big = "23098475029384702983476098230754973209482573" 1204 self.assertRaises(ValueError, ("{" + big + "}").format) 1205 self.assertRaises(ValueError, ("{[" + big + "]}").format, [0]) 1206 1207 # issue 6089 1208 self.assertRaises(ValueError, "{0[0]x}".format, [None]) 1209 self.assertRaises(ValueError, "{0[0](10)}".format, [None]) 1210 1211 # can't have a replacement on the field name portion 1212 self.assertRaises(TypeError, '{0[{1}]}'.format, 'abcdefg', 4) 1213 1214 # exceed maximum recursion depth 1215 self.assertRaises(ValueError, "{0:{1:{2}}}".format, 'abc', 's', '') 1216 self.assertRaises(ValueError, "{0:{1:{2:{3:{4:{5:{6}}}}}}}".format, 1217 0, 1, 2, 3, 4, 5, 6, 7) 1218 1219 # string format spec errors 1220 self.assertRaises(ValueError, "{0:-s}".format, '') 1221 self.assertRaises(ValueError, format, "", "-") 1222 self.assertRaises(ValueError, "{0:=s}".format, '') 1223 1224 # Alternate formatting is not supported 1225 self.assertRaises(ValueError, format, '', '#') 1226 self.assertRaises(ValueError, format, '', '#20') 1227 1228 # Non-ASCII 1229 self.assertEqual("{0:s}{1:s}".format("ABC", "\u0410\u0411\u0412"), 1230 'ABC\u0410\u0411\u0412') 1231 self.assertEqual("{0:.3s}".format("ABC\u0410\u0411\u0412"), 1232 'ABC') 1233 self.assertEqual("{0:.0s}".format("ABC\u0410\u0411\u0412"), 1234 '') 1235 1236 self.assertEqual("{[{}]}".format({"{}": 5}), "5") 1237 self.assertEqual("{[{}]}".format({"{}" : "a"}), "a") 1238 self.assertEqual("{[{]}".format({"{" : "a"}), "a") 1239 self.assertEqual("{[}]}".format({"}" : 
"a"}), "a") 1240 self.assertEqual("{[[]}".format({"[" : "a"}), "a") 1241 self.assertEqual("{[!]}".format({"!" : "a"}), "a") 1242 self.assertRaises(ValueError, "{a{}b}".format, 42) 1243 self.assertRaises(ValueError, "{a{b}".format, 42) 1244 self.assertRaises(ValueError, "{[}".format, 42) 1245 1246 self.assertEqual("0x{:0{:d}X}".format(0x0,16), "0x0000000000000000") 1247 1248 # Blocking fallback 1249 m = M('data') 1250 self.assertEqual("{!r}".format(m), 'M(data)') 1251 self.assertRaises(TypeError, "{!s}".format, m) 1252 self.assertRaises(TypeError, "{}".format, m) 1253 n = N('data') 1254 self.assertEqual("{!r}".format(n), 'N(data)') 1255 self.assertEqual("{!s}".format(n), 'N(data)') 1256 self.assertRaises(TypeError, "{}".format, n) 1257 1258 def test_format_map(self): 1259 self.assertEqual(''.format_map({}), '') 1260 self.assertEqual('a'.format_map({}), 'a') 1261 self.assertEqual('ab'.format_map({}), 'ab') 1262 self.assertEqual('a{{'.format_map({}), 'a{') 1263 self.assertEqual('a}}'.format_map({}), 'a}') 1264 self.assertEqual('{{b'.format_map({}), '{b') 1265 self.assertEqual('}}b'.format_map({}), '}b') 1266 self.assertEqual('a{{b'.format_map({}), 'a{b') 1267 1268 # using mappings 1269 class Mapping(dict): 1270 def __missing__(self, key): 1271 return key 1272 self.assertEqual('{hello}'.format_map(Mapping()), 'hello') 1273 self.assertEqual('{a} {world}'.format_map(Mapping(a='hello')), 'hello world') 1274 1275 class InternalMapping: 1276 def __init__(self): 1277 self.mapping = {'a': 'hello'} 1278 def __getitem__(self, key): 1279 return self.mapping[key] 1280 self.assertEqual('{a}'.format_map(InternalMapping()), 'hello') 1281 1282 1283 class C: 1284 def __init__(self, x=100): 1285 self._x = x 1286 def __format__(self, spec): 1287 return spec 1288 self.assertEqual('{foo._x}'.format_map({'foo': C(20)}), '20') 1289 1290 # test various errors 1291 self.assertRaises(TypeError, ''.format_map) 1292 self.assertRaises(TypeError, 'a'.format_map) 1293 1294 
self.assertRaises(ValueError, '{'.format_map, {}) 1295 self.assertRaises(ValueError, '}'.format_map, {}) 1296 self.assertRaises(ValueError, 'a{'.format_map, {}) 1297 self.assertRaises(ValueError, 'a}'.format_map, {}) 1298 self.assertRaises(ValueError, '{a'.format_map, {}) 1299 self.assertRaises(ValueError, '}a'.format_map, {}) 1300 1301 # issue #12579: can't supply positional params to format_map 1302 self.assertRaises(ValueError, '{}'.format_map, {'a' : 2}) 1303 self.assertRaises(ValueError, '{}'.format_map, 'a') 1304 self.assertRaises(ValueError, '{a} {}'.format_map, {"a" : 2, "b" : 1}) 1305 1306 class BadMapping: 1307 def __getitem__(self, key): 1308 return 1/0 1309 self.assertRaises(KeyError, '{a}'.format_map, {}) 1310 self.assertRaises(TypeError, '{a}'.format_map, []) 1311 self.assertRaises(ZeroDivisionError, '{a}'.format_map, BadMapping()) 1312 1313 def test_format_huge_precision(self): 1314 format_string = ".{}f".format(sys.maxsize + 1) 1315 with self.assertRaises(ValueError): 1316 result = format(2.34, format_string) 1317 1318 def test_format_huge_width(self): 1319 format_string = "{}f".format(sys.maxsize + 1) 1320 with self.assertRaises(ValueError): 1321 result = format(2.34, format_string) 1322 1323 def test_format_huge_item_number(self): 1324 format_string = "{{{}:.6f}}".format(sys.maxsize + 1) 1325 with self.assertRaises(ValueError): 1326 result = format_string.format(2.34) 1327 1328 def test_format_auto_numbering(self): 1329 class C: 1330 def __init__(self, x=100): 1331 self._x = x 1332 def __format__(self, spec): 1333 return spec 1334 1335 self.assertEqual('{}'.format(10), '10') 1336 self.assertEqual('{:5}'.format('s'), 's ') 1337 self.assertEqual('{!r}'.format('s'), "'s'") 1338 self.assertEqual('{._x}'.format(C(10)), '10') 1339 self.assertEqual('{[1]}'.format([1, 2]), '2') 1340 self.assertEqual('{[a]}'.format({'a':4, 'b':2}), '4') 1341 self.assertEqual('a{}b{}c'.format(0, 1), 'a0b1c') 1342 1343 self.assertEqual('a{:{}}b'.format('x', '^10'), 'a x b') 
1344 self.assertEqual('a{:{}x}b'.format(20, '#'), 'a0x14b') 1345 1346 # can't mix and match numbering and auto-numbering 1347 self.assertRaises(ValueError, '{}{1}'.format, 1, 2) 1348 self.assertRaises(ValueError, '{1}{}'.format, 1, 2) 1349 self.assertRaises(ValueError, '{:{1}}'.format, 1, 2) 1350 self.assertRaises(ValueError, '{0:{}}'.format, 1, 2) 1351 1352 # can mix and match auto-numbering and named 1353 self.assertEqual('{f}{}'.format(4, f='test'), 'test4') 1354 self.assertEqual('{}{f}'.format(4, f='test'), '4test') 1355 self.assertEqual('{:{f}}{g}{}'.format(1, 3, g='g', f=2), ' 1g3') 1356 self.assertEqual('{f:{}}{}{g}'.format(2, 4, f=1, g='g'), ' 14g') 1357 1358 def test_formatting(self): 1359 string_tests.MixinStrUnicodeUserStringTest.test_formatting(self) 1360 # Testing Unicode formatting strings... 1361 self.assertEqual("%s, %s" % ("abc", "abc"), 'abc, abc') 1362 self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, 2, 3), 'abc, abc, 1, 2.000000, 3.00') 1363 self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, -2, 3), 'abc, abc, 1, -2.000000, 3.00') 1364 self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.5), 'abc, abc, -1, -2.000000, 3.50') 1365 self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.57), 'abc, abc, -1, -2.000000, 3.57') 1366 self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 1003.57), 'abc, abc, -1, -2.000000, 1003.57') 1367 if not sys.platform.startswith('java'): 1368 self.assertEqual("%r, %r" % (b"abc", "abc"), "b'abc', 'abc'") 1369 self.assertEqual("%r" % ("\u1234",), "'\u1234'") 1370 self.assertEqual("%a" % ("\u1234",), "'\\u1234'") 1371 self.assertEqual("%(x)s, %(y)s" % {'x':"abc", 'y':"def"}, 'abc, def') 1372 self.assertEqual("%(x)s, %(\xfc)s" % {'x':"abc", '\xfc':"def"}, 'abc, def') 1373 1374 self.assertEqual('%c' % 0x1234, '\u1234') 1375 self.assertEqual('%c' % 0x21483, '\U00021483') 1376 self.assertRaises(OverflowError, "%c".__mod__, (0x110000,)) 1377 self.assertEqual('%c' 
% '\U00021483', '\U00021483') 1378 self.assertRaises(TypeError, "%c".__mod__, "aa") 1379 self.assertRaises(ValueError, "%.1\u1032f".__mod__, (1.0/3)) 1380 self.assertRaises(TypeError, "%i".__mod__, "aa") 1381 1382 # formatting jobs delegated from the string implementation: 1383 self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...') 1384 self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...') 1385 self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...') 1386 self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...') 1387 self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...') 1388 self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...') 1389 self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,"abc"), '...1...2...3...abc...') 1390 self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,"abc"), '...%...%s...1...2...3...abc...') 1391 self.assertEqual('...%s...' % "abc", '...abc...') 1392 self.assertEqual('%*s' % (5,'abc',), ' abc') 1393 self.assertEqual('%*s' % (-5,'abc',), 'abc ') 1394 self.assertEqual('%*.*s' % (5,2,'abc',), ' ab') 1395 self.assertEqual('%*.*s' % (5,3,'abc',), ' abc') 1396 self.assertEqual('%i %*.*s' % (10, 5,3,'abc',), '10 abc') 1397 self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, 'abc',), '103 abc') 1398 self.assertEqual('%c' % 'a', 'a') 1399 class Wrapper: 1400 def __str__(self): 1401 return '\u1234' 1402 self.assertEqual('%s' % Wrapper(), '\u1234') 1403 1404 # issue 3382 1405 NAN = float('nan') 1406 INF = float('inf') 1407 self.assertEqual('%f' % NAN, 'nan') 1408 self.assertEqual('%F' % NAN, 'NAN') 1409 self.assertEqual('%f' % INF, 'inf') 1410 self.assertEqual('%F' % INF, 'INF') 1411 1412 # PEP 393 1413 self.assertEqual('%.1s' % "a\xe9\u20ac", 'a') 1414 self.assertEqual('%.2s' % "a\xe9\u20ac", 'a\xe9') 1415 1416 #issue 19995 1417 class PseudoInt: 1418 def __init__(self, value): 1419 self.value = int(value) 1420 def __int__(self): 1421 return self.value 1422 def __index__(self): 1423 
return self.value 1424 class PseudoFloat: 1425 def __init__(self, value): 1426 self.value = float(value) 1427 def __int__(self): 1428 return int(self.value) 1429 pi = PseudoFloat(3.1415) 1430 letter_m = PseudoInt(109) 1431 self.assertEqual('%x' % 42, '2a') 1432 self.assertEqual('%X' % 15, 'F') 1433 self.assertEqual('%o' % 9, '11') 1434 self.assertEqual('%c' % 109, 'm') 1435 self.assertEqual('%x' % letter_m, '6d') 1436 self.assertEqual('%X' % letter_m, '6D') 1437 self.assertEqual('%o' % letter_m, '155') 1438 self.assertEqual('%c' % letter_m, 'm') 1439 self.assertRaisesRegex(TypeError, '%x format: an integer is required, not float', operator.mod, '%x', 3.14), 1440 self.assertRaisesRegex(TypeError, '%X format: an integer is required, not float', operator.mod, '%X', 2.11), 1441 self.assertRaisesRegex(TypeError, '%o format: an integer is required, not float', operator.mod, '%o', 1.79), 1442 self.assertRaisesRegex(TypeError, '%x format: an integer is required, not PseudoFloat', operator.mod, '%x', pi), 1443 self.assertRaises(TypeError, operator.mod, '%c', pi), 1444 1445 def test_formatting_with_enum(self): 1446 # issue18780 1447 import enum 1448 class Float(float, enum.Enum): 1449 PI = 3.1415926 1450 class Int(enum.IntEnum): 1451 IDES = 15 1452 class Str(str, enum.Enum): 1453 ABC = 'abc' 1454 # Testing Unicode formatting strings... 1455 self.assertEqual("%s, %s" % (Str.ABC, Str.ABC), 1456 'Str.ABC, Str.ABC') 1457 self.assertEqual("%s, %s, %d, %i, %u, %f, %5.2f" % 1458 (Str.ABC, Str.ABC, 1459 Int.IDES, Int.IDES, Int.IDES, 1460 Float.PI, Float.PI), 1461 'Str.ABC, Str.ABC, 15, 15, 15, 3.141593, 3.14') 1462 1463 # formatting jobs delegated from the string implementation: 1464 self.assertEqual('...%(foo)s...' % {'foo':Str.ABC}, 1465 '...Str.ABC...') 1466 self.assertEqual('...%(foo)s...' % {'foo':Int.IDES}, 1467 '...Int.IDES...') 1468 self.assertEqual('...%(foo)i...' % {'foo':Int.IDES}, 1469 '...15...') 1470 self.assertEqual('...%(foo)d...' 
% {'foo':Int.IDES}, 1471 '...15...') 1472 self.assertEqual('...%(foo)u...' % {'foo':Int.IDES, 'def':Float.PI}, 1473 '...15...') 1474 self.assertEqual('...%(foo)f...' % {'foo':Float.PI,'def':123}, 1475 '...3.141593...') 1476 1477 def test_formatting_huge_precision(self): 1478 format_string = "%.{}f".format(sys.maxsize + 1) 1479 with self.assertRaises(ValueError): 1480 result = format_string % 2.34 1481 1482 def test_issue28598_strsubclass_rhs(self): 1483 # A subclass of str with an __rmod__ method should be able to hook 1484 # into the % operator 1485 class SubclassedStr(str): 1486 def __rmod__(self, other): 1487 return 'Success, self.__rmod__({!r}) was called'.format(other) 1488 self.assertEqual('lhs %% %r' % SubclassedStr('rhs'), 1489 "Success, self.__rmod__('lhs %% %r') was called") 1490 1491 @support.cpython_only 1492 def test_formatting_huge_precision_c_limits(self): 1493 from _testcapi import INT_MAX 1494 format_string = "%.{}f".format(INT_MAX + 1) 1495 with self.assertRaises(ValueError): 1496 result = format_string % 2.34 1497 1498 def test_formatting_huge_width(self): 1499 format_string = "%{}f".format(sys.maxsize + 1) 1500 with self.assertRaises(ValueError): 1501 result = format_string % 2.34 1502 1503 def test_startswith_endswith_errors(self): 1504 for meth in ('foo'.startswith, 'foo'.endswith): 1505 with self.assertRaises(TypeError) as cm: 1506 meth(['f']) 1507 exc = str(cm.exception) 1508 self.assertIn('str', exc) 1509 self.assertIn('tuple', exc) 1510 1511 @support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR') 1512 def test_format_float(self): 1513 # should not format with a comma, but always with C locale 1514 self.assertEqual('1.0', '%.1f' % 1.0) 1515 1516 def test_constructor(self): 1517 # unicode(obj) tests (this maps to PyObject_Unicode() at C level) 1518 1519 self.assertEqual( 1520 str('unicode remains unicode'), 1521 'unicode remains unicode' 1522 ) 1523 1524 for text in ('ascii', '\xe9', '\u20ac', '\U0010FFFF'): 1525 subclass = StrSubclass(text) 
1526 self.assertEqual(str(subclass), text) 1527 self.assertEqual(len(subclass), len(text)) 1528 if text == 'ascii': 1529 self.assertEqual(subclass.encode('ascii'), b'ascii') 1530 self.assertEqual(subclass.encode('utf-8'), b'ascii') 1531 1532 self.assertEqual( 1533 str('strings are converted to unicode'), 1534 'strings are converted to unicode' 1535 ) 1536 1537 class StringCompat: 1538 def __init__(self, x): 1539 self.x = x 1540 def __str__(self): 1541 return self.x 1542 1543 self.assertEqual( 1544 str(StringCompat('__str__ compatible objects are recognized')), 1545 '__str__ compatible objects are recognized' 1546 ) 1547 1548 # unicode(obj) is compatible to str(): 1549 1550 o = StringCompat('unicode(obj) is compatible to str()') 1551 self.assertEqual(str(o), 'unicode(obj) is compatible to str()') 1552 self.assertEqual(str(o), 'unicode(obj) is compatible to str()') 1553 1554 for obj in (123, 123.45, 123): 1555 self.assertEqual(str(obj), str(str(obj))) 1556 1557 # unicode(obj, encoding, error) tests (this maps to 1558 # PyUnicode_FromEncodedObject() at C level) 1559 1560 if not sys.platform.startswith('java'): 1561 self.assertRaises( 1562 TypeError, 1563 str, 1564 'decoding unicode is not supported', 1565 'utf-8', 1566 'strict' 1567 ) 1568 1569 self.assertEqual( 1570 str(b'strings are decoded to unicode', 'utf-8', 'strict'), 1571 'strings are decoded to unicode' 1572 ) 1573 1574 if not sys.platform.startswith('java'): 1575 self.assertEqual( 1576 str( 1577 memoryview(b'character buffers are decoded to unicode'), 1578 'utf-8', 1579 'strict' 1580 ), 1581 'character buffers are decoded to unicode' 1582 ) 1583 1584 self.assertRaises(TypeError, str, 42, 42, 42) 1585 1586 def test_constructor_keyword_args(self): 1587 """Pass various keyword argument combinations to the constructor.""" 1588 # The object argument can be passed as a keyword. 
1589 self.assertEqual(str(object='foo'), 'foo') 1590 self.assertEqual(str(object=b'foo', encoding='utf-8'), 'foo') 1591 # The errors argument without encoding triggers "decode" mode. 1592 self.assertEqual(str(b'foo', errors='strict'), 'foo') # not "b'foo'" 1593 self.assertEqual(str(object=b'foo', errors='strict'), 'foo') 1594 1595 def test_constructor_defaults(self): 1596 """Check the constructor argument defaults.""" 1597 # The object argument defaults to '' or b''. 1598 self.assertEqual(str(), '') 1599 self.assertEqual(str(errors='strict'), '') 1600 utf8_cent = '¢'.encode('utf-8') 1601 # The encoding argument defaults to utf-8. 1602 self.assertEqual(str(utf8_cent, errors='strict'), '¢') 1603 # The errors argument defaults to strict. 1604 self.assertRaises(UnicodeDecodeError, str, utf8_cent, encoding='ascii') 1605 1606 def test_codecs_utf7(self): 1607 utfTests = [ 1608 ('A\u2262\u0391.', b'A+ImIDkQ.'), # RFC2152 example 1609 ('Hi Mom -\u263a-!', b'Hi Mom -+Jjo--!'), # RFC2152 example 1610 ('\u65E5\u672C\u8A9E', b'+ZeVnLIqe-'), # RFC2152 example 1611 ('Item 3 is \u00a31.', b'Item 3 is +AKM-1.'), # RFC2152 example 1612 ('+', b'+-'), 1613 ('+-', b'+--'), 1614 ('+?', b'+-?'), 1615 (r'\?', b'+AFw?'), 1616 ('+?', b'+-?'), 1617 (r'\\?', b'+AFwAXA?'), 1618 (r'\\\?', b'+AFwAXABc?'), 1619 (r'++--', b'+-+---'), 1620 ('\U000abcde', b'+2m/c3g-'), # surrogate pairs 1621 ('/', b'/'), 1622 ] 1623 1624 for (x, y) in utfTests: 1625 self.assertEqual(x.encode('utf-7'), y) 1626 1627 # Unpaired surrogates are passed through 1628 self.assertEqual('\uD801'.encode('utf-7'), b'+2AE-') 1629 self.assertEqual('\uD801x'.encode('utf-7'), b'+2AE-x') 1630 self.assertEqual('\uDC01'.encode('utf-7'), b'+3AE-') 1631 self.assertEqual('\uDC01x'.encode('utf-7'), b'+3AE-x') 1632 self.assertEqual(b'+2AE-'.decode('utf-7'), '\uD801') 1633 self.assertEqual(b'+2AE-x'.decode('utf-7'), '\uD801x') 1634 self.assertEqual(b'+3AE-'.decode('utf-7'), '\uDC01') 1635 self.assertEqual(b'+3AE-x'.decode('utf-7'), 
'\uDC01x') 1636 1637 self.assertEqual('\uD801\U000abcde'.encode('utf-7'), b'+2AHab9ze-') 1638 self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde') 1639 1640 # Issue #2242: crash on some Windows/MSVC versions 1641 self.assertEqual(b'+\xc1'.decode('utf-7', 'ignore'), '') 1642 1643 # Direct encoded characters 1644 set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?" 1645 # Optional direct characters 1646 set_o = '!"#$%&*;<=>@[]^_`{|}' 1647 for c in set_d: 1648 self.assertEqual(c.encode('utf7'), c.encode('ascii')) 1649 self.assertEqual(c.encode('ascii').decode('utf7'), c) 1650 for c in set_o: 1651 self.assertEqual(c.encode('ascii').decode('utf7'), c) 1652 1653 with self.assertRaisesRegex(UnicodeDecodeError, 1654 'ill-formed sequence'): 1655 b'+@'.decode('utf-7') 1656 1657 def test_codecs_utf8(self): 1658 self.assertEqual(''.encode('utf-8'), b'') 1659 self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac') 1660 self.assertEqual('\U00010002'.encode('utf-8'), b'\xf0\x90\x80\x82') 1661 self.assertEqual('\U00023456'.encode('utf-8'), b'\xf0\xa3\x91\x96') 1662 self.assertEqual('\ud800'.encode('utf-8', 'surrogatepass'), b'\xed\xa0\x80') 1663 self.assertEqual('\udc00'.encode('utf-8', 'surrogatepass'), b'\xed\xb0\x80') 1664 self.assertEqual(('\U00010002'*10).encode('utf-8'), 1665 b'\xf0\x90\x80\x82'*10) 1666 self.assertEqual( 1667 '\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f' 1668 '\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00' 1669 '\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c' 1670 '\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067' 1671 '\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das' 1672 ' Nunstuck git und'.encode('utf-8'), 1673 b'\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81' 1674 b'\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3' 1675 b'\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe' 1676 b'\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83' 1677 
b'\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8' 1678 b'\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81' 1679 b'\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81' 1680 b'\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3' 1681 b'\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf' 1682 b'\xe3\x80\x8cWenn ist das Nunstuck git und' 1683 ) 1684 1685 # UTF-8 specific decoding tests 1686 self.assertEqual(str(b'\xf0\xa3\x91\x96', 'utf-8'), '\U00023456' ) 1687 self.assertEqual(str(b'\xf0\x90\x80\x82', 'utf-8'), '\U00010002' ) 1688 self.assertEqual(str(b'\xe2\x82\xac', 'utf-8'), '\u20ac' ) 1689 1690 # Other possible utf-8 test cases: 1691 # * strict decoding testing for all of the 1692 # UTF8_ERROR cases in PyUnicode_DecodeUTF8 1693 1694 def test_utf8_decode_valid_sequences(self): 1695 sequences = [ 1696 # single byte 1697 (b'\x00', '\x00'), (b'a', 'a'), (b'\x7f', '\x7f'), 1698 # 2 bytes 1699 (b'\xc2\x80', '\x80'), (b'\xdf\xbf', '\u07ff'), 1700 # 3 bytes 1701 (b'\xe0\xa0\x80', '\u0800'), (b'\xed\x9f\xbf', '\ud7ff'), 1702 (b'\xee\x80\x80', '\uE000'), (b'\xef\xbf\xbf', '\uffff'), 1703 # 4 bytes 1704 (b'\xF0\x90\x80\x80', '\U00010000'), 1705 (b'\xf4\x8f\xbf\xbf', '\U0010FFFF') 1706 ] 1707 for seq, res in sequences: 1708 self.assertEqual(seq.decode('utf-8'), res) 1709 1710 1711 def test_utf8_decode_invalid_sequences(self): 1712 # continuation bytes in a sequence of 2, 3, or 4 bytes 1713 continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)] 1714 # start bytes of a 2-byte sequence equivalent to code points < 0x7F 1715 invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)] 1716 # start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF 1717 invalid_4B_seq_start_bytes = [bytes([x]) for x in range(0xF5, 0xF8)] 1718 invalid_start_bytes = ( 1719 continuation_bytes + invalid_2B_seq_start_bytes + 1720 invalid_4B_seq_start_bytes + [bytes([x]) for x in range(0xF7, 0x100)] 1721 ) 1722 1723 for byte in 
invalid_start_bytes: 1724 self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8') 1725 1726 for sb in invalid_2B_seq_start_bytes: 1727 for cb in continuation_bytes: 1728 self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8') 1729 1730 for sb in invalid_4B_seq_start_bytes: 1731 for cb1 in continuation_bytes[:3]: 1732 for cb3 in continuation_bytes[:3]: 1733 self.assertRaises(UnicodeDecodeError, 1734 (sb+cb1+b'\x80'+cb3).decode, 'utf-8') 1735 1736 for cb in [bytes([x]) for x in range(0x80, 0xA0)]: 1737 self.assertRaises(UnicodeDecodeError, 1738 (b'\xE0'+cb+b'\x80').decode, 'utf-8') 1739 self.assertRaises(UnicodeDecodeError, 1740 (b'\xE0'+cb+b'\xBF').decode, 'utf-8') 1741 # surrogates 1742 for cb in [bytes([x]) for x in range(0xA0, 0xC0)]: 1743 self.assertRaises(UnicodeDecodeError, 1744 (b'\xED'+cb+b'\x80').decode, 'utf-8') 1745 self.assertRaises(UnicodeDecodeError, 1746 (b'\xED'+cb+b'\xBF').decode, 'utf-8') 1747 for cb in [bytes([x]) for x in range(0x80, 0x90)]: 1748 self.assertRaises(UnicodeDecodeError, 1749 (b'\xF0'+cb+b'\x80\x80').decode, 'utf-8') 1750 self.assertRaises(UnicodeDecodeError, 1751 (b'\xF0'+cb+b'\xBF\xBF').decode, 'utf-8') 1752 for cb in [bytes([x]) for x in range(0x90, 0xC0)]: 1753 self.assertRaises(UnicodeDecodeError, 1754 (b'\xF4'+cb+b'\x80\x80').decode, 'utf-8') 1755 self.assertRaises(UnicodeDecodeError, 1756 (b'\xF4'+cb+b'\xBF\xBF').decode, 'utf-8') 1757 1758 def test_issue8271(self): 1759 # Issue #8271: during the decoding of an invalid UTF-8 byte sequence, 1760 # only the start byte and the continuation byte(s) are now considered 1761 # invalid, instead of the number of bytes specified by the start byte. 1762 # See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95, 1763 # table 3-8, Row 2) for more information about the algorithm used. 
1764 FFFD = '\ufffd' 1765 sequences = [ 1766 # invalid start bytes 1767 (b'\x80', FFFD), # continuation byte 1768 (b'\x80\x80', FFFD*2), # 2 continuation bytes 1769 (b'\xc0', FFFD), 1770 (b'\xc0\xc0', FFFD*2), 1771 (b'\xc1', FFFD), 1772 (b'\xc1\xc0', FFFD*2), 1773 (b'\xc0\xc1', FFFD*2), 1774 # with start byte of a 2-byte sequence 1775 (b'\xc2', FFFD), # only the start byte 1776 (b'\xc2\xc2', FFFD*2), # 2 start bytes 1777 (b'\xc2\xc2\xc2', FFFD*3), # 3 start bytes 1778 (b'\xc2\x41', FFFD+'A'), # invalid continuation byte 1779 # with start byte of a 3-byte sequence 1780 (b'\xe1', FFFD), # only the start byte 1781 (b'\xe1\xe1', FFFD*2), # 2 start bytes 1782 (b'\xe1\xe1\xe1', FFFD*3), # 3 start bytes 1783 (b'\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes 1784 (b'\xe1\x80', FFFD), # only 1 continuation byte 1785 (b'\xe1\x41', FFFD+'A'), # invalid continuation byte 1786 (b'\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb 1787 (b'\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes 1788 (b'\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte 1789 (b'\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid 1790 (b'\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid 1791 # with start byte of a 4-byte sequence 1792 (b'\xf1', FFFD), # only the start byte 1793 (b'\xf1\xf1', FFFD*2), # 2 start bytes 1794 (b'\xf1\xf1\xf1', FFFD*3), # 3 start bytes 1795 (b'\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes 1796 (b'\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes 1797 (b'\xf1\x80', FFFD), # only 1 continuation bytes 1798 (b'\xf1\x80\x80', FFFD), # only 2 continuation bytes 1799 (b'\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid 1800 (b'\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 1 invalid 1801 (b'\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid 1802 (b'\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cv and 1 valid 1803 (b'\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 invalid 1804 (b'\xf1\x41\x80\x41', 
FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 invalid 1805 (b'\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 1 valid cb and 1 invalid 1806 (b'\xf1\x41\xf1\x80', FFFD+'A'+FFFD), 1807 (b'\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2), 1808 (b'\xf1\xf1\x80\x41', FFFD*2+'A'), 1809 (b'\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2), 1810 # with invalid start byte of a 4-byte sequence (rfc2279) 1811 (b'\xf5', FFFD), # only the start byte 1812 (b'\xf5\xf5', FFFD*2), # 2 start bytes 1813 (b'\xf5\x80', FFFD*2), # only 1 continuation byte 1814 (b'\xf5\x80\x80', FFFD*3), # only 2 continuation byte 1815 (b'\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes 1816 (b'\xf5\x80\x41', FFFD*2+'A'), # 1 valid cb and 1 invalid 1817 (b'\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD), 1818 (b'\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'), 1819 # with invalid start byte of a 5-byte sequence (rfc2279) 1820 (b'\xf8', FFFD), # only the start byte 1821 (b'\xf8\xf8', FFFD*2), # 2 start bytes 1822 (b'\xf8\x80', FFFD*2), # only one continuation byte 1823 (b'\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid 1824 (b'\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes 1825 # with invalid start byte of a 6-byte sequence (rfc2279) 1826 (b'\xfc', FFFD), # only the start byte 1827 (b'\xfc\xfc', FFFD*2), # 2 start bytes 1828 (b'\xfc\x80\x80', FFFD*3), # only 2 continuation bytes 1829 (b'\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes 1830 # invalid start byte 1831 (b'\xfe', FFFD), 1832 (b'\xfe\x80\x80', FFFD*3), 1833 # other sequences 1834 (b'\xf1\x80\x41\x42\x43', '\ufffd\x41\x42\x43'), 1835 (b'\xf1\x80\xff\x42\x43', '\ufffd\ufffd\x42\x43'), 1836 (b'\xf1\x80\xc2\x81\x43', '\ufffd\x81\x43'), 1837 (b'\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64', 1838 '\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'), 1839 ] 1840 for n, (seq, res) in enumerate(sequences): 1841 self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict') 1842 self.assertEqual(seq.decode('utf-8', 'replace'), res) 1843 
self.assertEqual((seq+b'b').decode('utf-8', 'replace'), res+'b') 1844 self.assertEqual(seq.decode('utf-8', 'ignore'), 1845 res.replace('\uFFFD', '')) 1846 1847 def assertCorrectUTF8Decoding(self, seq, res, err): 1848 """ 1849 Check that an invalid UTF-8 sequence raises a UnicodeDecodeError when 1850 'strict' is used, returns res when 'replace' is used, and that doesn't 1851 return anything when 'ignore' is used. 1852 """ 1853 with self.assertRaises(UnicodeDecodeError) as cm: 1854 seq.decode('utf-8') 1855 exc = cm.exception 1856 1857 self.assertIn(err, str(exc)) 1858 self.assertEqual(seq.decode('utf-8', 'replace'), res) 1859 self.assertEqual((b'aaaa' + seq + b'bbbb').decode('utf-8', 'replace'), 1860 'aaaa' + res + 'bbbb') 1861 res = res.replace('\ufffd', '') 1862 self.assertEqual(seq.decode('utf-8', 'ignore'), res) 1863 self.assertEqual((b'aaaa' + seq + b'bbbb').decode('utf-8', 'ignore'), 1864 'aaaa' + res + 'bbbb') 1865 1866 def test_invalid_start_byte(self): 1867 """ 1868 Test that an 'invalid start byte' error is raised when the first byte 1869 is not in the ASCII range or is not a valid start byte of a 2-, 3-, or 1870 4-bytes sequence. The invalid start byte is replaced with a single 1871 U+FFFD when errors='replace'. 1872 E.g. <80> is a continuation byte and can appear only after a start byte. 1873 """ 1874 FFFD = '\ufffd' 1875 for byte in b'\x80\xA0\x9F\xBF\xC0\xC1\xF5\xFF': 1876 self.assertCorrectUTF8Decoding(bytes([byte]), '\ufffd', 1877 'invalid start byte') 1878 1879 def test_unexpected_end_of_data(self): 1880 """ 1881 Test that an 'unexpected end of data' error is raised when the string 1882 ends after a start byte of a 2-, 3-, or 4-bytes sequence without having 1883 enough continuation bytes. The incomplete sequence is replaced with a 1884 single U+FFFD when errors='replace'. 1885 E.g. 
in the sequence <F3 80 80>, F3 is the start byte of a 4-bytes 1886 sequence, but it's followed by only 2 valid continuation bytes and the 1887 last continuation bytes is missing. 1888 Note: the continuation bytes must be all valid, if one of them is 1889 invalid another error will be raised. 1890 """ 1891 sequences = [ 1892 'C2', 'DF', 1893 'E0 A0', 'E0 BF', 'E1 80', 'E1 BF', 'EC 80', 'EC BF', 1894 'ED 80', 'ED 9F', 'EE 80', 'EE BF', 'EF 80', 'EF BF', 1895 'F0 90', 'F0 BF', 'F0 90 80', 'F0 90 BF', 'F0 BF 80', 'F0 BF BF', 1896 'F1 80', 'F1 BF', 'F1 80 80', 'F1 80 BF', 'F1 BF 80', 'F1 BF BF', 1897 'F3 80', 'F3 BF', 'F3 80 80', 'F3 80 BF', 'F3 BF 80', 'F3 BF BF', 1898 'F4 80', 'F4 8F', 'F4 80 80', 'F4 80 BF', 'F4 8F 80', 'F4 8F BF' 1899 ] 1900 FFFD = '\ufffd' 1901 for seq in sequences: 1902 self.assertCorrectUTF8Decoding(bytes.fromhex(seq), '\ufffd', 1903 'unexpected end of data') 1904 1905 def test_invalid_cb_for_2bytes_seq(self): 1906 """ 1907 Test that an 'invalid continuation byte' error is raised when the 1908 continuation byte of a 2-bytes sequence is invalid. The start byte 1909 is replaced by a single U+FFFD and the second byte is handled 1910 separately when errors='replace'. 1911 E.g. in the sequence <C2 41>, C2 is the start byte of a 2-bytes 1912 sequence, but 41 is not a valid continuation byte because it's the 1913 ASCII letter 'A'. 1914 """ 1915 FFFD = '\ufffd' 1916 FFFDx2 = FFFD * 2 1917 sequences = [ 1918 ('C2 00', FFFD+'\x00'), ('C2 7F', FFFD+'\x7f'), 1919 ('C2 C0', FFFDx2), ('C2 FF', FFFDx2), 1920 ('DF 00', FFFD+'\x00'), ('DF 7F', FFFD+'\x7f'), 1921 ('DF C0', FFFDx2), ('DF FF', FFFDx2), 1922 ] 1923 for seq, res in sequences: 1924 self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res, 1925 'invalid continuation byte') 1926 1927 def test_invalid_cb_for_3bytes_seq(self): 1928 """ 1929 Test that an 'invalid continuation byte' error is raised when the 1930 continuation byte(s) of a 3-bytes sequence are invalid. 
When 1931 errors='replace', if the first continuation byte is valid, the first 1932 two bytes (start byte + 1st cb) are replaced by a single U+FFFD and the 1933 third byte is handled separately, otherwise only the start byte is 1934 replaced with a U+FFFD and the other continuation bytes are handled 1935 separately. 1936 E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes 1937 sequence, 80 is a valid continuation byte, but 41 is not a valid cb 1938 because it's the ASCII letter 'A'. 1939 Note: when the start byte is E0 or ED, the valid ranges for the first 1940 continuation byte are limited to A0..BF and 80..9F respectively. 1941 Python 2 used to consider all the bytes in range 80..BF valid when the 1942 start byte was ED. This is fixed in Python 3. 1943 """ 1944 FFFD = '\ufffd' 1945 FFFDx2 = FFFD * 2 1946 sequences = [ 1947 ('E0 00', FFFD+'\x00'), ('E0 7F', FFFD+'\x7f'), ('E0 80', FFFDx2), 1948 ('E0 9F', FFFDx2), ('E0 C0', FFFDx2), ('E0 FF', FFFDx2), 1949 ('E0 A0 00', FFFD+'\x00'), ('E0 A0 7F', FFFD+'\x7f'), 1950 ('E0 A0 C0', FFFDx2), ('E0 A0 FF', FFFDx2), 1951 ('E0 BF 00', FFFD+'\x00'), ('E0 BF 7F', FFFD+'\x7f'), 1952 ('E0 BF C0', FFFDx2), ('E0 BF FF', FFFDx2), ('E1 00', FFFD+'\x00'), 1953 ('E1 7F', FFFD+'\x7f'), ('E1 C0', FFFDx2), ('E1 FF', FFFDx2), 1954 ('E1 80 00', FFFD+'\x00'), ('E1 80 7F', FFFD+'\x7f'), 1955 ('E1 80 C0', FFFDx2), ('E1 80 FF', FFFDx2), 1956 ('E1 BF 00', FFFD+'\x00'), ('E1 BF 7F', FFFD+'\x7f'), 1957 ('E1 BF C0', FFFDx2), ('E1 BF FF', FFFDx2), ('EC 00', FFFD+'\x00'), 1958 ('EC 7F', FFFD+'\x7f'), ('EC C0', FFFDx2), ('EC FF', FFFDx2), 1959 ('EC 80 00', FFFD+'\x00'), ('EC 80 7F', FFFD+'\x7f'), 1960 ('EC 80 C0', FFFDx2), ('EC 80 FF', FFFDx2), 1961 ('EC BF 00', FFFD+'\x00'), ('EC BF 7F', FFFD+'\x7f'), 1962 ('EC BF C0', FFFDx2), ('EC BF FF', FFFDx2), ('ED 00', FFFD+'\x00'), 1963 ('ED 7F', FFFD+'\x7f'), 1964 ('ED A0', FFFDx2), ('ED BF', FFFDx2), # see note ^ 1965 ('ED C0', FFFDx2), ('ED FF', FFFDx2), ('ED 80 00', FFFD+'\x00'), 1966 ('ED 
80 7F', FFFD+'\x7f'), ('ED 80 C0', FFFDx2), 1967 ('ED 80 FF', FFFDx2), ('ED 9F 00', FFFD+'\x00'), 1968 ('ED 9F 7F', FFFD+'\x7f'), ('ED 9F C0', FFFDx2), 1969 ('ED 9F FF', FFFDx2), ('EE 00', FFFD+'\x00'), 1970 ('EE 7F', FFFD+'\x7f'), ('EE C0', FFFDx2), ('EE FF', FFFDx2), 1971 ('EE 80 00', FFFD+'\x00'), ('EE 80 7F', FFFD+'\x7f'), 1972 ('EE 80 C0', FFFDx2), ('EE 80 FF', FFFDx2), 1973 ('EE BF 00', FFFD+'\x00'), ('EE BF 7F', FFFD+'\x7f'), 1974 ('EE BF C0', FFFDx2), ('EE BF FF', FFFDx2), ('EF 00', FFFD+'\x00'), 1975 ('EF 7F', FFFD+'\x7f'), ('EF C0', FFFDx2), ('EF FF', FFFDx2), 1976 ('EF 80 00', FFFD+'\x00'), ('EF 80 7F', FFFD+'\x7f'), 1977 ('EF 80 C0', FFFDx2), ('EF 80 FF', FFFDx2), 1978 ('EF BF 00', FFFD+'\x00'), ('EF BF 7F', FFFD+'\x7f'), 1979 ('EF BF C0', FFFDx2), ('EF BF FF', FFFDx2), 1980 ] 1981 for seq, res in sequences: 1982 self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res, 1983 'invalid continuation byte') 1984 1985 def test_invalid_cb_for_4bytes_seq(self): 1986 """ 1987 Test that an 'invalid continuation byte' error is raised when the 1988 continuation byte(s) of a 4-bytes sequence are invalid. When 1989 errors='replace',the start byte and all the following valid 1990 continuation bytes are replaced with a single U+FFFD, and all the bytes 1991 starting from the first invalid continuation bytes (included) are 1992 handled separately. 1993 E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes 1994 sequence, 80 is a valid continuation byte, but 41 is not a valid cb 1995 because it's the ASCII letter 'A'. 1996 Note: when the start byte is E0 or ED, the valid ranges for the first 1997 continuation byte are limited to A0..BF and 80..9F respectively. 1998 However, when the start byte is ED, Python 2 considers all the bytes 1999 in range 80..BF valid. This is fixed in Python 3. 
2000 """ 2001 FFFD = '\ufffd' 2002 FFFDx2 = FFFD * 2 2003 sequences = [ 2004 ('F0 00', FFFD+'\x00'), ('F0 7F', FFFD+'\x7f'), ('F0 80', FFFDx2), 2005 ('F0 8F', FFFDx2), ('F0 C0', FFFDx2), ('F0 FF', FFFDx2), 2006 ('F0 90 00', FFFD+'\x00'), ('F0 90 7F', FFFD+'\x7f'), 2007 ('F0 90 C0', FFFDx2), ('F0 90 FF', FFFDx2), 2008 ('F0 BF 00', FFFD+'\x00'), ('F0 BF 7F', FFFD+'\x7f'), 2009 ('F0 BF C0', FFFDx2), ('F0 BF FF', FFFDx2), 2010 ('F0 90 80 00', FFFD+'\x00'), ('F0 90 80 7F', FFFD+'\x7f'), 2011 ('F0 90 80 C0', FFFDx2), ('F0 90 80 FF', FFFDx2), 2012 ('F0 90 BF 00', FFFD+'\x00'), ('F0 90 BF 7F', FFFD+'\x7f'), 2013 ('F0 90 BF C0', FFFDx2), ('F0 90 BF FF', FFFDx2), 2014 ('F0 BF 80 00', FFFD+'\x00'), ('F0 BF 80 7F', FFFD+'\x7f'), 2015 ('F0 BF 80 C0', FFFDx2), ('F0 BF 80 FF', FFFDx2), 2016 ('F0 BF BF 00', FFFD+'\x00'), ('F0 BF BF 7F', FFFD+'\x7f'), 2017 ('F0 BF BF C0', FFFDx2), ('F0 BF BF FF', FFFDx2), 2018 ('F1 00', FFFD+'\x00'), ('F1 7F', FFFD+'\x7f'), ('F1 C0', FFFDx2), 2019 ('F1 FF', FFFDx2), ('F1 80 00', FFFD+'\x00'), 2020 ('F1 80 7F', FFFD+'\x7f'), ('F1 80 C0', FFFDx2), 2021 ('F1 80 FF', FFFDx2), ('F1 BF 00', FFFD+'\x00'), 2022 ('F1 BF 7F', FFFD+'\x7f'), ('F1 BF C0', FFFDx2), 2023 ('F1 BF FF', FFFDx2), ('F1 80 80 00', FFFD+'\x00'), 2024 ('F1 80 80 7F', FFFD+'\x7f'), ('F1 80 80 C0', FFFDx2), 2025 ('F1 80 80 FF', FFFDx2), ('F1 80 BF 00', FFFD+'\x00'), 2026 ('F1 80 BF 7F', FFFD+'\x7f'), ('F1 80 BF C0', FFFDx2), 2027 ('F1 80 BF FF', FFFDx2), ('F1 BF 80 00', FFFD+'\x00'), 2028 ('F1 BF 80 7F', FFFD+'\x7f'), ('F1 BF 80 C0', FFFDx2), 2029 ('F1 BF 80 FF', FFFDx2), ('F1 BF BF 00', FFFD+'\x00'), 2030 ('F1 BF BF 7F', FFFD+'\x7f'), ('F1 BF BF C0', FFFDx2), 2031 ('F1 BF BF FF', FFFDx2), ('F3 00', FFFD+'\x00'), 2032 ('F3 7F', FFFD+'\x7f'), ('F3 C0', FFFDx2), ('F3 FF', FFFDx2), 2033 ('F3 80 00', FFFD+'\x00'), ('F3 80 7F', FFFD+'\x7f'), 2034 ('F3 80 C0', FFFDx2), ('F3 80 FF', FFFDx2), 2035 ('F3 BF 00', FFFD+'\x00'), ('F3 BF 7F', FFFD+'\x7f'), 2036 ('F3 BF C0', FFFDx2), ('F3 BF FF', 
FFFDx2), 2037 ('F3 80 80 00', FFFD+'\x00'), ('F3 80 80 7F', FFFD+'\x7f'), 2038 ('F3 80 80 C0', FFFDx2), ('F3 80 80 FF', FFFDx2), 2039 ('F3 80 BF 00', FFFD+'\x00'), ('F3 80 BF 7F', FFFD+'\x7f'), 2040 ('F3 80 BF C0', FFFDx2), ('F3 80 BF FF', FFFDx2), 2041 ('F3 BF 80 00', FFFD+'\x00'), ('F3 BF 80 7F', FFFD+'\x7f'), 2042 ('F3 BF 80 C0', FFFDx2), ('F3 BF 80 FF', FFFDx2), 2043 ('F3 BF BF 00', FFFD+'\x00'), ('F3 BF BF 7F', FFFD+'\x7f'), 2044 ('F3 BF BF C0', FFFDx2), ('F3 BF BF FF', FFFDx2), 2045 ('F4 00', FFFD+'\x00'), ('F4 7F', FFFD+'\x7f'), ('F4 90', FFFDx2), 2046 ('F4 BF', FFFDx2), ('F4 C0', FFFDx2), ('F4 FF', FFFDx2), 2047 ('F4 80 00', FFFD+'\x00'), ('F4 80 7F', FFFD+'\x7f'), 2048 ('F4 80 C0', FFFDx2), ('F4 80 FF', FFFDx2), 2049 ('F4 8F 00', FFFD+'\x00'), ('F4 8F 7F', FFFD+'\x7f'), 2050 ('F4 8F C0', FFFDx2), ('F4 8F FF', FFFDx2), 2051 ('F4 80 80 00', FFFD+'\x00'), ('F4 80 80 7F', FFFD+'\x7f'), 2052 ('F4 80 80 C0', FFFDx2), ('F4 80 80 FF', FFFDx2), 2053 ('F4 80 BF 00', FFFD+'\x00'), ('F4 80 BF 7F', FFFD+'\x7f'), 2054 ('F4 80 BF C0', FFFDx2), ('F4 80 BF FF', FFFDx2), 2055 ('F4 8F 80 00', FFFD+'\x00'), ('F4 8F 80 7F', FFFD+'\x7f'), 2056 ('F4 8F 80 C0', FFFDx2), ('F4 8F 80 FF', FFFDx2), 2057 ('F4 8F BF 00', FFFD+'\x00'), ('F4 8F BF 7F', FFFD+'\x7f'), 2058 ('F4 8F BF C0', FFFDx2), ('F4 8F BF FF', FFFDx2) 2059 ] 2060 for seq, res in sequences: 2061 self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res, 2062 'invalid continuation byte') 2063 2064 def test_codecs_idna(self): 2065 # Test whether trailing dot is preserved 2066 self.assertEqual("www.python.org.".encode("idna"), b"www.python.org.") 2067 2068 def test_codecs_errors(self): 2069 # Error handling (encoding) 2070 self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii') 2071 self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii','strict') 2072 self.assertEqual('Andr\202 x'.encode('ascii','ignore'), b"Andr x") 2073 self.assertEqual('Andr\202 x'.encode('ascii','replace'), b"Andr? 
x") 2074 self.assertEqual('Andr\202 x'.encode('ascii', 'replace'), 2075 'Andr\202 x'.encode('ascii', errors='replace')) 2076 self.assertEqual('Andr\202 x'.encode('ascii', 'ignore'), 2077 'Andr\202 x'.encode(encoding='ascii', errors='ignore')) 2078 2079 # Error handling (decoding) 2080 self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii') 2081 self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii', 'strict') 2082 self.assertEqual(str(b'Andr\202 x', 'ascii', 'ignore'), "Andr x") 2083 self.assertEqual(str(b'Andr\202 x', 'ascii', 'replace'), 'Andr\uFFFD x') 2084 self.assertEqual(str(b'\202 x', 'ascii', 'replace'), '\uFFFD x') 2085 2086 # Error handling (unknown character names) 2087 self.assertEqual(b"\\N{foo}xx".decode("unicode-escape", "ignore"), "xx") 2088 2089 # Error handling (truncated escape sequence) 2090 self.assertRaises(UnicodeError, b"\\".decode, "unicode-escape") 2091 2092 self.assertRaises(TypeError, b"hello".decode, "test.unicode1") 2093 self.assertRaises(TypeError, str, b"hello", "test.unicode2") 2094 self.assertRaises(TypeError, "hello".encode, "test.unicode1") 2095 self.assertRaises(TypeError, "hello".encode, "test.unicode2") 2096 2097 # Error handling (wrong arguments) 2098 self.assertRaises(TypeError, "hello".encode, 42, 42, 42) 2099 2100 # Error handling (lone surrogate in 2101 # _PyUnicode_TransformDecimalAndSpaceToASCII()) 2102 self.assertRaises(ValueError, int, "\ud800") 2103 self.assertRaises(ValueError, int, "\udf00") 2104 self.assertRaises(ValueError, float, "\ud800") 2105 self.assertRaises(ValueError, float, "\udf00") 2106 self.assertRaises(ValueError, complex, "\ud800") 2107 self.assertRaises(ValueError, complex, "\udf00") 2108 2109 def test_codecs(self): 2110 # Encoding 2111 self.assertEqual('hello'.encode('ascii'), b'hello') 2112 self.assertEqual('hello'.encode('utf-7'), b'hello') 2113 self.assertEqual('hello'.encode('utf-8'), b'hello') 2114 self.assertEqual('hello'.encode('utf-8'), b'hello') 2115 
self.assertEqual('hello'.encode('utf-16-le'), b'h\000e\000l\000l\000o\000') 2116 self.assertEqual('hello'.encode('utf-16-be'), b'\000h\000e\000l\000l\000o') 2117 self.assertEqual('hello'.encode('latin-1'), b'hello') 2118 2119 # Default encoding is utf-8 2120 self.assertEqual('\u2603'.encode(), b'\xe2\x98\x83') 2121 2122 # Roundtrip safety for BMP (just the first 1024 chars) 2123 for c in range(1024): 2124 u = chr(c) 2125 for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le', 2126 'utf-16-be', 'raw_unicode_escape', 2127 'unicode_escape'): 2128 self.assertEqual(str(u.encode(encoding),encoding), u) 2129 2130 # Roundtrip safety for BMP (just the first 256 chars) 2131 for c in range(256): 2132 u = chr(c) 2133 for encoding in ('latin-1',): 2134 self.assertEqual(str(u.encode(encoding),encoding), u) 2135 2136 # Roundtrip safety for BMP (just the first 128 chars) 2137 for c in range(128): 2138 u = chr(c) 2139 for encoding in ('ascii',): 2140 self.assertEqual(str(u.encode(encoding),encoding), u) 2141 2142 # Roundtrip safety for non-BMP (just a few chars) 2143 with warnings.catch_warnings(): 2144 u = '\U00010001\U00020002\U00030003\U00040004\U00050005' 2145 for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be', 2146 'raw_unicode_escape', 'unicode_escape'): 2147 self.assertEqual(str(u.encode(encoding),encoding), u) 2148 2149 # UTF-8 must be roundtrip safe for all code points 2150 # (except surrogates, which are forbidden). 
2151 u = ''.join(map(chr, list(range(0, 0xd800)) + 2152 list(range(0xe000, 0x110000)))) 2153 for encoding in ('utf-8',): 2154 self.assertEqual(str(u.encode(encoding),encoding), u) 2155 2156 def test_codecs_charmap(self): 2157 # 0-127 2158 s = bytes(range(128)) 2159 for encoding in ( 2160 'cp037', 'cp1026', 'cp273', 2161 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850', 2162 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862', 2163 'cp863', 'cp865', 'cp866', 'cp1125', 2164 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15', 2165 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6', 2166 'iso8859_7', 'iso8859_9', 2167 'koi8_r', 'koi8_t', 'koi8_u', 'kz1048', 'latin_1', 2168 'mac_cyrillic', 'mac_latin2', 2169 2170 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255', 2171 'cp1256', 'cp1257', 'cp1258', 2172 'cp856', 'cp857', 'cp864', 'cp869', 'cp874', 2173 2174 'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish', 2175 'cp1006', 'iso8859_8', 2176 2177 ### These have undefined mappings: 2178 #'cp424', 2179 2180 ### These fail the round-trip: 2181 #'cp875' 2182 2183 ): 2184 self.assertEqual(str(s, encoding).encode(encoding), s) 2185 2186 # 128-255 2187 s = bytes(range(128, 256)) 2188 for encoding in ( 2189 'cp037', 'cp1026', 'cp273', 2190 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850', 2191 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862', 2192 'cp863', 'cp865', 'cp866', 'cp1125', 2193 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15', 2194 'iso8859_2', 'iso8859_4', 'iso8859_5', 2195 'iso8859_9', 'koi8_r', 'koi8_u', 'latin_1', 2196 'mac_cyrillic', 'mac_latin2', 2197 2198 ### These have undefined mappings: 2199 #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255', 2200 #'cp1256', 'cp1257', 'cp1258', 2201 #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874', 2202 #'iso8859_3', 'iso8859_6', 'iso8859_7', 'koi8_t', 'kz1048', 2203 #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish', 2204 2205 ### These fail the round-trip: 
2206 #'cp1006', 'cp875', 'iso8859_8', 2207 2208 ): 2209 self.assertEqual(str(s, encoding).encode(encoding), s) 2210 2211 def test_concatenation(self): 2212 self.assertEqual(("abc" "def"), "abcdef") 2213 self.assertEqual(("abc" "def"), "abcdef") 2214 self.assertEqual(("abc" "def"), "abcdef") 2215 self.assertEqual(("abc" "def" "ghi"), "abcdefghi") 2216 self.assertEqual(("abc" "def" "ghi"), "abcdefghi") 2217 2218 def test_printing(self): 2219 class BitBucket: 2220 def write(self, text): 2221 pass 2222 2223 out = BitBucket() 2224 print('abc', file=out) 2225 print('abc', 'def', file=out) 2226 print('abc', 'def', file=out) 2227 print('abc', 'def', file=out) 2228 print('abc\n', file=out) 2229 print('abc\n', end=' ', file=out) 2230 print('abc\n', end=' ', file=out) 2231 print('def\n', file=out) 2232 print('def\n', file=out) 2233 2234 def test_ucs4(self): 2235 x = '\U00100000' 2236 y = x.encode("raw-unicode-escape").decode("raw-unicode-escape") 2237 self.assertEqual(x, y) 2238 2239 y = br'\U00100000' 2240 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape") 2241 self.assertEqual(x, y) 2242 y = br'\U00010000' 2243 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape") 2244 self.assertEqual(x, y) 2245 2246 try: 2247 br'\U11111111'.decode("raw-unicode-escape") 2248 except UnicodeDecodeError as e: 2249 self.assertEqual(e.start, 0) 2250 self.assertEqual(e.end, 10) 2251 else: 2252 self.fail("Should have raised UnicodeDecodeError") 2253 2254 def test_conversion(self): 2255 # Make sure __str__() works properly 2256 class ObjectToStr: 2257 def __str__(self): 2258 return "foo" 2259 2260 class StrSubclassToStr(str): 2261 def __str__(self): 2262 return "foo" 2263 2264 class StrSubclassToStrSubclass(str): 2265 def __new__(cls, content=""): 2266 return str.__new__(cls, 2*content) 2267 def __str__(self): 2268 return self 2269 2270 self.assertEqual(str(ObjectToStr()), "foo") 2271 self.assertEqual(str(StrSubclassToStr("bar")), "foo") 2272 s = 
str(StrSubclassToStrSubclass("foo")) 2273 self.assertEqual(s, "foofoo") 2274 self.assertIs(type(s), StrSubclassToStrSubclass) 2275 s = StrSubclass(StrSubclassToStrSubclass("foo")) 2276 self.assertEqual(s, "foofoo") 2277 self.assertIs(type(s), StrSubclass) 2278 2279 def test_unicode_repr(self): 2280 class s1: 2281 def __repr__(self): 2282 return '\\n' 2283 2284 class s2: 2285 def __repr__(self): 2286 return '\\n' 2287 2288 self.assertEqual(repr(s1()), '\\n') 2289 self.assertEqual(repr(s2()), '\\n') 2290 2291 def test_printable_repr(self): 2292 self.assertEqual(repr('\U00010000'), "'%c'" % (0x10000,)) # printable 2293 self.assertEqual(repr('\U00014000'), "'\\U00014000'") # nonprintable 2294 2295 # This test only affects 32-bit platforms because expandtabs can only take 2296 # an int as the max value, not a 64-bit C long. If expandtabs is changed 2297 # to take a 64-bit long, this test should apply to all platforms. 2298 @unittest.skipIf(sys.maxsize > (1 << 32) or struct.calcsize('P') != 4, 2299 'only applies to 32-bit platforms') 2300 def test_expandtabs_overflows_gracefully(self): 2301 self.assertRaises(OverflowError, 't\tt\t'.expandtabs, sys.maxsize) 2302 2303 @support.cpython_only 2304 def test_expandtabs_optimization(self): 2305 s = 'abc' 2306 self.assertIs(s.expandtabs(), s) 2307 2308 def test_raiseMemError(self): 2309 if struct.calcsize('P') == 8: 2310 # 64 bits pointers 2311 ascii_struct_size = 48 2312 compact_struct_size = 72 2313 else: 2314 # 32 bits pointers 2315 ascii_struct_size = 24 2316 compact_struct_size = 36 2317 2318 for char in ('a', '\xe9', '\u20ac', '\U0010ffff'): 2319 code = ord(char) 2320 if code < 0x100: 2321 char_size = 1 # sizeof(Py_UCS1) 2322 struct_size = ascii_struct_size 2323 elif code < 0x10000: 2324 char_size = 2 # sizeof(Py_UCS2) 2325 struct_size = compact_struct_size 2326 else: 2327 char_size = 4 # sizeof(Py_UCS4) 2328 struct_size = compact_struct_size 2329 # Note: sys.maxsize is half of the actual max allocation because of 2330 # 
the signedness of Py_ssize_t. Strings of maxlen-1 should in principle 2331 # be allocatable, given enough memory. 2332 maxlen = ((sys.maxsize - struct_size) // char_size) 2333 alloc = lambda: char * maxlen 2334 self.assertRaises(MemoryError, alloc) 2335 self.assertRaises(MemoryError, alloc) 2336 2337 def test_format_subclass(self): 2338 class S(str): 2339 def __str__(self): 2340 return '__str__ overridden' 2341 s = S('xxx') 2342 self.assertEqual("%s" % s, '__str__ overridden') 2343 self.assertEqual("{}".format(s), '__str__ overridden') 2344 2345 def test_subclass_add(self): 2346 class S(str): 2347 def __add__(self, o): 2348 return "3" 2349 self.assertEqual(S("4") + S("5"), "3") 2350 class S(str): 2351 def __iadd__(self, o): 2352 return "3" 2353 s = S("1") 2354 s += "4" 2355 self.assertEqual(s, "3") 2356 2357 def test_getnewargs(self): 2358 text = 'abc' 2359 args = text.__getnewargs__() 2360 self.assertIsNot(args[0], text) 2361 self.assertEqual(args[0], text) 2362 self.assertEqual(len(args), 1) 2363 2364 @support.cpython_only 2365 def test_resize(self): 2366 from _testcapi import getargs_u 2367 for length in range(1, 100, 7): 2368 # generate a fresh string (refcount=1) 2369 text = 'a' * length + 'b' 2370 2371 # fill wstr internal field 2372 abc = getargs_u(text) 2373 self.assertEqual(abc, text) 2374 2375 # resize text: wstr field must be cleared and then recomputed 2376 text += 'c' 2377 abcdef = getargs_u(text) 2378 self.assertNotEqual(abc, abcdef) 2379 self.assertEqual(abcdef, text) 2380 2381 def test_compare(self): 2382 # Issue #17615 2383 N = 10 2384 ascii = 'a' * N 2385 ascii2 = 'z' * N 2386 latin = '\x80' * N 2387 latin2 = '\xff' * N 2388 bmp = '\u0100' * N 2389 bmp2 = '\uffff' * N 2390 astral = '\U00100000' * N 2391 astral2 = '\U0010ffff' * N 2392 strings = ( 2393 ascii, ascii2, 2394 latin, latin2, 2395 bmp, bmp2, 2396 astral, astral2) 2397 for text1, text2 in itertools.combinations(strings, 2): 2398 equal = (text1 is text2) 2399 self.assertEqual(text1 == 
text2, equal) 2400 self.assertEqual(text1 != text2, not equal) 2401 2402 if equal: 2403 self.assertTrue(text1 <= text2) 2404 self.assertTrue(text1 >= text2) 2405 2406 # text1 is text2: duplicate strings to skip the "str1 == str2" 2407 # optimization in unicode_compare_eq() and really compare 2408 # character per character 2409 copy1 = duplicate_string(text1) 2410 copy2 = duplicate_string(text2) 2411 self.assertIsNot(copy1, copy2) 2412 2413 self.assertTrue(copy1 == copy2) 2414 self.assertFalse(copy1 != copy2) 2415 2416 self.assertTrue(copy1 <= copy2) 2417 self.assertTrue(copy2 >= copy2) 2418 2419 self.assertTrue(ascii < ascii2) 2420 self.assertTrue(ascii < latin) 2421 self.assertTrue(ascii < bmp) 2422 self.assertTrue(ascii < astral) 2423 self.assertFalse(ascii >= ascii2) 2424 self.assertFalse(ascii >= latin) 2425 self.assertFalse(ascii >= bmp) 2426 self.assertFalse(ascii >= astral) 2427 2428 self.assertFalse(latin < ascii) 2429 self.assertTrue(latin < latin2) 2430 self.assertTrue(latin < bmp) 2431 self.assertTrue(latin < astral) 2432 self.assertTrue(latin >= ascii) 2433 self.assertFalse(latin >= latin2) 2434 self.assertFalse(latin >= bmp) 2435 self.assertFalse(latin >= astral) 2436 2437 self.assertFalse(bmp < ascii) 2438 self.assertFalse(bmp < latin) 2439 self.assertTrue(bmp < bmp2) 2440 self.assertTrue(bmp < astral) 2441 self.assertTrue(bmp >= ascii) 2442 self.assertTrue(bmp >= latin) 2443 self.assertFalse(bmp >= bmp2) 2444 self.assertFalse(bmp >= astral) 2445 2446 self.assertFalse(astral < ascii) 2447 self.assertFalse(astral < latin) 2448 self.assertFalse(astral < bmp2) 2449 self.assertTrue(astral < astral2) 2450 self.assertTrue(astral >= ascii) 2451 self.assertTrue(astral >= latin) 2452 self.assertTrue(astral >= bmp2) 2453 self.assertFalse(astral >= astral2) 2454 2455 def test_free_after_iterating(self): 2456 support.check_free_after_iterating(self, iter, str) 2457 support.check_free_after_iterating(self, reversed, str) 2458 2459 def 
test_check_encoding_errors(self): 2460 # bpo-37388: str(bytes) and str.decode() must check encoding and errors 2461 # arguments in dev mode 2462 encodings = ('ascii', 'utf8', 'latin1') 2463 invalid = 'Boom, Shaka Laka, Boom!' 2464 code = textwrap.dedent(f''' 2465 import sys 2466 encodings = {encodings!r} 2467 2468 for data in (b'', b'short string'): 2469 try: 2470 str(data, encoding={invalid!r}) 2471 except LookupError: 2472 pass 2473 else: 2474 sys.exit(21) 2475 2476 try: 2477 str(data, errors={invalid!r}) 2478 except LookupError: 2479 pass 2480 else: 2481 sys.exit(22) 2482 2483 for encoding in encodings: 2484 try: 2485 str(data, encoding, errors={invalid!r}) 2486 except LookupError: 2487 pass 2488 else: 2489 sys.exit(22) 2490 2491 for data in ('', 'short string'): 2492 try: 2493 data.encode(encoding={invalid!r}) 2494 except LookupError: 2495 pass 2496 else: 2497 sys.exit(23) 2498 2499 try: 2500 data.encode(errors={invalid!r}) 2501 except LookupError: 2502 pass 2503 else: 2504 sys.exit(24) 2505 2506 for encoding in encodings: 2507 try: 2508 data.encode(encoding, errors={invalid!r}) 2509 except LookupError: 2510 pass 2511 else: 2512 sys.exit(24) 2513 2514 sys.exit(10) 2515 ''') 2516 proc = assert_python_failure('-X', 'dev', '-c', code) 2517 self.assertEqual(proc.rc, 10, proc) 2518 2519 2520class CAPITest(unittest.TestCase): 2521 2522 # Test PyUnicode_FromFormat() 2523 def test_from_format(self): 2524 support.import_module('ctypes') 2525 from ctypes import ( 2526 c_char_p, 2527 pythonapi, py_object, sizeof, 2528 c_int, c_long, c_longlong, c_ssize_t, 2529 c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p) 2530 name = "PyUnicode_FromFormat" 2531 _PyUnicode_FromFormat = getattr(pythonapi, name) 2532 _PyUnicode_FromFormat.argtypes = (c_char_p,) 2533 _PyUnicode_FromFormat.restype = py_object 2534 2535 def PyUnicode_FromFormat(format, *args): 2536 cargs = tuple( 2537 py_object(arg) if isinstance(arg, str) else arg 2538 for arg in args) 2539 return 
_PyUnicode_FromFormat(format, *cargs) 2540 2541 def check_format(expected, format, *args): 2542 text = PyUnicode_FromFormat(format, *args) 2543 self.assertEqual(expected, text) 2544 2545 # ascii format, non-ascii argument 2546 check_format('ascii\x7f=unicode\xe9', 2547 b'ascii\x7f=%U', 'unicode\xe9') 2548 2549 # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV() 2550 # raises an error 2551 self.assertRaisesRegex(ValueError, 2552 r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format ' 2553 'string, got a non-ASCII byte: 0xe9$', 2554 PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii') 2555 2556 # test "%c" 2557 check_format('\uabcd', 2558 b'%c', c_int(0xabcd)) 2559 check_format('\U0010ffff', 2560 b'%c', c_int(0x10ffff)) 2561 with self.assertRaises(OverflowError): 2562 PyUnicode_FromFormat(b'%c', c_int(0x110000)) 2563 # Issue #18183 2564 check_format('\U00010000\U00100000', 2565 b'%c%c', c_int(0x10000), c_int(0x100000)) 2566 2567 # test "%" 2568 check_format('%', 2569 b'%') 2570 check_format('%', 2571 b'%%') 2572 check_format('%s', 2573 b'%%s') 2574 check_format('[%]', 2575 b'[%%]') 2576 check_format('%abc', 2577 b'%%%s', b'abc') 2578 2579 # truncated string 2580 check_format('abc', 2581 b'%.3s', b'abcdef') 2582 check_format('abc[\ufffd', 2583 b'%.5s', 'abc[\u20ac]'.encode('utf8')) 2584 check_format("'\\u20acABC'", 2585 b'%A', '\u20acABC') 2586 check_format("'\\u20", 2587 b'%.5A', '\u20acABCDEF') 2588 check_format("'\u20acABC'", 2589 b'%R', '\u20acABC') 2590 check_format("'\u20acA", 2591 b'%.3R', '\u20acABCDEF') 2592 check_format('\u20acAB', 2593 b'%.3S', '\u20acABCDEF') 2594 check_format('\u20acAB', 2595 b'%.3U', '\u20acABCDEF') 2596 check_format('\u20acAB', 2597 b'%.3V', '\u20acABCDEF', None) 2598 check_format('abc[\ufffd', 2599 b'%.5V', None, 'abc[\u20ac]'.encode('utf8')) 2600 2601 # following tests comes from #7330 2602 # test width modifier and precision modifier with %S 2603 check_format("repr= abc", 2604 b'repr=%5S', 'abc') 2605 
check_format("repr=ab", 2606 b'repr=%.2S', 'abc') 2607 check_format("repr= ab", 2608 b'repr=%5.2S', 'abc') 2609 2610 # test width modifier and precision modifier with %R 2611 check_format("repr= 'abc'", 2612 b'repr=%8R', 'abc') 2613 check_format("repr='ab", 2614 b'repr=%.3R', 'abc') 2615 check_format("repr= 'ab", 2616 b'repr=%5.3R', 'abc') 2617 2618 # test width modifier and precision modifier with %A 2619 check_format("repr= 'abc'", 2620 b'repr=%8A', 'abc') 2621 check_format("repr='ab", 2622 b'repr=%.3A', 'abc') 2623 check_format("repr= 'ab", 2624 b'repr=%5.3A', 'abc') 2625 2626 # test width modifier and precision modifier with %s 2627 check_format("repr= abc", 2628 b'repr=%5s', b'abc') 2629 check_format("repr=ab", 2630 b'repr=%.2s', b'abc') 2631 check_format("repr= ab", 2632 b'repr=%5.2s', b'abc') 2633 2634 # test width modifier and precision modifier with %U 2635 check_format("repr= abc", 2636 b'repr=%5U', 'abc') 2637 check_format("repr=ab", 2638 b'repr=%.2U', 'abc') 2639 check_format("repr= ab", 2640 b'repr=%5.2U', 'abc') 2641 2642 # test width modifier and precision modifier with %V 2643 check_format("repr= abc", 2644 b'repr=%5V', 'abc', b'123') 2645 check_format("repr=ab", 2646 b'repr=%.2V', 'abc', b'123') 2647 check_format("repr= ab", 2648 b'repr=%5.2V', 'abc', b'123') 2649 check_format("repr= 123", 2650 b'repr=%5V', None, b'123') 2651 check_format("repr=12", 2652 b'repr=%.2V', None, b'123') 2653 check_format("repr= 12", 2654 b'repr=%5.2V', None, b'123') 2655 2656 # test integer formats (%i, %d, %u) 2657 check_format('010', 2658 b'%03i', c_int(10)) 2659 check_format('0010', 2660 b'%0.4i', c_int(10)) 2661 check_format('-123', 2662 b'%i', c_int(-123)) 2663 check_format('-123', 2664 b'%li', c_long(-123)) 2665 check_format('-123', 2666 b'%lli', c_longlong(-123)) 2667 check_format('-123', 2668 b'%zi', c_ssize_t(-123)) 2669 2670 check_format('-123', 2671 b'%d', c_int(-123)) 2672 check_format('-123', 2673 b'%ld', c_long(-123)) 2674 check_format('-123', 2675 
b'%lld', c_longlong(-123)) 2676 check_format('-123', 2677 b'%zd', c_ssize_t(-123)) 2678 2679 check_format('123', 2680 b'%u', c_uint(123)) 2681 check_format('123', 2682 b'%lu', c_ulong(123)) 2683 check_format('123', 2684 b'%llu', c_ulonglong(123)) 2685 check_format('123', 2686 b'%zu', c_size_t(123)) 2687 2688 # test long output 2689 min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1)) 2690 max_longlong = -min_longlong - 1 2691 check_format(str(min_longlong), 2692 b'%lld', c_longlong(min_longlong)) 2693 check_format(str(max_longlong), 2694 b'%lld', c_longlong(max_longlong)) 2695 max_ulonglong = 2 ** (8 * sizeof(c_ulonglong)) - 1 2696 check_format(str(max_ulonglong), 2697 b'%llu', c_ulonglong(max_ulonglong)) 2698 PyUnicode_FromFormat(b'%p', c_void_p(-1)) 2699 2700 # test padding (width and/or precision) 2701 check_format('123'.rjust(10, '0'), 2702 b'%010i', c_int(123)) 2703 check_format('123'.rjust(100), 2704 b'%100i', c_int(123)) 2705 check_format('123'.rjust(100, '0'), 2706 b'%.100i', c_int(123)) 2707 check_format('123'.rjust(80, '0').rjust(100), 2708 b'%100.80i', c_int(123)) 2709 2710 check_format('123'.rjust(10, '0'), 2711 b'%010u', c_uint(123)) 2712 check_format('123'.rjust(100), 2713 b'%100u', c_uint(123)) 2714 check_format('123'.rjust(100, '0'), 2715 b'%.100u', c_uint(123)) 2716 check_format('123'.rjust(80, '0').rjust(100), 2717 b'%100.80u', c_uint(123)) 2718 2719 check_format('123'.rjust(10, '0'), 2720 b'%010x', c_int(0x123)) 2721 check_format('123'.rjust(100), 2722 b'%100x', c_int(0x123)) 2723 check_format('123'.rjust(100, '0'), 2724 b'%.100x', c_int(0x123)) 2725 check_format('123'.rjust(80, '0').rjust(100), 2726 b'%100.80x', c_int(0x123)) 2727 2728 # test %A 2729 check_format(r"%A:'abc\xe9\uabcd\U0010ffff'", 2730 b'%%A:%A', 'abc\xe9\uabcd\U0010ffff') 2731 2732 # test %V 2733 check_format('repr=abc', 2734 b'repr=%V', 'abc', b'xyz') 2735 2736 # Test string decode from parameter of %s using utf-8. 
2737 # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of 2738 # '\u4eba\u6c11' 2739 check_format('repr=\u4eba\u6c11', 2740 b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91') 2741 2742 #Test replace error handler. 2743 check_format('repr=abc\ufffd', 2744 b'repr=%V', None, b'abc\xff') 2745 2746 # not supported: copy the raw format string. these tests are just here 2747 # to check for crashes and should not be considered as specifications 2748 check_format('%s', 2749 b'%1%s', b'abc') 2750 check_format('%1abc', 2751 b'%1abc') 2752 check_format('%+i', 2753 b'%+i', c_int(10)) 2754 check_format('%.%s', 2755 b'%.%s', b'abc') 2756 2757 # Issue #33817: empty strings 2758 check_format('', 2759 b'') 2760 check_format('', 2761 b'%s', b'') 2762 2763 # Test PyUnicode_AsWideChar() 2764 @support.cpython_only 2765 def test_aswidechar(self): 2766 from _testcapi import unicode_aswidechar 2767 support.import_module('ctypes') 2768 from ctypes import c_wchar, sizeof 2769 2770 wchar, size = unicode_aswidechar('abcdef', 2) 2771 self.assertEqual(size, 2) 2772 self.assertEqual(wchar, 'ab') 2773 2774 wchar, size = unicode_aswidechar('abc', 3) 2775 self.assertEqual(size, 3) 2776 self.assertEqual(wchar, 'abc') 2777 2778 wchar, size = unicode_aswidechar('abc', 4) 2779 self.assertEqual(size, 3) 2780 self.assertEqual(wchar, 'abc\0') 2781 2782 wchar, size = unicode_aswidechar('abc', 10) 2783 self.assertEqual(size, 3) 2784 self.assertEqual(wchar, 'abc\0') 2785 2786 wchar, size = unicode_aswidechar('abc\0def', 20) 2787 self.assertEqual(size, 7) 2788 self.assertEqual(wchar, 'abc\0def\0') 2789 2790 nonbmp = chr(0x10ffff) 2791 if sizeof(c_wchar) == 2: 2792 buflen = 3 2793 nchar = 2 2794 else: # sizeof(c_wchar) == 4 2795 buflen = 2 2796 nchar = 1 2797 wchar, size = unicode_aswidechar(nonbmp, buflen) 2798 self.assertEqual(size, nchar) 2799 self.assertEqual(wchar, nonbmp + '\0') 2800 2801 # Test PyUnicode_AsWideCharString() 2802 @support.cpython_only 2803 def test_aswidecharstring(self): 2804 from 
_testcapi import unicode_aswidecharstring 2805 support.import_module('ctypes') 2806 from ctypes import c_wchar, sizeof 2807 2808 wchar, size = unicode_aswidecharstring('abc') 2809 self.assertEqual(size, 3) 2810 self.assertEqual(wchar, 'abc\0') 2811 2812 wchar, size = unicode_aswidecharstring('abc\0def') 2813 self.assertEqual(size, 7) 2814 self.assertEqual(wchar, 'abc\0def\0') 2815 2816 nonbmp = chr(0x10ffff) 2817 if sizeof(c_wchar) == 2: 2818 nchar = 2 2819 else: # sizeof(c_wchar) == 4 2820 nchar = 1 2821 wchar, size = unicode_aswidecharstring(nonbmp) 2822 self.assertEqual(size, nchar) 2823 self.assertEqual(wchar, nonbmp + '\0') 2824 2825 # Test PyUnicode_AsUCS4() 2826 @support.cpython_only 2827 def test_asucs4(self): 2828 from _testcapi import unicode_asucs4 2829 for s in ['abc', '\xa1\xa2', '\u4f60\u597d', 'a\U0001f600', 2830 'a\ud800b\udfffc', '\ud834\udd1e']: 2831 l = len(s) 2832 self.assertEqual(unicode_asucs4(s, l, True), s+'\0') 2833 self.assertEqual(unicode_asucs4(s, l, False), s+'\uffff') 2834 self.assertEqual(unicode_asucs4(s, l+1, True), s+'\0\uffff') 2835 self.assertEqual(unicode_asucs4(s, l+1, False), s+'\0\uffff') 2836 self.assertRaises(SystemError, unicode_asucs4, s, l-1, True) 2837 self.assertRaises(SystemError, unicode_asucs4, s, l-2, False) 2838 s = '\0'.join([s, s]) 2839 self.assertEqual(unicode_asucs4(s, len(s), True), s+'\0') 2840 self.assertEqual(unicode_asucs4(s, len(s), False), s+'\uffff') 2841 2842 # Test PyUnicode_AsUTF8() 2843 @support.cpython_only 2844 def test_asutf8(self): 2845 from _testcapi import unicode_asutf8 2846 2847 bmp = '\u0100' 2848 bmp2 = '\uffff' 2849 nonbmp = chr(0x10ffff) 2850 2851 self.assertEqual(unicode_asutf8(bmp), b'\xc4\x80') 2852 self.assertEqual(unicode_asutf8(bmp2), b'\xef\xbf\xbf') 2853 self.assertEqual(unicode_asutf8(nonbmp), b'\xf4\x8f\xbf\xbf') 2854 self.assertRaises(UnicodeEncodeError, unicode_asutf8, 'a\ud800b\udfffc') 2855 2856 # Test PyUnicode_AsUTF8AndSize() 2857 @support.cpython_only 2858 def 
test_asutf8andsize(self): 2859 from _testcapi import unicode_asutf8andsize 2860 2861 bmp = '\u0100' 2862 bmp2 = '\uffff' 2863 nonbmp = chr(0x10ffff) 2864 2865 self.assertEqual(unicode_asutf8andsize(bmp), (b'\xc4\x80', 2)) 2866 self.assertEqual(unicode_asutf8andsize(bmp2), (b'\xef\xbf\xbf', 3)) 2867 self.assertEqual(unicode_asutf8andsize(nonbmp), (b'\xf4\x8f\xbf\xbf', 4)) 2868 self.assertRaises(UnicodeEncodeError, unicode_asutf8andsize, 'a\ud800b\udfffc') 2869 2870 # Test PyUnicode_FindChar() 2871 @support.cpython_only 2872 def test_findchar(self): 2873 from _testcapi import unicode_findchar 2874 2875 for str in "\xa1", "\u8000\u8080", "\ud800\udc02", "\U0001f100\U0001f1f1": 2876 for i, ch in enumerate(str): 2877 self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), 1), i) 2878 self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), -1), i) 2879 2880 str = "!>_<!" 2881 self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), 1), -1) 2882 self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), -1), -1) 2883 # start < end 2884 self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, 1), 4) 2885 self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, -1), 4) 2886 # start >= end 2887 self.assertEqual(unicode_findchar(str, ord('!'), 0, 0, 1), -1) 2888 self.assertEqual(unicode_findchar(str, ord('!'), len(str), 0, 1), -1) 2889 # negative 2890 self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, 1), 0) 2891 self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, -1), 0) 2892 2893 # Test PyUnicode_CopyCharacters() 2894 @support.cpython_only 2895 def test_copycharacters(self): 2896 from _testcapi import unicode_copycharacters 2897 2898 strings = [ 2899 'abcde', '\xa1\xa2\xa3\xa4\xa5', 2900 '\u4f60\u597d\u4e16\u754c\uff01', 2901 '\U0001f600\U0001f601\U0001f602\U0001f603\U0001f604' 2902 ] 2903 2904 for idx, from_ in enumerate(strings): 2905 # wide -> narrow: exceed maxchar limitation 2906 for to in strings[:idx]: 2907 
self.assertRaises( 2908 SystemError, 2909 unicode_copycharacters, to, 0, from_, 0, 5 2910 ) 2911 # same kind 2912 for from_start in range(5): 2913 self.assertEqual( 2914 unicode_copycharacters(from_, 0, from_, from_start, 5), 2915 (from_[from_start:from_start+5].ljust(5, '\0'), 2916 5-from_start) 2917 ) 2918 for to_start in range(5): 2919 self.assertEqual( 2920 unicode_copycharacters(from_, to_start, from_, to_start, 5), 2921 (from_[to_start:to_start+5].rjust(5, '\0'), 2922 5-to_start) 2923 ) 2924 # narrow -> wide 2925 # Tests omitted since this creates invalid strings. 2926 2927 s = strings[0] 2928 self.assertRaises(IndexError, unicode_copycharacters, s, 6, s, 0, 5) 2929 self.assertRaises(IndexError, unicode_copycharacters, s, -1, s, 0, 5) 2930 self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, 6, 5) 2931 self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, -1, 5) 2932 self.assertRaises(SystemError, unicode_copycharacters, s, 1, s, 0, 5) 2933 self.assertRaises(SystemError, unicode_copycharacters, s, 0, s, 0, -1) 2934 self.assertRaises(SystemError, unicode_copycharacters, s, 0, b'', 0, 0) 2935 2936 @support.cpython_only 2937 def test_encode_decimal(self): 2938 from _testcapi import unicode_encodedecimal 2939 self.assertEqual(unicode_encodedecimal('123'), 2940 b'123') 2941 self.assertEqual(unicode_encodedecimal('\u0663.\u0661\u0664'), 2942 b'3.14') 2943 self.assertEqual(unicode_encodedecimal("\N{EM SPACE}3.14\N{EN SPACE}"), 2944 b' 3.14 ') 2945 self.assertRaises(UnicodeEncodeError, 2946 unicode_encodedecimal, "123\u20ac", "strict") 2947 self.assertRaisesRegex( 2948 ValueError, 2949 "^'decimal' codec can't encode character", 2950 unicode_encodedecimal, "123\u20ac", "replace") 2951 2952 @support.cpython_only 2953 def test_transform_decimal(self): 2954 from _testcapi import unicode_transformdecimaltoascii as transform_decimal 2955 self.assertEqual(transform_decimal('123'), 2956 '123') 2957 self.assertEqual(transform_decimal('\u0663.\u0661\u0664'), 
2958 '3.14') 2959 self.assertEqual(transform_decimal("\N{EM SPACE}3.14\N{EN SPACE}"), 2960 "\N{EM SPACE}3.14\N{EN SPACE}") 2961 self.assertEqual(transform_decimal('123\u20ac'), 2962 '123\u20ac') 2963 2964 @support.cpython_only 2965 def test_pep393_utf8_caching_bug(self): 2966 # Issue #25709: Problem with string concatenation and utf-8 cache 2967 from _testcapi import getargs_s_hash 2968 for k in 0x24, 0xa4, 0x20ac, 0x1f40d: 2969 s = '' 2970 for i in range(5): 2971 # Due to CPython specific optimization the 's' string can be 2972 # resized in-place. 2973 s += chr(k) 2974 # Parsing with the "s#" format code calls indirectly 2975 # PyUnicode_AsUTF8AndSize() which creates the UTF-8 2976 # encoded string cached in the Unicode object. 2977 self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1)) 2978 # Check that the second call returns the same result 2979 self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1)) 2980 2981class StringModuleTest(unittest.TestCase): 2982 def test_formatter_parser(self): 2983 def parse(format): 2984 return list(_string.formatter_parser(format)) 2985 2986 formatter = parse("prefix {2!s}xxx{0:^+10.3f}{obj.attr!s} {z[0]!s:10}") 2987 self.assertEqual(formatter, [ 2988 ('prefix ', '2', '', 's'), 2989 ('xxx', '0', '^+10.3f', None), 2990 ('', 'obj.attr', '', 's'), 2991 (' ', 'z[0]', '10', 's'), 2992 ]) 2993 2994 formatter = parse("prefix {} suffix") 2995 self.assertEqual(formatter, [ 2996 ('prefix ', '', '', None), 2997 (' suffix', None, None, None), 2998 ]) 2999 3000 formatter = parse("str") 3001 self.assertEqual(formatter, [ 3002 ('str', None, None, None), 3003 ]) 3004 3005 formatter = parse("") 3006 self.assertEqual(formatter, []) 3007 3008 formatter = parse("{0}") 3009 self.assertEqual(formatter, [ 3010 ('', '0', '', None), 3011 ]) 3012 3013 self.assertRaises(TypeError, _string.formatter_parser, 1) 3014 3015 def test_formatter_field_name_split(self): 3016 def split(name): 3017 items = list(_string.formatter_field_name_split(name)) 
3018 items[1] = list(items[1]) 3019 return items 3020 self.assertEqual(split("obj"), ["obj", []]) 3021 self.assertEqual(split("obj.arg"), ["obj", [(True, 'arg')]]) 3022 self.assertEqual(split("obj[key]"), ["obj", [(False, 'key')]]) 3023 self.assertEqual(split("obj.arg[key1][key2]"), [ 3024 "obj", 3025 [(True, 'arg'), 3026 (False, 'key1'), 3027 (False, 'key2'), 3028 ]]) 3029 self.assertRaises(TypeError, _string.formatter_field_name_split, 1) 3030 3031 3032if __name__ == "__main__": 3033 unittest.main() 3034