1""" Test script for the Unicode implementation. 2 3Written by Marc-Andre Lemburg (mal@lemburg.com). 4 5(c) Copyright CNRI, All Rights Reserved. NO WARRANTY. 6 7""" 8import _string 9import codecs 10import itertools 11import operator 12import struct 13import sys 14import unittest 15import warnings 16from test import support, string_tests 17 18# Error handling (bad decoder return) 19def search_function(encoding): 20 def decode1(input, errors="strict"): 21 return 42 # not a tuple 22 def encode1(input, errors="strict"): 23 return 42 # not a tuple 24 def encode2(input, errors="strict"): 25 return (42, 42) # no unicode 26 def decode2(input, errors="strict"): 27 return (42, 42) # no unicode 28 if encoding=="test.unicode1": 29 return (encode1, decode1, None, None) 30 elif encoding=="test.unicode2": 31 return (encode2, decode2, None, None) 32 else: 33 return None 34codecs.register(search_function) 35 36def duplicate_string(text): 37 """ 38 Try to get a fresh clone of the specified text: 39 new object with a reference count of 1. 40 41 This is a best-effort: latin1 single letters and the empty 42 string ('') are singletons and cannot be cloned. 
43 """ 44 return text.encode().decode() 45 46class StrSubclass(str): 47 pass 48 49class UnicodeTest(string_tests.CommonTest, 50 string_tests.MixinStrUnicodeUserStringTest, 51 string_tests.MixinStrUnicodeTest, 52 unittest.TestCase): 53 54 type2test = str 55 56 def checkequalnofix(self, result, object, methodname, *args): 57 method = getattr(object, methodname) 58 realresult = method(*args) 59 self.assertEqual(realresult, result) 60 self.assertTrue(type(realresult) is type(result)) 61 62 # if the original is returned make sure that 63 # this doesn't happen with subclasses 64 if realresult is object: 65 class usub(str): 66 def __repr__(self): 67 return 'usub(%r)' % str.__repr__(self) 68 object = usub(object) 69 method = getattr(object, methodname) 70 realresult = method(*args) 71 self.assertEqual(realresult, result) 72 self.assertTrue(object is not realresult) 73 74 def test_literals(self): 75 self.assertEqual('\xff', '\u00ff') 76 self.assertEqual('\uffff', '\U0000ffff') 77 self.assertRaises(SyntaxError, eval, '\'\\Ufffffffe\'') 78 self.assertRaises(SyntaxError, eval, '\'\\Uffffffff\'') 79 self.assertRaises(SyntaxError, eval, '\'\\U%08x\'' % 0x110000) 80 # raw strings should not have unicode escapes 81 self.assertNotEqual(r"\u0020", " ") 82 83 def test_ascii(self): 84 if not sys.platform.startswith('java'): 85 # Test basic sanity of repr() 86 self.assertEqual(ascii('abc'), "'abc'") 87 self.assertEqual(ascii('ab\\c'), "'ab\\\\c'") 88 self.assertEqual(ascii('ab\\'), "'ab\\\\'") 89 self.assertEqual(ascii('\\c'), "'\\\\c'") 90 self.assertEqual(ascii('\\'), "'\\\\'") 91 self.assertEqual(ascii('\n'), "'\\n'") 92 self.assertEqual(ascii('\r'), "'\\r'") 93 self.assertEqual(ascii('\t'), "'\\t'") 94 self.assertEqual(ascii('\b'), "'\\x08'") 95 self.assertEqual(ascii("'\""), """'\\'"'""") 96 self.assertEqual(ascii("'\""), """'\\'"'""") 97 self.assertEqual(ascii("'"), '''"'"''') 98 self.assertEqual(ascii('"'), """'"'""") 99 latin1repr = ( 100 
"'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r" 101 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a" 102 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI" 103 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f" 104 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d" 105 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b" 106 "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9" 107 "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7" 108 "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5" 109 "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3" 110 "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1" 111 "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef" 112 "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd" 113 "\\xfe\\xff'") 114 testrepr = ascii(''.join(map(chr, range(256)))) 115 self.assertEqual(testrepr, latin1repr) 116 # Test ascii works on wide unicode escapes without overflow. 
117 self.assertEqual(ascii("\U00010000" * 39 + "\uffff" * 4096), 118 ascii("\U00010000" * 39 + "\uffff" * 4096)) 119 120 class WrongRepr: 121 def __repr__(self): 122 return b'byte-repr' 123 self.assertRaises(TypeError, ascii, WrongRepr()) 124 125 def test_repr(self): 126 if not sys.platform.startswith('java'): 127 # Test basic sanity of repr() 128 self.assertEqual(repr('abc'), "'abc'") 129 self.assertEqual(repr('ab\\c'), "'ab\\\\c'") 130 self.assertEqual(repr('ab\\'), "'ab\\\\'") 131 self.assertEqual(repr('\\c'), "'\\\\c'") 132 self.assertEqual(repr('\\'), "'\\\\'") 133 self.assertEqual(repr('\n'), "'\\n'") 134 self.assertEqual(repr('\r'), "'\\r'") 135 self.assertEqual(repr('\t'), "'\\t'") 136 self.assertEqual(repr('\b'), "'\\x08'") 137 self.assertEqual(repr("'\""), """'\\'"'""") 138 self.assertEqual(repr("'\""), """'\\'"'""") 139 self.assertEqual(repr("'"), '''"'"''') 140 self.assertEqual(repr('"'), """'"'""") 141 latin1repr = ( 142 "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r" 143 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a" 144 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI" 145 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f" 146 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d" 147 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b" 148 "\\x9c\\x9d\\x9e\\x9f\\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9" 149 "\xaa\xab\xac\\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7" 150 "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5" 151 "\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3" 152 "\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1" 153 "\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef" 154 "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd" 155 "\xfe\xff'") 156 testrepr = repr(''.join(map(chr, range(256)))) 157 self.assertEqual(testrepr, latin1repr) 158 # Test repr works on wide unicode escapes without 
overflow. 159 self.assertEqual(repr("\U00010000" * 39 + "\uffff" * 4096), 160 repr("\U00010000" * 39 + "\uffff" * 4096)) 161 162 class WrongRepr: 163 def __repr__(self): 164 return b'byte-repr' 165 self.assertRaises(TypeError, repr, WrongRepr()) 166 167 def test_iterators(self): 168 # Make sure unicode objects have an __iter__ method 169 it = "\u1111\u2222\u3333".__iter__() 170 self.assertEqual(next(it), "\u1111") 171 self.assertEqual(next(it), "\u2222") 172 self.assertEqual(next(it), "\u3333") 173 self.assertRaises(StopIteration, next, it) 174 175 def test_count(self): 176 string_tests.CommonTest.test_count(self) 177 # check mixed argument types 178 self.checkequalnofix(3, 'aaa', 'count', 'a') 179 self.checkequalnofix(0, 'aaa', 'count', 'b') 180 self.checkequalnofix(3, 'aaa', 'count', 'a') 181 self.checkequalnofix(0, 'aaa', 'count', 'b') 182 self.checkequalnofix(0, 'aaa', 'count', 'b') 183 self.checkequalnofix(1, 'aaa', 'count', 'a', -1) 184 self.checkequalnofix(3, 'aaa', 'count', 'a', -10) 185 self.checkequalnofix(2, 'aaa', 'count', 'a', 0, -1) 186 self.checkequalnofix(0, 'aaa', 'count', 'a', 0, -10) 187 # test mixed kinds 188 self.checkequal(10, '\u0102' + 'a' * 10, 'count', 'a') 189 self.checkequal(10, '\U00100304' + 'a' * 10, 'count', 'a') 190 self.checkequal(10, '\U00100304' + '\u0102' * 10, 'count', '\u0102') 191 self.checkequal(0, 'a' * 10, 'count', '\u0102') 192 self.checkequal(0, 'a' * 10, 'count', '\U00100304') 193 self.checkequal(0, '\u0102' * 10, 'count', '\U00100304') 194 self.checkequal(10, '\u0102' + 'a_' * 10, 'count', 'a_') 195 self.checkequal(10, '\U00100304' + 'a_' * 10, 'count', 'a_') 196 self.checkequal(10, '\U00100304' + '\u0102_' * 10, 'count', '\u0102_') 197 self.checkequal(0, 'a' * 10, 'count', 'a\u0102') 198 self.checkequal(0, 'a' * 10, 'count', 'a\U00100304') 199 self.checkequal(0, '\u0102' * 10, 'count', '\u0102\U00100304') 200 201 def test_find(self): 202 string_tests.CommonTest.test_find(self) 203 # test implementation details of the 
memchr fast path 204 self.checkequal(100, 'a' * 100 + '\u0102', 'find', '\u0102') 205 self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0201') 206 self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0120') 207 self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0220') 208 self.checkequal(100, 'a' * 100 + '\U00100304', 'find', '\U00100304') 209 self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00100204') 210 self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00102004') 211 # check mixed argument types 212 self.checkequalnofix(0, 'abcdefghiabc', 'find', 'abc') 213 self.checkequalnofix(9, 'abcdefghiabc', 'find', 'abc', 1) 214 self.checkequalnofix(-1, 'abcdefghiabc', 'find', 'def', 4) 215 216 self.assertRaises(TypeError, 'hello'.find) 217 self.assertRaises(TypeError, 'hello'.find, 42) 218 # test mixed kinds 219 self.checkequal(100, '\u0102' * 100 + 'a', 'find', 'a') 220 self.checkequal(100, '\U00100304' * 100 + 'a', 'find', 'a') 221 self.checkequal(100, '\U00100304' * 100 + '\u0102', 'find', '\u0102') 222 self.checkequal(-1, 'a' * 100, 'find', '\u0102') 223 self.checkequal(-1, 'a' * 100, 'find', '\U00100304') 224 self.checkequal(-1, '\u0102' * 100, 'find', '\U00100304') 225 self.checkequal(100, '\u0102' * 100 + 'a_', 'find', 'a_') 226 self.checkequal(100, '\U00100304' * 100 + 'a_', 'find', 'a_') 227 self.checkequal(100, '\U00100304' * 100 + '\u0102_', 'find', '\u0102_') 228 self.checkequal(-1, 'a' * 100, 'find', 'a\u0102') 229 self.checkequal(-1, 'a' * 100, 'find', 'a\U00100304') 230 self.checkequal(-1, '\u0102' * 100, 'find', '\u0102\U00100304') 231 232 def test_rfind(self): 233 string_tests.CommonTest.test_rfind(self) 234 # test implementation details of the memrchr fast path 235 self.checkequal(0, '\u0102' + 'a' * 100 , 'rfind', '\u0102') 236 self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0201') 237 self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0120') 238 self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0220') 239 
self.checkequal(0, '\U00100304' + 'a' * 100, 'rfind', '\U00100304') 240 self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00100204') 241 self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00102004') 242 # check mixed argument types 243 self.checkequalnofix(9, 'abcdefghiabc', 'rfind', 'abc') 244 self.checkequalnofix(12, 'abcdefghiabc', 'rfind', '') 245 self.checkequalnofix(12, 'abcdefghiabc', 'rfind', '') 246 # test mixed kinds 247 self.checkequal(0, 'a' + '\u0102' * 100, 'rfind', 'a') 248 self.checkequal(0, 'a' + '\U00100304' * 100, 'rfind', 'a') 249 self.checkequal(0, '\u0102' + '\U00100304' * 100, 'rfind', '\u0102') 250 self.checkequal(-1, 'a' * 100, 'rfind', '\u0102') 251 self.checkequal(-1, 'a' * 100, 'rfind', '\U00100304') 252 self.checkequal(-1, '\u0102' * 100, 'rfind', '\U00100304') 253 self.checkequal(0, '_a' + '\u0102' * 100, 'rfind', '_a') 254 self.checkequal(0, '_a' + '\U00100304' * 100, 'rfind', '_a') 255 self.checkequal(0, '_\u0102' + '\U00100304' * 100, 'rfind', '_\u0102') 256 self.checkequal(-1, 'a' * 100, 'rfind', '\u0102a') 257 self.checkequal(-1, 'a' * 100, 'rfind', '\U00100304a') 258 self.checkequal(-1, '\u0102' * 100, 'rfind', '\U00100304\u0102') 259 260 def test_index(self): 261 string_tests.CommonTest.test_index(self) 262 self.checkequalnofix(0, 'abcdefghiabc', 'index', '') 263 self.checkequalnofix(3, 'abcdefghiabc', 'index', 'def') 264 self.checkequalnofix(0, 'abcdefghiabc', 'index', 'abc') 265 self.checkequalnofix(9, 'abcdefghiabc', 'index', 'abc', 1) 266 self.assertRaises(ValueError, 'abcdefghiabc'.index, 'hib') 267 self.assertRaises(ValueError, 'abcdefghiab'.index, 'abc', 1) 268 self.assertRaises(ValueError, 'abcdefghi'.index, 'ghi', 8) 269 self.assertRaises(ValueError, 'abcdefghi'.index, 'ghi', -1) 270 # test mixed kinds 271 self.checkequal(100, '\u0102' * 100 + 'a', 'index', 'a') 272 self.checkequal(100, '\U00100304' * 100 + 'a', 'index', 'a') 273 self.checkequal(100, '\U00100304' * 100 + '\u0102', 'index', '\u0102') 274 
self.assertRaises(ValueError, ('a' * 100).index, '\u0102') 275 self.assertRaises(ValueError, ('a' * 100).index, '\U00100304') 276 self.assertRaises(ValueError, ('\u0102' * 100).index, '\U00100304') 277 self.checkequal(100, '\u0102' * 100 + 'a_', 'index', 'a_') 278 self.checkequal(100, '\U00100304' * 100 + 'a_', 'index', 'a_') 279 self.checkequal(100, '\U00100304' * 100 + '\u0102_', 'index', '\u0102_') 280 self.assertRaises(ValueError, ('a' * 100).index, 'a\u0102') 281 self.assertRaises(ValueError, ('a' * 100).index, 'a\U00100304') 282 self.assertRaises(ValueError, ('\u0102' * 100).index, '\u0102\U00100304') 283 284 def test_rindex(self): 285 string_tests.CommonTest.test_rindex(self) 286 self.checkequalnofix(12, 'abcdefghiabc', 'rindex', '') 287 self.checkequalnofix(3, 'abcdefghiabc', 'rindex', 'def') 288 self.checkequalnofix(9, 'abcdefghiabc', 'rindex', 'abc') 289 self.checkequalnofix(0, 'abcdefghiabc', 'rindex', 'abc', 0, -1) 290 291 self.assertRaises(ValueError, 'abcdefghiabc'.rindex, 'hib') 292 self.assertRaises(ValueError, 'defghiabc'.rindex, 'def', 1) 293 self.assertRaises(ValueError, 'defghiabc'.rindex, 'abc', 0, -1) 294 self.assertRaises(ValueError, 'abcdefghi'.rindex, 'ghi', 0, 8) 295 self.assertRaises(ValueError, 'abcdefghi'.rindex, 'ghi', 0, -1) 296 # test mixed kinds 297 self.checkequal(0, 'a' + '\u0102' * 100, 'rindex', 'a') 298 self.checkequal(0, 'a' + '\U00100304' * 100, 'rindex', 'a') 299 self.checkequal(0, '\u0102' + '\U00100304' * 100, 'rindex', '\u0102') 300 self.assertRaises(ValueError, ('a' * 100).rindex, '\u0102') 301 self.assertRaises(ValueError, ('a' * 100).rindex, '\U00100304') 302 self.assertRaises(ValueError, ('\u0102' * 100).rindex, '\U00100304') 303 self.checkequal(0, '_a' + '\u0102' * 100, 'rindex', '_a') 304 self.checkequal(0, '_a' + '\U00100304' * 100, 'rindex', '_a') 305 self.checkequal(0, '_\u0102' + '\U00100304' * 100, 'rindex', '_\u0102') 306 self.assertRaises(ValueError, ('a' * 100).rindex, '\u0102a') 307 
self.assertRaises(ValueError, ('a' * 100).rindex, '\U00100304a') 308 self.assertRaises(ValueError, ('\u0102' * 100).rindex, '\U00100304\u0102') 309 310 def test_maketrans_translate(self): 311 # these work with plain translate() 312 self.checkequalnofix('bbbc', 'abababc', 'translate', 313 {ord('a'): None}) 314 self.checkequalnofix('iiic', 'abababc', 'translate', 315 {ord('a'): None, ord('b'): ord('i')}) 316 self.checkequalnofix('iiix', 'abababc', 'translate', 317 {ord('a'): None, ord('b'): ord('i'), ord('c'): 'x'}) 318 self.checkequalnofix('c', 'abababc', 'translate', 319 {ord('a'): None, ord('b'): ''}) 320 self.checkequalnofix('xyyx', 'xzx', 'translate', 321 {ord('z'): 'yy'}) 322 323 # this needs maketrans() 324 self.checkequalnofix('abababc', 'abababc', 'translate', 325 {'b': '<i>'}) 326 tbl = self.type2test.maketrans({'a': None, 'b': '<i>'}) 327 self.checkequalnofix('<i><i><i>c', 'abababc', 'translate', tbl) 328 # test alternative way of calling maketrans() 329 tbl = self.type2test.maketrans('abc', 'xyz', 'd') 330 self.checkequalnofix('xyzzy', 'abdcdcbdddd', 'translate', tbl) 331 332 # various tests switching from ASCII to latin1 or the opposite; 333 # same length, remove a letter, or replace with a longer string. 
334 self.assertEqual("[a]".translate(str.maketrans('a', 'X')), 335 "[X]") 336 self.assertEqual("[a]".translate(str.maketrans({'a': 'X'})), 337 "[X]") 338 self.assertEqual("[a]".translate(str.maketrans({'a': None})), 339 "[]") 340 self.assertEqual("[a]".translate(str.maketrans({'a': 'XXX'})), 341 "[XXX]") 342 self.assertEqual("[a]".translate(str.maketrans({'a': '\xe9'})), 343 "[\xe9]") 344 self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '123'})), 345 "x123") 346 self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '\xe9'})), 347 "x\xe9") 348 349 # test non-ASCII (don't take the fast-path) 350 self.assertEqual("[a]".translate(str.maketrans({'a': '<\xe9>'})), 351 "[<\xe9>]") 352 self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': 'a'})), 353 "[a]") 354 self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': None})), 355 "[]") 356 self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': '123'})), 357 "[123]") 358 self.assertEqual("[a\xe9]".translate(str.maketrans({'a': '<\u20ac>'})), 359 "[<\u20ac>\xe9]") 360 361 # invalid Unicode characters 362 invalid_char = 0x10ffff+1 363 for before in "a\xe9\u20ac\U0010ffff": 364 mapping = str.maketrans({before: invalid_char}) 365 text = "[%s]" % before 366 self.assertRaises(ValueError, text.translate, mapping) 367 368 # errors 369 self.assertRaises(TypeError, self.type2test.maketrans) 370 self.assertRaises(ValueError, self.type2test.maketrans, 'abc', 'defg') 371 self.assertRaises(TypeError, self.type2test.maketrans, 2, 'def') 372 self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 2) 373 self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 'def', 2) 374 self.assertRaises(ValueError, self.type2test.maketrans, {'xy': 2}) 375 self.assertRaises(TypeError, self.type2test.maketrans, {(1,): 2}) 376 377 self.assertRaises(TypeError, 'hello'.translate) 378 self.assertRaises(TypeError, 'abababc'.translate, 'abc', 'xyz') 379 380 def test_split(self): 381 
string_tests.CommonTest.test_split(self) 382 383 # test mixed kinds 384 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'): 385 left *= 9 386 right *= 9 387 for delim in ('c', '\u0102', '\U00010302'): 388 self.checkequal([left + right], 389 left + right, 'split', delim) 390 self.checkequal([left, right], 391 left + delim + right, 'split', delim) 392 self.checkequal([left + right], 393 left + right, 'split', delim * 2) 394 self.checkequal([left, right], 395 left + delim * 2 + right, 'split', delim *2) 396 397 def test_rsplit(self): 398 string_tests.CommonTest.test_rsplit(self) 399 # test mixed kinds 400 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'): 401 left *= 9 402 right *= 9 403 for delim in ('c', '\u0102', '\U00010302'): 404 self.checkequal([left + right], 405 left + right, 'rsplit', delim) 406 self.checkequal([left, right], 407 left + delim + right, 'rsplit', delim) 408 self.checkequal([left + right], 409 left + right, 'rsplit', delim * 2) 410 self.checkequal([left, right], 411 left + delim * 2 + right, 'rsplit', delim *2) 412 413 def test_partition(self): 414 string_tests.MixinStrUnicodeUserStringTest.test_partition(self) 415 # test mixed kinds 416 self.checkequal(('ABCDEFGH', '', ''), 'ABCDEFGH', 'partition', '\u4200') 417 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'): 418 left *= 9 419 right *= 9 420 for delim in ('c', '\u0102', '\U00010302'): 421 self.checkequal((left + right, '', ''), 422 left + right, 'partition', delim) 423 self.checkequal((left, delim, right), 424 left + delim + right, 'partition', delim) 425 self.checkequal((left + right, '', ''), 426 left + right, 'partition', delim * 2) 427 self.checkequal((left, delim * 2, right), 428 left + delim * 2 + right, 'partition', delim * 2) 429 430 def test_rpartition(self): 431 string_tests.MixinStrUnicodeUserStringTest.test_rpartition(self) 432 # test mixed kinds 433 self.checkequal(('', '', 'ABCDEFGH'), 'ABCDEFGH', 'rpartition', '\u4200') 434 for 
left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'): 435 left *= 9 436 right *= 9 437 for delim in ('c', '\u0102', '\U00010302'): 438 self.checkequal(('', '', left + right), 439 left + right, 'rpartition', delim) 440 self.checkequal((left, delim, right), 441 left + delim + right, 'rpartition', delim) 442 self.checkequal(('', '', left + right), 443 left + right, 'rpartition', delim * 2) 444 self.checkequal((left, delim * 2, right), 445 left + delim * 2 + right, 'rpartition', delim * 2) 446 447 def test_join(self): 448 string_tests.MixinStrUnicodeUserStringTest.test_join(self) 449 450 class MyWrapper: 451 def __init__(self, sval): self.sval = sval 452 def __str__(self): return self.sval 453 454 # mixed arguments 455 self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd']) 456 self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd')) 457 self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz')) 458 self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd']) 459 self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd']) 460 self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd')) 461 self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz')) 462 self.checkraises(TypeError, ' ', 'join', ['1', '2', MyWrapper('foo')]) 463 self.checkraises(TypeError, ' ', 'join', ['1', '2', '3', bytes()]) 464 self.checkraises(TypeError, ' ', 'join', [1, 2, 3]) 465 self.checkraises(TypeError, ' ', 'join', ['1', '2', 3]) 466 467 @unittest.skipIf(sys.maxsize > 2**32, 468 'needs too much memory on a 64-bit platform') 469 def test_join_overflow(self): 470 size = int(sys.maxsize**0.5) + 1 471 seq = ('A' * size,) * size 472 self.assertRaises(OverflowError, ''.join, seq) 473 474 def test_replace(self): 475 string_tests.CommonTest.test_replace(self) 476 477 # method call forwarded from str implementation because of unicode argument 478 self.checkequalnofix('one@two!three!', 'one!two!three!', 'replace', '!', '@', 1) 479 
self.assertRaises(TypeError, 'replace'.replace, "r", 42) 480 # test mixed kinds 481 for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'): 482 left *= 9 483 right *= 9 484 for delim in ('c', '\u0102', '\U00010302'): 485 for repl in ('d', '\u0103', '\U00010303'): 486 self.checkequal(left + right, 487 left + right, 'replace', delim, repl) 488 self.checkequal(left + repl + right, 489 left + delim + right, 490 'replace', delim, repl) 491 self.checkequal(left + right, 492 left + right, 'replace', delim * 2, repl) 493 self.checkequal(left + repl + right, 494 left + delim * 2 + right, 495 'replace', delim * 2, repl) 496 497 @support.cpython_only 498 def test_replace_id(self): 499 pattern = 'abc' 500 text = 'abc def' 501 self.assertIs(text.replace(pattern, pattern), text) 502 503 def test_bytes_comparison(self): 504 with support.check_warnings(): 505 warnings.simplefilter('ignore', BytesWarning) 506 self.assertEqual('abc' == b'abc', False) 507 self.assertEqual('abc' != b'abc', True) 508 self.assertEqual('abc' == bytearray(b'abc'), False) 509 self.assertEqual('abc' != bytearray(b'abc'), True) 510 511 def test_comparison(self): 512 # Comparisons: 513 self.assertEqual('abc', 'abc') 514 self.assertTrue('abcd' > 'abc') 515 self.assertTrue('abc' < 'abcd') 516 517 if 0: 518 # Move these tests to a Unicode collation module test... 519 # Testing UTF-16 code point order comparisons... 520 521 # No surrogates, no fixup required. 
522 self.assertTrue('\u0061' < '\u20ac') 523 # Non surrogate below surrogate value, no fixup required 524 self.assertTrue('\u0061' < '\ud800\udc02') 525 526 # Non surrogate above surrogate value, fixup required 527 def test_lecmp(s, s2): 528 self.assertTrue(s < s2) 529 530 def test_fixup(s): 531 s2 = '\ud800\udc01' 532 test_lecmp(s, s2) 533 s2 = '\ud900\udc01' 534 test_lecmp(s, s2) 535 s2 = '\uda00\udc01' 536 test_lecmp(s, s2) 537 s2 = '\udb00\udc01' 538 test_lecmp(s, s2) 539 s2 = '\ud800\udd01' 540 test_lecmp(s, s2) 541 s2 = '\ud900\udd01' 542 test_lecmp(s, s2) 543 s2 = '\uda00\udd01' 544 test_lecmp(s, s2) 545 s2 = '\udb00\udd01' 546 test_lecmp(s, s2) 547 s2 = '\ud800\ude01' 548 test_lecmp(s, s2) 549 s2 = '\ud900\ude01' 550 test_lecmp(s, s2) 551 s2 = '\uda00\ude01' 552 test_lecmp(s, s2) 553 s2 = '\udb00\ude01' 554 test_lecmp(s, s2) 555 s2 = '\ud800\udfff' 556 test_lecmp(s, s2) 557 s2 = '\ud900\udfff' 558 test_lecmp(s, s2) 559 s2 = '\uda00\udfff' 560 test_lecmp(s, s2) 561 s2 = '\udb00\udfff' 562 test_lecmp(s, s2) 563 564 test_fixup('\ue000') 565 test_fixup('\uff61') 566 567 # Surrogates on both sides, no fixup required 568 self.assertTrue('\ud800\udc02' < '\ud84d\udc56') 569 570 def test_islower(self): 571 super().test_islower() 572 self.checkequalnofix(False, '\u1FFc', 'islower') 573 self.assertFalse('\u2167'.islower()) 574 self.assertTrue('\u2177'.islower()) 575 # non-BMP, uppercase 576 self.assertFalse('\U00010401'.islower()) 577 self.assertFalse('\U00010427'.islower()) 578 # non-BMP, lowercase 579 self.assertTrue('\U00010429'.islower()) 580 self.assertTrue('\U0001044E'.islower()) 581 # non-BMP, non-cased 582 self.assertFalse('\U0001F40D'.islower()) 583 self.assertFalse('\U0001F46F'.islower()) 584 585 def test_isupper(self): 586 super().test_isupper() 587 if not sys.platform.startswith('java'): 588 self.checkequalnofix(False, '\u1FFc', 'isupper') 589 self.assertTrue('\u2167'.isupper()) 590 self.assertFalse('\u2177'.isupper()) 591 # non-BMP, uppercase 592 
self.assertTrue('\U00010401'.isupper()) 593 self.assertTrue('\U00010427'.isupper()) 594 # non-BMP, lowercase 595 self.assertFalse('\U00010429'.isupper()) 596 self.assertFalse('\U0001044E'.isupper()) 597 # non-BMP, non-cased 598 self.assertFalse('\U0001F40D'.isupper()) 599 self.assertFalse('\U0001F46F'.isupper()) 600 601 def test_istitle(self): 602 super().test_istitle() 603 self.checkequalnofix(True, '\u1FFc', 'istitle') 604 self.checkequalnofix(True, 'Greek \u1FFcitlecases ...', 'istitle') 605 606 # non-BMP, uppercase + lowercase 607 self.assertTrue('\U00010401\U00010429'.istitle()) 608 self.assertTrue('\U00010427\U0001044E'.istitle()) 609 # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6 610 for ch in ['\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F']: 611 self.assertFalse(ch.istitle(), '{!a} is not title'.format(ch)) 612 613 def test_isspace(self): 614 super().test_isspace() 615 self.checkequalnofix(True, '\u2000', 'isspace') 616 self.checkequalnofix(True, '\u200a', 'isspace') 617 self.checkequalnofix(False, '\u2014', 'isspace') 618 # apparently there are no non-BMP spaces chars in Unicode 6 619 for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E', 620 '\U0001F40D', '\U0001F46F']: 621 self.assertFalse(ch.isspace(), '{!a} is not space.'.format(ch)) 622 623 def test_isalnum(self): 624 super().test_isalnum() 625 for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E', 626 '\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']: 627 self.assertTrue(ch.isalnum(), '{!a} is alnum.'.format(ch)) 628 629 def test_isalpha(self): 630 super().test_isalpha() 631 self.checkequalnofix(True, '\u1FFc', 'isalpha') 632 # non-BMP, cased 633 self.assertTrue('\U00010401'.isalpha()) 634 self.assertTrue('\U00010427'.isalpha()) 635 self.assertTrue('\U00010429'.isalpha()) 636 self.assertTrue('\U0001044E'.isalpha()) 637 # non-BMP, non-cased 638 self.assertFalse('\U0001F40D'.isalpha()) 639 self.assertFalse('\U0001F46F'.isalpha()) 640 641 def 
test_isascii(self): 642 super().test_isascii() 643 self.assertFalse("\u20ac".isascii()) 644 self.assertFalse("\U0010ffff".isascii()) 645 646 def test_isdecimal(self): 647 self.checkequalnofix(False, '', 'isdecimal') 648 self.checkequalnofix(False, 'a', 'isdecimal') 649 self.checkequalnofix(True, '0', 'isdecimal') 650 self.checkequalnofix(False, '\u2460', 'isdecimal') # CIRCLED DIGIT ONE 651 self.checkequalnofix(False, '\xbc', 'isdecimal') # VULGAR FRACTION ONE QUARTER 652 self.checkequalnofix(True, '\u0660', 'isdecimal') # ARABIC-INDIC DIGIT ZERO 653 self.checkequalnofix(True, '0123456789', 'isdecimal') 654 self.checkequalnofix(False, '0123456789a', 'isdecimal') 655 656 self.checkraises(TypeError, 'abc', 'isdecimal', 42) 657 658 for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E', 659 '\U0001F40D', '\U0001F46F', '\U00011065', '\U0001F107']: 660 self.assertFalse(ch.isdecimal(), '{!a} is not decimal.'.format(ch)) 661 for ch in ['\U0001D7F6', '\U00011066', '\U000104A0']: 662 self.assertTrue(ch.isdecimal(), '{!a} is decimal.'.format(ch)) 663 664 def test_isdigit(self): 665 super().test_isdigit() 666 self.checkequalnofix(True, '\u2460', 'isdigit') 667 self.checkequalnofix(False, '\xbc', 'isdigit') 668 self.checkequalnofix(True, '\u0660', 'isdigit') 669 670 for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E', 671 '\U0001F40D', '\U0001F46F', '\U00011065']: 672 self.assertFalse(ch.isdigit(), '{!a} is not a digit.'.format(ch)) 673 for ch in ['\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']: 674 self.assertTrue(ch.isdigit(), '{!a} is a digit.'.format(ch)) 675 676 def test_isnumeric(self): 677 self.checkequalnofix(False, '', 'isnumeric') 678 self.checkequalnofix(False, 'a', 'isnumeric') 679 self.checkequalnofix(True, '0', 'isnumeric') 680 self.checkequalnofix(True, '\u2460', 'isnumeric') 681 self.checkequalnofix(True, '\xbc', 'isnumeric') 682 self.checkequalnofix(True, '\u0660', 'isnumeric') 683 self.checkequalnofix(True, '0123456789', 
'isnumeric') 684 self.checkequalnofix(False, '0123456789a', 'isnumeric') 685 686 self.assertRaises(TypeError, "abc".isnumeric, 42) 687 688 for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E', 689 '\U0001F40D', '\U0001F46F']: 690 self.assertFalse(ch.isnumeric(), '{!a} is not numeric.'.format(ch)) 691 for ch in ['\U00011065', '\U0001D7F6', '\U00011066', 692 '\U000104A0', '\U0001F107']: 693 self.assertTrue(ch.isnumeric(), '{!a} is numeric.'.format(ch)) 694 695 def test_isidentifier(self): 696 self.assertTrue("a".isidentifier()) 697 self.assertTrue("Z".isidentifier()) 698 self.assertTrue("_".isidentifier()) 699 self.assertTrue("b0".isidentifier()) 700 self.assertTrue("bc".isidentifier()) 701 self.assertTrue("b_".isidentifier()) 702 self.assertTrue("µ".isidentifier()) 703 self.assertTrue("".isidentifier()) 704 705 self.assertFalse(" ".isidentifier()) 706 self.assertFalse("[".isidentifier()) 707 self.assertFalse("©".isidentifier()) 708 self.assertFalse("0".isidentifier()) 709 710 def test_isprintable(self): 711 self.assertTrue("".isprintable()) 712 self.assertTrue(" ".isprintable()) 713 self.assertTrue("abcdefg".isprintable()) 714 self.assertFalse("abcdefg\n".isprintable()) 715 # some defined Unicode character 716 self.assertTrue("\u0374".isprintable()) 717 # undefined character 718 self.assertFalse("\u0378".isprintable()) 719 # single surrogate character 720 self.assertFalse("\ud800".isprintable()) 721 722 self.assertTrue('\U0001F46F'.isprintable()) 723 self.assertFalse('\U000E0020'.isprintable()) 724 725 def test_surrogates(self): 726 for s in ('a\uD800b\uDFFF', 'a\uDFFFb\uD800', 727 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'): 728 self.assertTrue(s.islower()) 729 self.assertFalse(s.isupper()) 730 self.assertFalse(s.istitle()) 731 for s in ('A\uD800B\uDFFF', 'A\uDFFFB\uD800', 732 'A\uD800B\uDFFFA', 'A\uDFFFB\uD800A'): 733 self.assertFalse(s.islower()) 734 self.assertTrue(s.isupper()) 735 self.assertTrue(s.istitle()) 736 737 for meth_name in ('islower', 
'isupper', 'istitle'): 738 meth = getattr(str, meth_name) 739 for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF'): 740 self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name)) 741 742 for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace', 743 'isdecimal', 'isnumeric', 744 'isidentifier', 'isprintable'): 745 meth = getattr(str, meth_name) 746 for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF', 747 'a\uD800b\uDFFF', 'a\uDFFFb\uD800', 748 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'): 749 self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name)) 750 751 752 def test_lower(self): 753 string_tests.CommonTest.test_lower(self) 754 self.assertEqual('\U00010427'.lower(), '\U0001044F') 755 self.assertEqual('\U00010427\U00010427'.lower(), 756 '\U0001044F\U0001044F') 757 self.assertEqual('\U00010427\U0001044F'.lower(), 758 '\U0001044F\U0001044F') 759 self.assertEqual('X\U00010427x\U0001044F'.lower(), 760 'x\U0001044Fx\U0001044F') 761 self.assertEqual('fi'.lower(), 'fi') 762 self.assertEqual('\u0130'.lower(), '\u0069\u0307') 763 # Special case for GREEK CAPITAL LETTER SIGMA U+03A3 764 self.assertEqual('\u03a3'.lower(), '\u03c3') 765 self.assertEqual('\u0345\u03a3'.lower(), '\u0345\u03c3') 766 self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2') 767 self.assertEqual('A\u0345\u03a3a'.lower(), 'a\u0345\u03c3a') 768 self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2') 769 self.assertEqual('A\u03a3\u0345'.lower(), 'a\u03c2\u0345') 770 self.assertEqual('\u03a3\u0345 '.lower(), '\u03c3\u0345 ') 771 self.assertEqual('\U0008fffe'.lower(), '\U0008fffe') 772 self.assertEqual('\u2177'.lower(), '\u2177') 773 774 def test_casefold(self): 775 self.assertEqual('hello'.casefold(), 'hello') 776 self.assertEqual('hELlo'.casefold(), 'hello') 777 self.assertEqual('ß'.casefold(), 'ss') 778 self.assertEqual('fi'.casefold(), 'fi') 779 self.assertEqual('\u03a3'.casefold(), '\u03c3') 780 self.assertEqual('A\u0345\u03a3'.casefold(), 'a\u03b9\u03c3') 781 
self.assertEqual('\u00b5'.casefold(), '\u03bc') 782 783 def test_upper(self): 784 string_tests.CommonTest.test_upper(self) 785 self.assertEqual('\U0001044F'.upper(), '\U00010427') 786 self.assertEqual('\U0001044F\U0001044F'.upper(), 787 '\U00010427\U00010427') 788 self.assertEqual('\U00010427\U0001044F'.upper(), 789 '\U00010427\U00010427') 790 self.assertEqual('X\U00010427x\U0001044F'.upper(), 791 'X\U00010427X\U00010427') 792 self.assertEqual('fi'.upper(), 'FI') 793 self.assertEqual('\u0130'.upper(), '\u0130') 794 self.assertEqual('\u03a3'.upper(), '\u03a3') 795 self.assertEqual('ß'.upper(), 'SS') 796 self.assertEqual('\u1fd2'.upper(), '\u0399\u0308\u0300') 797 self.assertEqual('\U0008fffe'.upper(), '\U0008fffe') 798 self.assertEqual('\u2177'.upper(), '\u2167') 799 800 def test_capitalize(self): 801 string_tests.CommonTest.test_capitalize(self) 802 self.assertEqual('\U0001044F'.capitalize(), '\U00010427') 803 self.assertEqual('\U0001044F\U0001044F'.capitalize(), 804 '\U00010427\U0001044F') 805 self.assertEqual('\U00010427\U0001044F'.capitalize(), 806 '\U00010427\U0001044F') 807 self.assertEqual('\U0001044F\U00010427'.capitalize(), 808 '\U00010427\U0001044F') 809 self.assertEqual('X\U00010427x\U0001044F'.capitalize(), 810 'X\U0001044Fx\U0001044F') 811 self.assertEqual('h\u0130'.capitalize(), 'H\u0069\u0307') 812 exp = '\u0399\u0308\u0300\u0069\u0307' 813 self.assertEqual('\u1fd2\u0130'.capitalize(), exp) 814 self.assertEqual('finnish'.capitalize(), 'FInnish') 815 self.assertEqual('A\u0345\u03a3'.capitalize(), 'A\u0345\u03c2') 816 817 def test_title(self): 818 super().test_title() 819 self.assertEqual('\U0001044F'.title(), '\U00010427') 820 self.assertEqual('\U0001044F\U0001044F'.title(), 821 '\U00010427\U0001044F') 822 self.assertEqual('\U0001044F\U0001044F \U0001044F\U0001044F'.title(), 823 '\U00010427\U0001044F \U00010427\U0001044F') 824 self.assertEqual('\U00010427\U0001044F \U00010427\U0001044F'.title(), 825 '\U00010427\U0001044F \U00010427\U0001044F') 826 
self.assertEqual('\U0001044F\U00010427 \U0001044F\U00010427'.title(), 827 '\U00010427\U0001044F \U00010427\U0001044F') 828 self.assertEqual('X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(), 829 'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F') 830 self.assertEqual('fiNNISH'.title(), 'Finnish') 831 self.assertEqual('A\u03a3 \u1fa1xy'.title(), 'A\u03c2 \u1fa9xy') 832 self.assertEqual('A\u03a3A'.title(), 'A\u03c3a') 833 834 def test_swapcase(self): 835 string_tests.CommonTest.test_swapcase(self) 836 self.assertEqual('\U0001044F'.swapcase(), '\U00010427') 837 self.assertEqual('\U00010427'.swapcase(), '\U0001044F') 838 self.assertEqual('\U0001044F\U0001044F'.swapcase(), 839 '\U00010427\U00010427') 840 self.assertEqual('\U00010427\U0001044F'.swapcase(), 841 '\U0001044F\U00010427') 842 self.assertEqual('\U0001044F\U00010427'.swapcase(), 843 '\U00010427\U0001044F') 844 self.assertEqual('X\U00010427x\U0001044F'.swapcase(), 845 'x\U0001044FX\U00010427') 846 self.assertEqual('fi'.swapcase(), 'FI') 847 self.assertEqual('\u0130'.swapcase(), '\u0069\u0307') 848 # Special case for GREEK CAPITAL LETTER SIGMA U+03A3 849 self.assertEqual('\u03a3'.swapcase(), '\u03c3') 850 self.assertEqual('\u0345\u03a3'.swapcase(), '\u0399\u03c3') 851 self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2') 852 self.assertEqual('A\u0345\u03a3a'.swapcase(), 'a\u0399\u03c3A') 853 self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2') 854 self.assertEqual('A\u03a3\u0345'.swapcase(), 'a\u03c2\u0399') 855 self.assertEqual('\u03a3\u0345 '.swapcase(), '\u03c3\u0399 ') 856 self.assertEqual('\u03a3'.swapcase(), '\u03c3') 857 self.assertEqual('ß'.swapcase(), 'SS') 858 self.assertEqual('\u1fd2'.swapcase(), '\u0399\u0308\u0300') 859 860 def test_center(self): 861 string_tests.CommonTest.test_center(self) 862 self.assertEqual('x'.center(2, '\U0010FFFF'), 863 'x\U0010FFFF') 864 self.assertEqual('x'.center(3, '\U0010FFFF'), 865 '\U0010FFFFx\U0010FFFF') 866 self.assertEqual('x'.center(4, 
'\U0010FFFF'), 867 '\U0010FFFFx\U0010FFFF\U0010FFFF') 868 869 @unittest.skipUnless(sys.maxsize == 2**31 - 1, "requires 32-bit system") 870 @support.cpython_only 871 def test_case_operation_overflow(self): 872 # Issue #22643 873 size = 2**32//12 + 1 874 try: 875 s = "ü" * size 876 except MemoryError: 877 self.skipTest('no enough memory (%.0f MiB required)' % (size / 2**20)) 878 try: 879 self.assertRaises(OverflowError, s.upper) 880 finally: 881 del s 882 883 def test_contains(self): 884 # Testing Unicode contains method 885 self.assertIn('a', 'abdb') 886 self.assertIn('a', 'bdab') 887 self.assertIn('a', 'bdaba') 888 self.assertIn('a', 'bdba') 889 self.assertNotIn('a', 'bdb') 890 self.assertIn('a', 'bdba') 891 self.assertIn('a', ('a',1,None)) 892 self.assertIn('a', (1,None,'a')) 893 self.assertIn('a', ('a',1,None)) 894 self.assertIn('a', (1,None,'a')) 895 self.assertNotIn('a', ('x',1,'y')) 896 self.assertNotIn('a', ('x',1,None)) 897 self.assertNotIn('abcd', 'abcxxxx') 898 self.assertIn('ab', 'abcd') 899 self.assertIn('ab', 'abc') 900 self.assertIn('ab', (1,None,'ab')) 901 self.assertIn('', 'abc') 902 self.assertIn('', '') 903 self.assertIn('', 'abc') 904 self.assertNotIn('\0', 'abc') 905 self.assertIn('\0', '\0abc') 906 self.assertIn('\0', 'abc\0') 907 self.assertIn('a', '\0abc') 908 self.assertIn('asdf', 'asdf') 909 self.assertNotIn('asdf', 'asd') 910 self.assertNotIn('asdf', '') 911 912 self.assertRaises(TypeError, "abc".__contains__) 913 # test mixed kinds 914 for fill in ('a', '\u0100', '\U00010300'): 915 fill *= 9 916 for delim in ('c', '\u0102', '\U00010302'): 917 self.assertNotIn(delim, fill) 918 self.assertIn(delim, fill + delim) 919 self.assertNotIn(delim * 2, fill) 920 self.assertIn(delim * 2, fill + delim * 2) 921 922 def test_issue18183(self): 923 '\U00010000\U00100000'.lower() 924 '\U00010000\U00100000'.casefold() 925 '\U00010000\U00100000'.upper() 926 '\U00010000\U00100000'.capitalize() 927 '\U00010000\U00100000'.title() 928 
'\U00010000\U00100000'.swapcase() 929 '\U00100000'.center(3, '\U00010000') 930 '\U00100000'.ljust(3, '\U00010000') 931 '\U00100000'.rjust(3, '\U00010000') 932 933 def test_format(self): 934 self.assertEqual(''.format(), '') 935 self.assertEqual('a'.format(), 'a') 936 self.assertEqual('ab'.format(), 'ab') 937 self.assertEqual('a{{'.format(), 'a{') 938 self.assertEqual('a}}'.format(), 'a}') 939 self.assertEqual('{{b'.format(), '{b') 940 self.assertEqual('}}b'.format(), '}b') 941 self.assertEqual('a{{b'.format(), 'a{b') 942 943 # examples from the PEP: 944 import datetime 945 self.assertEqual("My name is {0}".format('Fred'), "My name is Fred") 946 self.assertEqual("My name is {0[name]}".format(dict(name='Fred')), 947 "My name is Fred") 948 self.assertEqual("My name is {0} :-{{}}".format('Fred'), 949 "My name is Fred :-{}") 950 951 d = datetime.date(2007, 8, 18) 952 self.assertEqual("The year is {0.year}".format(d), 953 "The year is 2007") 954 955 # classes we'll use for testing 956 class C: 957 def __init__(self, x=100): 958 self._x = x 959 def __format__(self, spec): 960 return spec 961 962 class D: 963 def __init__(self, x): 964 self.x = x 965 def __format__(self, spec): 966 return str(self.x) 967 968 # class with __str__, but no __format__ 969 class E: 970 def __init__(self, x): 971 self.x = x 972 def __str__(self): 973 return 'E(' + self.x + ')' 974 975 # class with __repr__, but no __format__ or __str__ 976 class F: 977 def __init__(self, x): 978 self.x = x 979 def __repr__(self): 980 return 'F(' + self.x + ')' 981 982 # class with __format__ that forwards to string, for some format_spec's 983 class G: 984 def __init__(self, x): 985 self.x = x 986 def __str__(self): 987 return "string is " + self.x 988 def __format__(self, format_spec): 989 if format_spec == 'd': 990 return 'G(' + self.x + ')' 991 return object.__format__(self, format_spec) 992 993 class I(datetime.date): 994 def __format__(self, format_spec): 995 return self.strftime(format_spec) 996 997 class 
J(int): 998 def __format__(self, format_spec): 999 return int.__format__(self * 2, format_spec) 1000 1001 class M: 1002 def __init__(self, x): 1003 self.x = x 1004 def __repr__(self): 1005 return 'M(' + self.x + ')' 1006 __str__ = None 1007 1008 class N: 1009 def __init__(self, x): 1010 self.x = x 1011 def __repr__(self): 1012 return 'N(' + self.x + ')' 1013 __format__ = None 1014 1015 self.assertEqual(''.format(), '') 1016 self.assertEqual('abc'.format(), 'abc') 1017 self.assertEqual('{0}'.format('abc'), 'abc') 1018 self.assertEqual('{0:}'.format('abc'), 'abc') 1019# self.assertEqual('{ 0 }'.format('abc'), 'abc') 1020 self.assertEqual('X{0}'.format('abc'), 'Xabc') 1021 self.assertEqual('{0}X'.format('abc'), 'abcX') 1022 self.assertEqual('X{0}Y'.format('abc'), 'XabcY') 1023 self.assertEqual('{1}'.format(1, 'abc'), 'abc') 1024 self.assertEqual('X{1}'.format(1, 'abc'), 'Xabc') 1025 self.assertEqual('{1}X'.format(1, 'abc'), 'abcX') 1026 self.assertEqual('X{1}Y'.format(1, 'abc'), 'XabcY') 1027 self.assertEqual('{0}'.format(-15), '-15') 1028 self.assertEqual('{0}{1}'.format(-15, 'abc'), '-15abc') 1029 self.assertEqual('{0}X{1}'.format(-15, 'abc'), '-15Xabc') 1030 self.assertEqual('{{'.format(), '{') 1031 self.assertEqual('}}'.format(), '}') 1032 self.assertEqual('{{}}'.format(), '{}') 1033 self.assertEqual('{{x}}'.format(), '{x}') 1034 self.assertEqual('{{{0}}}'.format(123), '{123}') 1035 self.assertEqual('{{{{0}}}}'.format(), '{{0}}') 1036 self.assertEqual('}}{{'.format(), '}{') 1037 self.assertEqual('}}x{{'.format(), '}x{') 1038 1039 # weird field names 1040 self.assertEqual("{0[foo-bar]}".format({'foo-bar':'baz'}), 'baz') 1041 self.assertEqual("{0[foo bar]}".format({'foo bar':'baz'}), 'baz') 1042 self.assertEqual("{0[ ]}".format({' ':3}), '3') 1043 1044 self.assertEqual('{foo._x}'.format(foo=C(20)), '20') 1045 self.assertEqual('{1}{0}'.format(D(10), D(20)), '2010') 1046 self.assertEqual('{0._x.x}'.format(C(D('abc'))), 'abc') 1047 
self.assertEqual('{0[0]}'.format(['abc', 'def']), 'abc') 1048 self.assertEqual('{0[1]}'.format(['abc', 'def']), 'def') 1049 self.assertEqual('{0[1][0]}'.format(['abc', ['def']]), 'def') 1050 self.assertEqual('{0[1][0].x}'.format(['abc', [D('def')]]), 'def') 1051 1052 # strings 1053 self.assertEqual('{0:.3s}'.format('abc'), 'abc') 1054 self.assertEqual('{0:.3s}'.format('ab'), 'ab') 1055 self.assertEqual('{0:.3s}'.format('abcdef'), 'abc') 1056 self.assertEqual('{0:.0s}'.format('abcdef'), '') 1057 self.assertEqual('{0:3.3s}'.format('abc'), 'abc') 1058 self.assertEqual('{0:2.3s}'.format('abc'), 'abc') 1059 self.assertEqual('{0:2.2s}'.format('abc'), 'ab') 1060 self.assertEqual('{0:3.2s}'.format('abc'), 'ab ') 1061 self.assertEqual('{0:x<0s}'.format('result'), 'result') 1062 self.assertEqual('{0:x<5s}'.format('result'), 'result') 1063 self.assertEqual('{0:x<6s}'.format('result'), 'result') 1064 self.assertEqual('{0:x<7s}'.format('result'), 'resultx') 1065 self.assertEqual('{0:x<8s}'.format('result'), 'resultxx') 1066 self.assertEqual('{0: <7s}'.format('result'), 'result ') 1067 self.assertEqual('{0:<7s}'.format('result'), 'result ') 1068 self.assertEqual('{0:>7s}'.format('result'), ' result') 1069 self.assertEqual('{0:>8s}'.format('result'), ' result') 1070 self.assertEqual('{0:^8s}'.format('result'), ' result ') 1071 self.assertEqual('{0:^9s}'.format('result'), ' result ') 1072 self.assertEqual('{0:^10s}'.format('result'), ' result ') 1073 self.assertEqual('{0:10000}'.format('a'), 'a' + ' ' * 9999) 1074 self.assertEqual('{0:10000}'.format(''), ' ' * 10000) 1075 self.assertEqual('{0:10000000}'.format(''), ' ' * 10000000) 1076 1077 # issue 12546: use \x00 as a fill character 1078 self.assertEqual('{0:\x00<6s}'.format('foo'), 'foo\x00\x00\x00') 1079 self.assertEqual('{0:\x01<6s}'.format('foo'), 'foo\x01\x01\x01') 1080 self.assertEqual('{0:\x00^6s}'.format('foo'), '\x00foo\x00\x00') 1081 self.assertEqual('{0:^6s}'.format('foo'), ' foo ') 1082 1083 
self.assertEqual('{0:\x00<6}'.format(3), '3\x00\x00\x00\x00\x00') 1084 self.assertEqual('{0:\x01<6}'.format(3), '3\x01\x01\x01\x01\x01') 1085 self.assertEqual('{0:\x00^6}'.format(3), '\x00\x003\x00\x00\x00') 1086 self.assertEqual('{0:<6}'.format(3), '3 ') 1087 1088 self.assertEqual('{0:\x00<6}'.format(3.14), '3.14\x00\x00') 1089 self.assertEqual('{0:\x01<6}'.format(3.14), '3.14\x01\x01') 1090 self.assertEqual('{0:\x00^6}'.format(3.14), '\x003.14\x00') 1091 self.assertEqual('{0:^6}'.format(3.14), ' 3.14 ') 1092 1093 self.assertEqual('{0:\x00<12}'.format(3+2.0j), '(3+2j)\x00\x00\x00\x00\x00\x00') 1094 self.assertEqual('{0:\x01<12}'.format(3+2.0j), '(3+2j)\x01\x01\x01\x01\x01\x01') 1095 self.assertEqual('{0:\x00^12}'.format(3+2.0j), '\x00\x00\x00(3+2j)\x00\x00\x00') 1096 self.assertEqual('{0:^12}'.format(3+2.0j), ' (3+2j) ') 1097 1098 # format specifiers for user defined type 1099 self.assertEqual('{0:abc}'.format(C()), 'abc') 1100 1101 # !r, !s and !a coercions 1102 self.assertEqual('{0!s}'.format('Hello'), 'Hello') 1103 self.assertEqual('{0!s:}'.format('Hello'), 'Hello') 1104 self.assertEqual('{0!s:15}'.format('Hello'), 'Hello ') 1105 self.assertEqual('{0!s:15s}'.format('Hello'), 'Hello ') 1106 self.assertEqual('{0!r}'.format('Hello'), "'Hello'") 1107 self.assertEqual('{0!r:}'.format('Hello'), "'Hello'") 1108 self.assertEqual('{0!r}'.format(F('Hello')), 'F(Hello)') 1109 self.assertEqual('{0!r}'.format('\u0378'), "'\\u0378'") # nonprintable 1110 self.assertEqual('{0!r}'.format('\u0374'), "'\u0374'") # printable 1111 self.assertEqual('{0!r}'.format(F('\u0374')), 'F(\u0374)') 1112 self.assertEqual('{0!a}'.format('Hello'), "'Hello'") 1113 self.assertEqual('{0!a}'.format('\u0378'), "'\\u0378'") # nonprintable 1114 self.assertEqual('{0!a}'.format('\u0374'), "'\\u0374'") # printable 1115 self.assertEqual('{0!a:}'.format('Hello'), "'Hello'") 1116 self.assertEqual('{0!a}'.format(F('Hello')), 'F(Hello)') 1117 self.assertEqual('{0!a}'.format(F('\u0374')), 'F(\\u0374)') 1118 
1119 # test fallback to object.__format__ 1120 self.assertEqual('{0}'.format({}), '{}') 1121 self.assertEqual('{0}'.format([]), '[]') 1122 self.assertEqual('{0}'.format([1]), '[1]') 1123 1124 self.assertEqual('{0:d}'.format(G('data')), 'G(data)') 1125 self.assertEqual('{0!s}'.format(G('data')), 'string is data') 1126 1127 self.assertRaises(TypeError, '{0:^10}'.format, E('data')) 1128 self.assertRaises(TypeError, '{0:^10s}'.format, E('data')) 1129 self.assertRaises(TypeError, '{0:>15s}'.format, G('data')) 1130 1131 self.assertEqual("{0:date: %Y-%m-%d}".format(I(year=2007, 1132 month=8, 1133 day=27)), 1134 "date: 2007-08-27") 1135 1136 # test deriving from a builtin type and overriding __format__ 1137 self.assertEqual("{0}".format(J(10)), "20") 1138 1139 1140 # string format specifiers 1141 self.assertEqual('{0:}'.format('a'), 'a') 1142 1143 # computed format specifiers 1144 self.assertEqual("{0:.{1}}".format('hello world', 5), 'hello') 1145 self.assertEqual("{0:.{1}s}".format('hello world', 5), 'hello') 1146 self.assertEqual("{0:.{precision}s}".format('hello world', precision=5), 'hello') 1147 self.assertEqual("{0:{width}.{precision}s}".format('hello world', width=10, precision=5), 'hello ') 1148 self.assertEqual("{0:{width}.{precision}s}".format('hello world', width='10', precision='5'), 'hello ') 1149 1150 # test various errors 1151 self.assertRaises(ValueError, '{'.format) 1152 self.assertRaises(ValueError, '}'.format) 1153 self.assertRaises(ValueError, 'a{'.format) 1154 self.assertRaises(ValueError, 'a}'.format) 1155 self.assertRaises(ValueError, '{a'.format) 1156 self.assertRaises(ValueError, '}a'.format) 1157 self.assertRaises(IndexError, '{0}'.format) 1158 self.assertRaises(IndexError, '{1}'.format, 'abc') 1159 self.assertRaises(KeyError, '{x}'.format) 1160 self.assertRaises(ValueError, "}{".format) 1161 self.assertRaises(ValueError, "abc{0:{}".format) 1162 self.assertRaises(ValueError, "{0".format) 1163 self.assertRaises(IndexError, "{0.}".format) 1164 
self.assertRaises(ValueError, "{0.}".format, 0) 1165 self.assertRaises(ValueError, "{0[}".format) 1166 self.assertRaises(ValueError, "{0[}".format, []) 1167 self.assertRaises(KeyError, "{0]}".format) 1168 self.assertRaises(ValueError, "{0.[]}".format, 0) 1169 self.assertRaises(ValueError, "{0..foo}".format, 0) 1170 self.assertRaises(ValueError, "{0[0}".format, 0) 1171 self.assertRaises(ValueError, "{0[0:foo}".format, 0) 1172 self.assertRaises(KeyError, "{c]}".format) 1173 self.assertRaises(ValueError, "{{ {{{0}}".format, 0) 1174 self.assertRaises(ValueError, "{0}}".format, 0) 1175 self.assertRaises(KeyError, "{foo}".format, bar=3) 1176 self.assertRaises(ValueError, "{0!x}".format, 3) 1177 self.assertRaises(ValueError, "{0!}".format, 0) 1178 self.assertRaises(ValueError, "{0!rs}".format, 0) 1179 self.assertRaises(ValueError, "{!}".format) 1180 self.assertRaises(IndexError, "{:}".format) 1181 self.assertRaises(IndexError, "{:s}".format) 1182 self.assertRaises(IndexError, "{}".format) 1183 big = "23098475029384702983476098230754973209482573" 1184 self.assertRaises(ValueError, ("{" + big + "}").format) 1185 self.assertRaises(ValueError, ("{[" + big + "]}").format, [0]) 1186 1187 # issue 6089 1188 self.assertRaises(ValueError, "{0[0]x}".format, [None]) 1189 self.assertRaises(ValueError, "{0[0](10)}".format, [None]) 1190 1191 # can't have a replacement on the field name portion 1192 self.assertRaises(TypeError, '{0[{1}]}'.format, 'abcdefg', 4) 1193 1194 # exceed maximum recursion depth 1195 self.assertRaises(ValueError, "{0:{1:{2}}}".format, 'abc', 's', '') 1196 self.assertRaises(ValueError, "{0:{1:{2:{3:{4:{5:{6}}}}}}}".format, 1197 0, 1, 2, 3, 4, 5, 6, 7) 1198 1199 # string format spec errors 1200 self.assertRaises(ValueError, "{0:-s}".format, '') 1201 self.assertRaises(ValueError, format, "", "-") 1202 self.assertRaises(ValueError, "{0:=s}".format, '') 1203 1204 # Alternate formatting is not supported 1205 self.assertRaises(ValueError, format, '', '#') 1206 
self.assertRaises(ValueError, format, '', '#20') 1207 1208 # Non-ASCII 1209 self.assertEqual("{0:s}{1:s}".format("ABC", "\u0410\u0411\u0412"), 1210 'ABC\u0410\u0411\u0412') 1211 self.assertEqual("{0:.3s}".format("ABC\u0410\u0411\u0412"), 1212 'ABC') 1213 self.assertEqual("{0:.0s}".format("ABC\u0410\u0411\u0412"), 1214 '') 1215 1216 self.assertEqual("{[{}]}".format({"{}": 5}), "5") 1217 self.assertEqual("{[{}]}".format({"{}" : "a"}), "a") 1218 self.assertEqual("{[{]}".format({"{" : "a"}), "a") 1219 self.assertEqual("{[}]}".format({"}" : "a"}), "a") 1220 self.assertEqual("{[[]}".format({"[" : "a"}), "a") 1221 self.assertEqual("{[!]}".format({"!" : "a"}), "a") 1222 self.assertRaises(ValueError, "{a{}b}".format, 42) 1223 self.assertRaises(ValueError, "{a{b}".format, 42) 1224 self.assertRaises(ValueError, "{[}".format, 42) 1225 1226 self.assertEqual("0x{:0{:d}X}".format(0x0,16), "0x0000000000000000") 1227 1228 # Blocking fallback 1229 m = M('data') 1230 self.assertEqual("{!r}".format(m), 'M(data)') 1231 self.assertRaises(TypeError, "{!s}".format, m) 1232 self.assertRaises(TypeError, "{}".format, m) 1233 n = N('data') 1234 self.assertEqual("{!r}".format(n), 'N(data)') 1235 self.assertEqual("{!s}".format(n), 'N(data)') 1236 self.assertRaises(TypeError, "{}".format, n) 1237 1238 def test_format_map(self): 1239 self.assertEqual(''.format_map({}), '') 1240 self.assertEqual('a'.format_map({}), 'a') 1241 self.assertEqual('ab'.format_map({}), 'ab') 1242 self.assertEqual('a{{'.format_map({}), 'a{') 1243 self.assertEqual('a}}'.format_map({}), 'a}') 1244 self.assertEqual('{{b'.format_map({}), '{b') 1245 self.assertEqual('}}b'.format_map({}), '}b') 1246 self.assertEqual('a{{b'.format_map({}), 'a{b') 1247 1248 # using mappings 1249 class Mapping(dict): 1250 def __missing__(self, key): 1251 return key 1252 self.assertEqual('{hello}'.format_map(Mapping()), 'hello') 1253 self.assertEqual('{a} {world}'.format_map(Mapping(a='hello')), 'hello world') 1254 1255 class InternalMapping: 1256 
def __init__(self): 1257 self.mapping = {'a': 'hello'} 1258 def __getitem__(self, key): 1259 return self.mapping[key] 1260 self.assertEqual('{a}'.format_map(InternalMapping()), 'hello') 1261 1262 1263 class C: 1264 def __init__(self, x=100): 1265 self._x = x 1266 def __format__(self, spec): 1267 return spec 1268 self.assertEqual('{foo._x}'.format_map({'foo': C(20)}), '20') 1269 1270 # test various errors 1271 self.assertRaises(TypeError, ''.format_map) 1272 self.assertRaises(TypeError, 'a'.format_map) 1273 1274 self.assertRaises(ValueError, '{'.format_map, {}) 1275 self.assertRaises(ValueError, '}'.format_map, {}) 1276 self.assertRaises(ValueError, 'a{'.format_map, {}) 1277 self.assertRaises(ValueError, 'a}'.format_map, {}) 1278 self.assertRaises(ValueError, '{a'.format_map, {}) 1279 self.assertRaises(ValueError, '}a'.format_map, {}) 1280 1281 # issue #12579: can't supply positional params to format_map 1282 self.assertRaises(ValueError, '{}'.format_map, {'a' : 2}) 1283 self.assertRaises(ValueError, '{}'.format_map, 'a') 1284 self.assertRaises(ValueError, '{a} {}'.format_map, {"a" : 2, "b" : 1}) 1285 1286 class BadMapping: 1287 def __getitem__(self, key): 1288 return 1/0 1289 self.assertRaises(KeyError, '{a}'.format_map, {}) 1290 self.assertRaises(TypeError, '{a}'.format_map, []) 1291 self.assertRaises(ZeroDivisionError, '{a}'.format_map, BadMapping()) 1292 1293 def test_format_huge_precision(self): 1294 format_string = ".{}f".format(sys.maxsize + 1) 1295 with self.assertRaises(ValueError): 1296 result = format(2.34, format_string) 1297 1298 def test_format_huge_width(self): 1299 format_string = "{}f".format(sys.maxsize + 1) 1300 with self.assertRaises(ValueError): 1301 result = format(2.34, format_string) 1302 1303 def test_format_huge_item_number(self): 1304 format_string = "{{{}:.6f}}".format(sys.maxsize + 1) 1305 with self.assertRaises(ValueError): 1306 result = format_string.format(2.34) 1307 1308 def test_format_auto_numbering(self): 1309 class C: 1310 def 
__init__(self, x=100): 1311 self._x = x 1312 def __format__(self, spec): 1313 return spec 1314 1315 self.assertEqual('{}'.format(10), '10') 1316 self.assertEqual('{:5}'.format('s'), 's ') 1317 self.assertEqual('{!r}'.format('s'), "'s'") 1318 self.assertEqual('{._x}'.format(C(10)), '10') 1319 self.assertEqual('{[1]}'.format([1, 2]), '2') 1320 self.assertEqual('{[a]}'.format({'a':4, 'b':2}), '4') 1321 self.assertEqual('a{}b{}c'.format(0, 1), 'a0b1c') 1322 1323 self.assertEqual('a{:{}}b'.format('x', '^10'), 'a x b') 1324 self.assertEqual('a{:{}x}b'.format(20, '#'), 'a0x14b') 1325 1326 # can't mix and match numbering and auto-numbering 1327 self.assertRaises(ValueError, '{}{1}'.format, 1, 2) 1328 self.assertRaises(ValueError, '{1}{}'.format, 1, 2) 1329 self.assertRaises(ValueError, '{:{1}}'.format, 1, 2) 1330 self.assertRaises(ValueError, '{0:{}}'.format, 1, 2) 1331 1332 # can mix and match auto-numbering and named 1333 self.assertEqual('{f}{}'.format(4, f='test'), 'test4') 1334 self.assertEqual('{}{f}'.format(4, f='test'), '4test') 1335 self.assertEqual('{:{f}}{g}{}'.format(1, 3, g='g', f=2), ' 1g3') 1336 self.assertEqual('{f:{}}{}{g}'.format(2, 4, f=1, g='g'), ' 14g') 1337 1338 def test_formatting(self): 1339 string_tests.MixinStrUnicodeUserStringTest.test_formatting(self) 1340 # Testing Unicode formatting strings... 
1341 self.assertEqual("%s, %s" % ("abc", "abc"), 'abc, abc') 1342 self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, 2, 3), 'abc, abc, 1, 2.000000, 3.00') 1343 self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, -2, 3), 'abc, abc, 1, -2.000000, 3.00') 1344 self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.5), 'abc, abc, -1, -2.000000, 3.50') 1345 self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.57), 'abc, abc, -1, -2.000000, 3.57') 1346 self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 1003.57), 'abc, abc, -1, -2.000000, 1003.57') 1347 if not sys.platform.startswith('java'): 1348 self.assertEqual("%r, %r" % (b"abc", "abc"), "b'abc', 'abc'") 1349 self.assertEqual("%r" % ("\u1234",), "'\u1234'") 1350 self.assertEqual("%a" % ("\u1234",), "'\\u1234'") 1351 self.assertEqual("%(x)s, %(y)s" % {'x':"abc", 'y':"def"}, 'abc, def') 1352 self.assertEqual("%(x)s, %(\xfc)s" % {'x':"abc", '\xfc':"def"}, 'abc, def') 1353 1354 self.assertEqual('%c' % 0x1234, '\u1234') 1355 self.assertEqual('%c' % 0x21483, '\U00021483') 1356 self.assertRaises(OverflowError, "%c".__mod__, (0x110000,)) 1357 self.assertEqual('%c' % '\U00021483', '\U00021483') 1358 self.assertRaises(TypeError, "%c".__mod__, "aa") 1359 self.assertRaises(ValueError, "%.1\u1032f".__mod__, (1.0/3)) 1360 self.assertRaises(TypeError, "%i".__mod__, "aa") 1361 1362 # formatting jobs delegated from the string implementation: 1363 self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...') 1364 self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...') 1365 self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...') 1366 self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...') 1367 self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...') 1368 self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...') 1369 self.assertEqual('...%s...%s...%s...%s...' 
% (1,2,3,"abc"), '...1...2...3...abc...') 1370 self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,"abc"), '...%...%s...1...2...3...abc...') 1371 self.assertEqual('...%s...' % "abc", '...abc...') 1372 self.assertEqual('%*s' % (5,'abc',), ' abc') 1373 self.assertEqual('%*s' % (-5,'abc',), 'abc ') 1374 self.assertEqual('%*.*s' % (5,2,'abc',), ' ab') 1375 self.assertEqual('%*.*s' % (5,3,'abc',), ' abc') 1376 self.assertEqual('%i %*.*s' % (10, 5,3,'abc',), '10 abc') 1377 self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, 'abc',), '103 abc') 1378 self.assertEqual('%c' % 'a', 'a') 1379 class Wrapper: 1380 def __str__(self): 1381 return '\u1234' 1382 self.assertEqual('%s' % Wrapper(), '\u1234') 1383 1384 # issue 3382 1385 NAN = float('nan') 1386 INF = float('inf') 1387 self.assertEqual('%f' % NAN, 'nan') 1388 self.assertEqual('%F' % NAN, 'NAN') 1389 self.assertEqual('%f' % INF, 'inf') 1390 self.assertEqual('%F' % INF, 'INF') 1391 1392 # PEP 393 1393 self.assertEqual('%.1s' % "a\xe9\u20ac", 'a') 1394 self.assertEqual('%.2s' % "a\xe9\u20ac", 'a\xe9') 1395 1396 #issue 19995 1397 class PseudoInt: 1398 def __init__(self, value): 1399 self.value = int(value) 1400 def __int__(self): 1401 return self.value 1402 def __index__(self): 1403 return self.value 1404 class PseudoFloat: 1405 def __init__(self, value): 1406 self.value = float(value) 1407 def __int__(self): 1408 return int(self.value) 1409 pi = PseudoFloat(3.1415) 1410 letter_m = PseudoInt(109) 1411 self.assertEqual('%x' % 42, '2a') 1412 self.assertEqual('%X' % 15, 'F') 1413 self.assertEqual('%o' % 9, '11') 1414 self.assertEqual('%c' % 109, 'm') 1415 self.assertEqual('%x' % letter_m, '6d') 1416 self.assertEqual('%X' % letter_m, '6D') 1417 self.assertEqual('%o' % letter_m, '155') 1418 self.assertEqual('%c' % letter_m, 'm') 1419 self.assertRaisesRegex(TypeError, '%x format: an integer is required, not float', operator.mod, '%x', 3.14), 1420 self.assertRaisesRegex(TypeError, '%X format: an integer is required, not 
float', operator.mod, '%X', 2.11), 1421 self.assertRaisesRegex(TypeError, '%o format: an integer is required, not float', operator.mod, '%o', 1.79), 1422 self.assertRaisesRegex(TypeError, '%x format: an integer is required, not PseudoFloat', operator.mod, '%x', pi), 1423 self.assertRaises(TypeError, operator.mod, '%c', pi), 1424 1425 def test_formatting_with_enum(self): 1426 # issue18780 1427 import enum 1428 class Float(float, enum.Enum): 1429 PI = 3.1415926 1430 class Int(enum.IntEnum): 1431 IDES = 15 1432 class Str(str, enum.Enum): 1433 ABC = 'abc' 1434 # Testing Unicode formatting strings... 1435 self.assertEqual("%s, %s" % (Str.ABC, Str.ABC), 1436 'Str.ABC, Str.ABC') 1437 self.assertEqual("%s, %s, %d, %i, %u, %f, %5.2f" % 1438 (Str.ABC, Str.ABC, 1439 Int.IDES, Int.IDES, Int.IDES, 1440 Float.PI, Float.PI), 1441 'Str.ABC, Str.ABC, 15, 15, 15, 3.141593, 3.14') 1442 1443 # formatting jobs delegated from the string implementation: 1444 self.assertEqual('...%(foo)s...' % {'foo':Str.ABC}, 1445 '...Str.ABC...') 1446 self.assertEqual('...%(foo)s...' % {'foo':Int.IDES}, 1447 '...Int.IDES...') 1448 self.assertEqual('...%(foo)i...' % {'foo':Int.IDES}, 1449 '...15...') 1450 self.assertEqual('...%(foo)d...' % {'foo':Int.IDES}, 1451 '...15...') 1452 self.assertEqual('...%(foo)u...' % {'foo':Int.IDES, 'def':Float.PI}, 1453 '...15...') 1454 self.assertEqual('...%(foo)f...' 
% {'foo':Float.PI,'def':123}, 1455 '...3.141593...') 1456 1457 def test_formatting_huge_precision(self): 1458 format_string = "%.{}f".format(sys.maxsize + 1) 1459 with self.assertRaises(ValueError): 1460 result = format_string % 2.34 1461 1462 def test_issue28598_strsubclass_rhs(self): 1463 # A subclass of str with an __rmod__ method should be able to hook 1464 # into the % operator 1465 class SubclassedStr(str): 1466 def __rmod__(self, other): 1467 return 'Success, self.__rmod__({!r}) was called'.format(other) 1468 self.assertEqual('lhs %% %r' % SubclassedStr('rhs'), 1469 "Success, self.__rmod__('lhs %% %r') was called") 1470 1471 @support.cpython_only 1472 def test_formatting_huge_precision_c_limits(self): 1473 from _testcapi import INT_MAX 1474 format_string = "%.{}f".format(INT_MAX + 1) 1475 with self.assertRaises(ValueError): 1476 result = format_string % 2.34 1477 1478 def test_formatting_huge_width(self): 1479 format_string = "%{}f".format(sys.maxsize + 1) 1480 with self.assertRaises(ValueError): 1481 result = format_string % 2.34 1482 1483 def test_startswith_endswith_errors(self): 1484 for meth in ('foo'.startswith, 'foo'.endswith): 1485 with self.assertRaises(TypeError) as cm: 1486 meth(['f']) 1487 exc = str(cm.exception) 1488 self.assertIn('str', exc) 1489 self.assertIn('tuple', exc) 1490 1491 @support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR') 1492 def test_format_float(self): 1493 # should not format with a comma, but always with C locale 1494 self.assertEqual('1.0', '%.1f' % 1.0) 1495 1496 def test_constructor(self): 1497 # unicode(obj) tests (this maps to PyObject_Unicode() at C level) 1498 1499 self.assertEqual( 1500 str('unicode remains unicode'), 1501 'unicode remains unicode' 1502 ) 1503 1504 for text in ('ascii', '\xe9', '\u20ac', '\U0010FFFF'): 1505 subclass = StrSubclass(text) 1506 self.assertEqual(str(subclass), text) 1507 self.assertEqual(len(subclass), len(text)) 1508 if text == 'ascii': 1509 self.assertEqual(subclass.encode('ascii'), 
b'ascii') 1510 self.assertEqual(subclass.encode('utf-8'), b'ascii') 1511 1512 self.assertEqual( 1513 str('strings are converted to unicode'), 1514 'strings are converted to unicode' 1515 ) 1516 1517 class StringCompat: 1518 def __init__(self, x): 1519 self.x = x 1520 def __str__(self): 1521 return self.x 1522 1523 self.assertEqual( 1524 str(StringCompat('__str__ compatible objects are recognized')), 1525 '__str__ compatible objects are recognized' 1526 ) 1527 1528 # unicode(obj) is compatible to str(): 1529 1530 o = StringCompat('unicode(obj) is compatible to str()') 1531 self.assertEqual(str(o), 'unicode(obj) is compatible to str()') 1532 self.assertEqual(str(o), 'unicode(obj) is compatible to str()') 1533 1534 for obj in (123, 123.45, 123): 1535 self.assertEqual(str(obj), str(str(obj))) 1536 1537 # unicode(obj, encoding, error) tests (this maps to 1538 # PyUnicode_FromEncodedObject() at C level) 1539 1540 if not sys.platform.startswith('java'): 1541 self.assertRaises( 1542 TypeError, 1543 str, 1544 'decoding unicode is not supported', 1545 'utf-8', 1546 'strict' 1547 ) 1548 1549 self.assertEqual( 1550 str(b'strings are decoded to unicode', 'utf-8', 'strict'), 1551 'strings are decoded to unicode' 1552 ) 1553 1554 if not sys.platform.startswith('java'): 1555 self.assertEqual( 1556 str( 1557 memoryview(b'character buffers are decoded to unicode'), 1558 'utf-8', 1559 'strict' 1560 ), 1561 'character buffers are decoded to unicode' 1562 ) 1563 1564 self.assertRaises(TypeError, str, 42, 42, 42) 1565 1566 def test_constructor_keyword_args(self): 1567 """Pass various keyword argument combinations to the constructor.""" 1568 # The object argument can be passed as a keyword. 1569 self.assertEqual(str(object='foo'), 'foo') 1570 self.assertEqual(str(object=b'foo', encoding='utf-8'), 'foo') 1571 # The errors argument without encoding triggers "decode" mode. 
1572 self.assertEqual(str(b'foo', errors='strict'), 'foo') # not "b'foo'" 1573 self.assertEqual(str(object=b'foo', errors='strict'), 'foo') 1574 1575 def test_constructor_defaults(self): 1576 """Check the constructor argument defaults.""" 1577 # The object argument defaults to '' or b''. 1578 self.assertEqual(str(), '') 1579 self.assertEqual(str(errors='strict'), '') 1580 utf8_cent = '¢'.encode('utf-8') 1581 # The encoding argument defaults to utf-8. 1582 self.assertEqual(str(utf8_cent, errors='strict'), '¢') 1583 # The errors argument defaults to strict. 1584 self.assertRaises(UnicodeDecodeError, str, utf8_cent, encoding='ascii') 1585 1586 def test_codecs_utf7(self): 1587 utfTests = [ 1588 ('A\u2262\u0391.', b'A+ImIDkQ.'), # RFC2152 example 1589 ('Hi Mom -\u263a-!', b'Hi Mom -+Jjo--!'), # RFC2152 example 1590 ('\u65E5\u672C\u8A9E', b'+ZeVnLIqe-'), # RFC2152 example 1591 ('Item 3 is \u00a31.', b'Item 3 is +AKM-1.'), # RFC2152 example 1592 ('+', b'+-'), 1593 ('+-', b'+--'), 1594 ('+?', b'+-?'), 1595 (r'\?', b'+AFw?'), 1596 ('+?', b'+-?'), 1597 (r'\\?', b'+AFwAXA?'), 1598 (r'\\\?', b'+AFwAXABc?'), 1599 (r'++--', b'+-+---'), 1600 ('\U000abcde', b'+2m/c3g-'), # surrogate pairs 1601 ('/', b'/'), 1602 ] 1603 1604 for (x, y) in utfTests: 1605 self.assertEqual(x.encode('utf-7'), y) 1606 1607 # Unpaired surrogates are passed through 1608 self.assertEqual('\uD801'.encode('utf-7'), b'+2AE-') 1609 self.assertEqual('\uD801x'.encode('utf-7'), b'+2AE-x') 1610 self.assertEqual('\uDC01'.encode('utf-7'), b'+3AE-') 1611 self.assertEqual('\uDC01x'.encode('utf-7'), b'+3AE-x') 1612 self.assertEqual(b'+2AE-'.decode('utf-7'), '\uD801') 1613 self.assertEqual(b'+2AE-x'.decode('utf-7'), '\uD801x') 1614 self.assertEqual(b'+3AE-'.decode('utf-7'), '\uDC01') 1615 self.assertEqual(b'+3AE-x'.decode('utf-7'), '\uDC01x') 1616 1617 self.assertEqual('\uD801\U000abcde'.encode('utf-7'), b'+2AHab9ze-') 1618 self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde') 1619 1620 # Issue #2242: crash 
on some Windows/MSVC versions 1621 self.assertEqual(b'+\xc1'.decode('utf-7', 'ignore'), '') 1622 1623 # Direct encoded characters 1624 set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?" 1625 # Optional direct characters 1626 set_o = '!"#$%&*;<=>@[]^_`{|}' 1627 for c in set_d: 1628 self.assertEqual(c.encode('utf7'), c.encode('ascii')) 1629 self.assertEqual(c.encode('ascii').decode('utf7'), c) 1630 for c in set_o: 1631 self.assertEqual(c.encode('ascii').decode('utf7'), c) 1632 1633 def test_codecs_utf8(self): 1634 self.assertEqual(''.encode('utf-8'), b'') 1635 self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac') 1636 self.assertEqual('\U00010002'.encode('utf-8'), b'\xf0\x90\x80\x82') 1637 self.assertEqual('\U00023456'.encode('utf-8'), b'\xf0\xa3\x91\x96') 1638 self.assertEqual('\ud800'.encode('utf-8', 'surrogatepass'), b'\xed\xa0\x80') 1639 self.assertEqual('\udc00'.encode('utf-8', 'surrogatepass'), b'\xed\xb0\x80') 1640 self.assertEqual(('\U00010002'*10).encode('utf-8'), 1641 b'\xf0\x90\x80\x82'*10) 1642 self.assertEqual( 1643 '\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f' 1644 '\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00' 1645 '\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c' 1646 '\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067' 1647 '\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das' 1648 ' Nunstuck git und'.encode('utf-8'), 1649 b'\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81' 1650 b'\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3' 1651 b'\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe' 1652 b'\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83' 1653 b'\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8' 1654 b'\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81' 1655 b'\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81' 1656 b'\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3' 1657 
b'\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf' 1658 b'\xe3\x80\x8cWenn ist das Nunstuck git und' 1659 ) 1660 1661 # UTF-8 specific decoding tests 1662 self.assertEqual(str(b'\xf0\xa3\x91\x96', 'utf-8'), '\U00023456' ) 1663 self.assertEqual(str(b'\xf0\x90\x80\x82', 'utf-8'), '\U00010002' ) 1664 self.assertEqual(str(b'\xe2\x82\xac', 'utf-8'), '\u20ac' ) 1665 1666 # Other possible utf-8 test cases: 1667 # * strict decoding testing for all of the 1668 # UTF8_ERROR cases in PyUnicode_DecodeUTF8 1669 1670 def test_utf8_decode_valid_sequences(self): 1671 sequences = [ 1672 # single byte 1673 (b'\x00', '\x00'), (b'a', 'a'), (b'\x7f', '\x7f'), 1674 # 2 bytes 1675 (b'\xc2\x80', '\x80'), (b'\xdf\xbf', '\u07ff'), 1676 # 3 bytes 1677 (b'\xe0\xa0\x80', '\u0800'), (b'\xed\x9f\xbf', '\ud7ff'), 1678 (b'\xee\x80\x80', '\uE000'), (b'\xef\xbf\xbf', '\uffff'), 1679 # 4 bytes 1680 (b'\xF0\x90\x80\x80', '\U00010000'), 1681 (b'\xf4\x8f\xbf\xbf', '\U0010FFFF') 1682 ] 1683 for seq, res in sequences: 1684 self.assertEqual(seq.decode('utf-8'), res) 1685 1686 1687 def test_utf8_decode_invalid_sequences(self): 1688 # continuation bytes in a sequence of 2, 3, or 4 bytes 1689 continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)] 1690 # start bytes of a 2-byte sequence equivalent to code points < 0x7F 1691 invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)] 1692 # start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF 1693 invalid_4B_seq_start_bytes = [bytes([x]) for x in range(0xF5, 0xF8)] 1694 invalid_start_bytes = ( 1695 continuation_bytes + invalid_2B_seq_start_bytes + 1696 invalid_4B_seq_start_bytes + [bytes([x]) for x in range(0xF7, 0x100)] 1697 ) 1698 1699 for byte in invalid_start_bytes: 1700 self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8') 1701 1702 for sb in invalid_2B_seq_start_bytes: 1703 for cb in continuation_bytes: 1704 self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8') 1705 1706 for sb in 
invalid_4B_seq_start_bytes: 1707 for cb1 in continuation_bytes[:3]: 1708 for cb3 in continuation_bytes[:3]: 1709 self.assertRaises(UnicodeDecodeError, 1710 (sb+cb1+b'\x80'+cb3).decode, 'utf-8') 1711 1712 for cb in [bytes([x]) for x in range(0x80, 0xA0)]: 1713 self.assertRaises(UnicodeDecodeError, 1714 (b'\xE0'+cb+b'\x80').decode, 'utf-8') 1715 self.assertRaises(UnicodeDecodeError, 1716 (b'\xE0'+cb+b'\xBF').decode, 'utf-8') 1717 # surrogates 1718 for cb in [bytes([x]) for x in range(0xA0, 0xC0)]: 1719 self.assertRaises(UnicodeDecodeError, 1720 (b'\xED'+cb+b'\x80').decode, 'utf-8') 1721 self.assertRaises(UnicodeDecodeError, 1722 (b'\xED'+cb+b'\xBF').decode, 'utf-8') 1723 for cb in [bytes([x]) for x in range(0x80, 0x90)]: 1724 self.assertRaises(UnicodeDecodeError, 1725 (b'\xF0'+cb+b'\x80\x80').decode, 'utf-8') 1726 self.assertRaises(UnicodeDecodeError, 1727 (b'\xF0'+cb+b'\xBF\xBF').decode, 'utf-8') 1728 for cb in [bytes([x]) for x in range(0x90, 0xC0)]: 1729 self.assertRaises(UnicodeDecodeError, 1730 (b'\xF4'+cb+b'\x80\x80').decode, 'utf-8') 1731 self.assertRaises(UnicodeDecodeError, 1732 (b'\xF4'+cb+b'\xBF\xBF').decode, 'utf-8') 1733 1734 def test_issue8271(self): 1735 # Issue #8271: during the decoding of an invalid UTF-8 byte sequence, 1736 # only the start byte and the continuation byte(s) are now considered 1737 # invalid, instead of the number of bytes specified by the start byte. 1738 # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95, 1739 # table 3-8, Row 2) for more information about the algorithm used. 
1740 FFFD = '\ufffd' 1741 sequences = [ 1742 # invalid start bytes 1743 (b'\x80', FFFD), # continuation byte 1744 (b'\x80\x80', FFFD*2), # 2 continuation bytes 1745 (b'\xc0', FFFD), 1746 (b'\xc0\xc0', FFFD*2), 1747 (b'\xc1', FFFD), 1748 (b'\xc1\xc0', FFFD*2), 1749 (b'\xc0\xc1', FFFD*2), 1750 # with start byte of a 2-byte sequence 1751 (b'\xc2', FFFD), # only the start byte 1752 (b'\xc2\xc2', FFFD*2), # 2 start bytes 1753 (b'\xc2\xc2\xc2', FFFD*3), # 3 start bytes 1754 (b'\xc2\x41', FFFD+'A'), # invalid continuation byte 1755 # with start byte of a 3-byte sequence 1756 (b'\xe1', FFFD), # only the start byte 1757 (b'\xe1\xe1', FFFD*2), # 2 start bytes 1758 (b'\xe1\xe1\xe1', FFFD*3), # 3 start bytes 1759 (b'\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes 1760 (b'\xe1\x80', FFFD), # only 1 continuation byte 1761 (b'\xe1\x41', FFFD+'A'), # invalid continuation byte 1762 (b'\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb 1763 (b'\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes 1764 (b'\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte 1765 (b'\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid 1766 (b'\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid 1767 # with start byte of a 4-byte sequence 1768 (b'\xf1', FFFD), # only the start byte 1769 (b'\xf1\xf1', FFFD*2), # 2 start bytes 1770 (b'\xf1\xf1\xf1', FFFD*3), # 3 start bytes 1771 (b'\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes 1772 (b'\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes 1773 (b'\xf1\x80', FFFD), # only 1 continuation bytes 1774 (b'\xf1\x80\x80', FFFD), # only 2 continuation bytes 1775 (b'\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid 1776 (b'\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 1 invalid 1777 (b'\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid 1778 (b'\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cv and 1 valid 1779 (b'\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 invalid 1780 (b'\xf1\x41\x80\x41', 
FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 invalid 1781 (b'\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 1 valid cb and 1 invalid 1782 (b'\xf1\x41\xf1\x80', FFFD+'A'+FFFD), 1783 (b'\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2), 1784 (b'\xf1\xf1\x80\x41', FFFD*2+'A'), 1785 (b'\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2), 1786 # with invalid start byte of a 4-byte sequence (rfc2279) 1787 (b'\xf5', FFFD), # only the start byte 1788 (b'\xf5\xf5', FFFD*2), # 2 start bytes 1789 (b'\xf5\x80', FFFD*2), # only 1 continuation byte 1790 (b'\xf5\x80\x80', FFFD*3), # only 2 continuation byte 1791 (b'\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes 1792 (b'\xf5\x80\x41', FFFD*2+'A'), # 1 valid cb and 1 invalid 1793 (b'\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD), 1794 (b'\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'), 1795 # with invalid start byte of a 5-byte sequence (rfc2279) 1796 (b'\xf8', FFFD), # only the start byte 1797 (b'\xf8\xf8', FFFD*2), # 2 start bytes 1798 (b'\xf8\x80', FFFD*2), # only one continuation byte 1799 (b'\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid 1800 (b'\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes 1801 # with invalid start byte of a 6-byte sequence (rfc2279) 1802 (b'\xfc', FFFD), # only the start byte 1803 (b'\xfc\xfc', FFFD*2), # 2 start bytes 1804 (b'\xfc\x80\x80', FFFD*3), # only 2 continuation bytes 1805 (b'\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes 1806 # invalid start byte 1807 (b'\xfe', FFFD), 1808 (b'\xfe\x80\x80', FFFD*3), 1809 # other sequences 1810 (b'\xf1\x80\x41\x42\x43', '\ufffd\x41\x42\x43'), 1811 (b'\xf1\x80\xff\x42\x43', '\ufffd\ufffd\x42\x43'), 1812 (b'\xf1\x80\xc2\x81\x43', '\ufffd\x81\x43'), 1813 (b'\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64', 1814 '\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'), 1815 ] 1816 for n, (seq, res) in enumerate(sequences): 1817 self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict') 1818 self.assertEqual(seq.decode('utf-8', 'replace'), res) 1819 
def assertCorrectUTF8Decoding(self, seq, res, err):
    """
    Assert how the invalid UTF-8 byte string seq decodes under each of the
    three error handlers.

    With 'strict' a UnicodeDecodeError must be raised whose message
    contains err.  With 'replace' the result must equal res.  With
    'ignore' the result must equal res with every U+FFFD removed.  The
    'replace' and 'ignore' checks are repeated with seq embedded between
    b'aaaa' and b'bbbb'.
    """
    with self.assertRaises(UnicodeDecodeError) as caught:
        seq.decode('utf-8')
    self.assertIn(err, str(caught.exception))

    embedded = b'aaaa' + seq + b'bbbb'
    expectations = (
        ('replace', res),
        ('ignore', res.replace('\ufffd', '')),
    )
    for handler, expected in expectations:
        self.assertEqual(seq.decode('utf-8', handler), expected)
        self.assertEqual(embedded.decode('utf-8', handler),
                         'aaaa' + expected + 'bbbb')
in the sequence <F3 80 80>, F3 is the start byte of a 4-bytes 1862 sequence, but it's followed by only 2 valid continuation bytes and the 1863 last continuation bytes is missing. 1864 Note: the continuation bytes must be all valid, if one of them is 1865 invalid another error will be raised. 1866 """ 1867 sequences = [ 1868 'C2', 'DF', 1869 'E0 A0', 'E0 BF', 'E1 80', 'E1 BF', 'EC 80', 'EC BF', 1870 'ED 80', 'ED 9F', 'EE 80', 'EE BF', 'EF 80', 'EF BF', 1871 'F0 90', 'F0 BF', 'F0 90 80', 'F0 90 BF', 'F0 BF 80', 'F0 BF BF', 1872 'F1 80', 'F1 BF', 'F1 80 80', 'F1 80 BF', 'F1 BF 80', 'F1 BF BF', 1873 'F3 80', 'F3 BF', 'F3 80 80', 'F3 80 BF', 'F3 BF 80', 'F3 BF BF', 1874 'F4 80', 'F4 8F', 'F4 80 80', 'F4 80 BF', 'F4 8F 80', 'F4 8F BF' 1875 ] 1876 FFFD = '\ufffd' 1877 for seq in sequences: 1878 self.assertCorrectUTF8Decoding(bytes.fromhex(seq), '\ufffd', 1879 'unexpected end of data') 1880 1881 def test_invalid_cb_for_2bytes_seq(self): 1882 """ 1883 Test that an 'invalid continuation byte' error is raised when the 1884 continuation byte of a 2-bytes sequence is invalid. The start byte 1885 is replaced by a single U+FFFD and the second byte is handled 1886 separately when errors='replace'. 1887 E.g. in the sequence <C2 41>, C2 is the start byte of a 2-bytes 1888 sequence, but 41 is not a valid continuation byte because it's the 1889 ASCII letter 'A'. 1890 """ 1891 FFFD = '\ufffd' 1892 FFFDx2 = FFFD * 2 1893 sequences = [ 1894 ('C2 00', FFFD+'\x00'), ('C2 7F', FFFD+'\x7f'), 1895 ('C2 C0', FFFDx2), ('C2 FF', FFFDx2), 1896 ('DF 00', FFFD+'\x00'), ('DF 7F', FFFD+'\x7f'), 1897 ('DF C0', FFFDx2), ('DF FF', FFFDx2), 1898 ] 1899 for seq, res in sequences: 1900 self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res, 1901 'invalid continuation byte') 1902 1903 def test_invalid_cb_for_3bytes_seq(self): 1904 """ 1905 Test that an 'invalid continuation byte' error is raised when the 1906 continuation byte(s) of a 3-bytes sequence are invalid. 
When 1907 errors='replace', if the first continuation byte is valid, the first 1908 two bytes (start byte + 1st cb) are replaced by a single U+FFFD and the 1909 third byte is handled separately, otherwise only the start byte is 1910 replaced with a U+FFFD and the other continuation bytes are handled 1911 separately. 1912 E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes 1913 sequence, 80 is a valid continuation byte, but 41 is not a valid cb 1914 because it's the ASCII letter 'A'. 1915 Note: when the start byte is E0 or ED, the valid ranges for the first 1916 continuation byte are limited to A0..BF and 80..9F respectively. 1917 Python 2 used to consider all the bytes in range 80..BF valid when the 1918 start byte was ED. This is fixed in Python 3. 1919 """ 1920 FFFD = '\ufffd' 1921 FFFDx2 = FFFD * 2 1922 sequences = [ 1923 ('E0 00', FFFD+'\x00'), ('E0 7F', FFFD+'\x7f'), ('E0 80', FFFDx2), 1924 ('E0 9F', FFFDx2), ('E0 C0', FFFDx2), ('E0 FF', FFFDx2), 1925 ('E0 A0 00', FFFD+'\x00'), ('E0 A0 7F', FFFD+'\x7f'), 1926 ('E0 A0 C0', FFFDx2), ('E0 A0 FF', FFFDx2), 1927 ('E0 BF 00', FFFD+'\x00'), ('E0 BF 7F', FFFD+'\x7f'), 1928 ('E0 BF C0', FFFDx2), ('E0 BF FF', FFFDx2), ('E1 00', FFFD+'\x00'), 1929 ('E1 7F', FFFD+'\x7f'), ('E1 C0', FFFDx2), ('E1 FF', FFFDx2), 1930 ('E1 80 00', FFFD+'\x00'), ('E1 80 7F', FFFD+'\x7f'), 1931 ('E1 80 C0', FFFDx2), ('E1 80 FF', FFFDx2), 1932 ('E1 BF 00', FFFD+'\x00'), ('E1 BF 7F', FFFD+'\x7f'), 1933 ('E1 BF C0', FFFDx2), ('E1 BF FF', FFFDx2), ('EC 00', FFFD+'\x00'), 1934 ('EC 7F', FFFD+'\x7f'), ('EC C0', FFFDx2), ('EC FF', FFFDx2), 1935 ('EC 80 00', FFFD+'\x00'), ('EC 80 7F', FFFD+'\x7f'), 1936 ('EC 80 C0', FFFDx2), ('EC 80 FF', FFFDx2), 1937 ('EC BF 00', FFFD+'\x00'), ('EC BF 7F', FFFD+'\x7f'), 1938 ('EC BF C0', FFFDx2), ('EC BF FF', FFFDx2), ('ED 00', FFFD+'\x00'), 1939 ('ED 7F', FFFD+'\x7f'), 1940 ('ED A0', FFFDx2), ('ED BF', FFFDx2), # see note ^ 1941 ('ED C0', FFFDx2), ('ED FF', FFFDx2), ('ED 80 00', FFFD+'\x00'), 1942 ('ED 
80 7F', FFFD+'\x7f'), ('ED 80 C0', FFFDx2), 1943 ('ED 80 FF', FFFDx2), ('ED 9F 00', FFFD+'\x00'), 1944 ('ED 9F 7F', FFFD+'\x7f'), ('ED 9F C0', FFFDx2), 1945 ('ED 9F FF', FFFDx2), ('EE 00', FFFD+'\x00'), 1946 ('EE 7F', FFFD+'\x7f'), ('EE C0', FFFDx2), ('EE FF', FFFDx2), 1947 ('EE 80 00', FFFD+'\x00'), ('EE 80 7F', FFFD+'\x7f'), 1948 ('EE 80 C0', FFFDx2), ('EE 80 FF', FFFDx2), 1949 ('EE BF 00', FFFD+'\x00'), ('EE BF 7F', FFFD+'\x7f'), 1950 ('EE BF C0', FFFDx2), ('EE BF FF', FFFDx2), ('EF 00', FFFD+'\x00'), 1951 ('EF 7F', FFFD+'\x7f'), ('EF C0', FFFDx2), ('EF FF', FFFDx2), 1952 ('EF 80 00', FFFD+'\x00'), ('EF 80 7F', FFFD+'\x7f'), 1953 ('EF 80 C0', FFFDx2), ('EF 80 FF', FFFDx2), 1954 ('EF BF 00', FFFD+'\x00'), ('EF BF 7F', FFFD+'\x7f'), 1955 ('EF BF C0', FFFDx2), ('EF BF FF', FFFDx2), 1956 ] 1957 for seq, res in sequences: 1958 self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res, 1959 'invalid continuation byte') 1960 1961 def test_invalid_cb_for_4bytes_seq(self): 1962 """ 1963 Test that an 'invalid continuation byte' error is raised when the 1964 continuation byte(s) of a 4-bytes sequence are invalid. When 1965 errors='replace',the start byte and all the following valid 1966 continuation bytes are replaced with a single U+FFFD, and all the bytes 1967 starting from the first invalid continuation bytes (included) are 1968 handled separately. 1969 E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes 1970 sequence, 80 is a valid continuation byte, but 41 is not a valid cb 1971 because it's the ASCII letter 'A'. 1972 Note: when the start byte is E0 or ED, the valid ranges for the first 1973 continuation byte are limited to A0..BF and 80..9F respectively. 1974 However, when the start byte is ED, Python 2 considers all the bytes 1975 in range 80..BF valid. This is fixed in Python 3. 
1976 """ 1977 FFFD = '\ufffd' 1978 FFFDx2 = FFFD * 2 1979 sequences = [ 1980 ('F0 00', FFFD+'\x00'), ('F0 7F', FFFD+'\x7f'), ('F0 80', FFFDx2), 1981 ('F0 8F', FFFDx2), ('F0 C0', FFFDx2), ('F0 FF', FFFDx2), 1982 ('F0 90 00', FFFD+'\x00'), ('F0 90 7F', FFFD+'\x7f'), 1983 ('F0 90 C0', FFFDx2), ('F0 90 FF', FFFDx2), 1984 ('F0 BF 00', FFFD+'\x00'), ('F0 BF 7F', FFFD+'\x7f'), 1985 ('F0 BF C0', FFFDx2), ('F0 BF FF', FFFDx2), 1986 ('F0 90 80 00', FFFD+'\x00'), ('F0 90 80 7F', FFFD+'\x7f'), 1987 ('F0 90 80 C0', FFFDx2), ('F0 90 80 FF', FFFDx2), 1988 ('F0 90 BF 00', FFFD+'\x00'), ('F0 90 BF 7F', FFFD+'\x7f'), 1989 ('F0 90 BF C0', FFFDx2), ('F0 90 BF FF', FFFDx2), 1990 ('F0 BF 80 00', FFFD+'\x00'), ('F0 BF 80 7F', FFFD+'\x7f'), 1991 ('F0 BF 80 C0', FFFDx2), ('F0 BF 80 FF', FFFDx2), 1992 ('F0 BF BF 00', FFFD+'\x00'), ('F0 BF BF 7F', FFFD+'\x7f'), 1993 ('F0 BF BF C0', FFFDx2), ('F0 BF BF FF', FFFDx2), 1994 ('F1 00', FFFD+'\x00'), ('F1 7F', FFFD+'\x7f'), ('F1 C0', FFFDx2), 1995 ('F1 FF', FFFDx2), ('F1 80 00', FFFD+'\x00'), 1996 ('F1 80 7F', FFFD+'\x7f'), ('F1 80 C0', FFFDx2), 1997 ('F1 80 FF', FFFDx2), ('F1 BF 00', FFFD+'\x00'), 1998 ('F1 BF 7F', FFFD+'\x7f'), ('F1 BF C0', FFFDx2), 1999 ('F1 BF FF', FFFDx2), ('F1 80 80 00', FFFD+'\x00'), 2000 ('F1 80 80 7F', FFFD+'\x7f'), ('F1 80 80 C0', FFFDx2), 2001 ('F1 80 80 FF', FFFDx2), ('F1 80 BF 00', FFFD+'\x00'), 2002 ('F1 80 BF 7F', FFFD+'\x7f'), ('F1 80 BF C0', FFFDx2), 2003 ('F1 80 BF FF', FFFDx2), ('F1 BF 80 00', FFFD+'\x00'), 2004 ('F1 BF 80 7F', FFFD+'\x7f'), ('F1 BF 80 C0', FFFDx2), 2005 ('F1 BF 80 FF', FFFDx2), ('F1 BF BF 00', FFFD+'\x00'), 2006 ('F1 BF BF 7F', FFFD+'\x7f'), ('F1 BF BF C0', FFFDx2), 2007 ('F1 BF BF FF', FFFDx2), ('F3 00', FFFD+'\x00'), 2008 ('F3 7F', FFFD+'\x7f'), ('F3 C0', FFFDx2), ('F3 FF', FFFDx2), 2009 ('F3 80 00', FFFD+'\x00'), ('F3 80 7F', FFFD+'\x7f'), 2010 ('F3 80 C0', FFFDx2), ('F3 80 FF', FFFDx2), 2011 ('F3 BF 00', FFFD+'\x00'), ('F3 BF 7F', FFFD+'\x7f'), 2012 ('F3 BF C0', FFFDx2), ('F3 BF FF', 
FFFDx2), 2013 ('F3 80 80 00', FFFD+'\x00'), ('F3 80 80 7F', FFFD+'\x7f'), 2014 ('F3 80 80 C0', FFFDx2), ('F3 80 80 FF', FFFDx2), 2015 ('F3 80 BF 00', FFFD+'\x00'), ('F3 80 BF 7F', FFFD+'\x7f'), 2016 ('F3 80 BF C0', FFFDx2), ('F3 80 BF FF', FFFDx2), 2017 ('F3 BF 80 00', FFFD+'\x00'), ('F3 BF 80 7F', FFFD+'\x7f'), 2018 ('F3 BF 80 C0', FFFDx2), ('F3 BF 80 FF', FFFDx2), 2019 ('F3 BF BF 00', FFFD+'\x00'), ('F3 BF BF 7F', FFFD+'\x7f'), 2020 ('F3 BF BF C0', FFFDx2), ('F3 BF BF FF', FFFDx2), 2021 ('F4 00', FFFD+'\x00'), ('F4 7F', FFFD+'\x7f'), ('F4 90', FFFDx2), 2022 ('F4 BF', FFFDx2), ('F4 C0', FFFDx2), ('F4 FF', FFFDx2), 2023 ('F4 80 00', FFFD+'\x00'), ('F4 80 7F', FFFD+'\x7f'), 2024 ('F4 80 C0', FFFDx2), ('F4 80 FF', FFFDx2), 2025 ('F4 8F 00', FFFD+'\x00'), ('F4 8F 7F', FFFD+'\x7f'), 2026 ('F4 8F C0', FFFDx2), ('F4 8F FF', FFFDx2), 2027 ('F4 80 80 00', FFFD+'\x00'), ('F4 80 80 7F', FFFD+'\x7f'), 2028 ('F4 80 80 C0', FFFDx2), ('F4 80 80 FF', FFFDx2), 2029 ('F4 80 BF 00', FFFD+'\x00'), ('F4 80 BF 7F', FFFD+'\x7f'), 2030 ('F4 80 BF C0', FFFDx2), ('F4 80 BF FF', FFFDx2), 2031 ('F4 8F 80 00', FFFD+'\x00'), ('F4 8F 80 7F', FFFD+'\x7f'), 2032 ('F4 8F 80 C0', FFFDx2), ('F4 8F 80 FF', FFFDx2), 2033 ('F4 8F BF 00', FFFD+'\x00'), ('F4 8F BF 7F', FFFD+'\x7f'), 2034 ('F4 8F BF C0', FFFDx2), ('F4 8F BF FF', FFFDx2) 2035 ] 2036 for seq, res in sequences: 2037 self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res, 2038 'invalid continuation byte') 2039 2040 def test_codecs_idna(self): 2041 # Test whether trailing dot is preserved 2042 self.assertEqual("www.python.org.".encode("idna"), b"www.python.org.") 2043 2044 def test_codecs_errors(self): 2045 # Error handling (encoding) 2046 self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii') 2047 self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii','strict') 2048 self.assertEqual('Andr\202 x'.encode('ascii','ignore'), b"Andr x") 2049 self.assertEqual('Andr\202 x'.encode('ascii','replace'), b"Andr? 
x") 2050 self.assertEqual('Andr\202 x'.encode('ascii', 'replace'), 2051 'Andr\202 x'.encode('ascii', errors='replace')) 2052 self.assertEqual('Andr\202 x'.encode('ascii', 'ignore'), 2053 'Andr\202 x'.encode(encoding='ascii', errors='ignore')) 2054 2055 # Error handling (decoding) 2056 self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii') 2057 self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii', 'strict') 2058 self.assertEqual(str(b'Andr\202 x', 'ascii', 'ignore'), "Andr x") 2059 self.assertEqual(str(b'Andr\202 x', 'ascii', 'replace'), 'Andr\uFFFD x') 2060 self.assertEqual(str(b'\202 x', 'ascii', 'replace'), '\uFFFD x') 2061 2062 # Error handling (unknown character names) 2063 self.assertEqual(b"\\N{foo}xx".decode("unicode-escape", "ignore"), "xx") 2064 2065 # Error handling (truncated escape sequence) 2066 self.assertRaises(UnicodeError, b"\\".decode, "unicode-escape") 2067 2068 self.assertRaises(TypeError, b"hello".decode, "test.unicode1") 2069 self.assertRaises(TypeError, str, b"hello", "test.unicode2") 2070 self.assertRaises(TypeError, "hello".encode, "test.unicode1") 2071 self.assertRaises(TypeError, "hello".encode, "test.unicode2") 2072 2073 # Error handling (wrong arguments) 2074 self.assertRaises(TypeError, "hello".encode, 42, 42, 42) 2075 2076 # Error handling (lone surrogate in 2077 # _PyUnicode_TransformDecimalAndSpaceToASCII()) 2078 self.assertRaises(ValueError, int, "\ud800") 2079 self.assertRaises(ValueError, int, "\udf00") 2080 self.assertRaises(ValueError, float, "\ud800") 2081 self.assertRaises(ValueError, float, "\udf00") 2082 self.assertRaises(ValueError, complex, "\ud800") 2083 self.assertRaises(ValueError, complex, "\udf00") 2084 2085 def test_codecs(self): 2086 # Encoding 2087 self.assertEqual('hello'.encode('ascii'), b'hello') 2088 self.assertEqual('hello'.encode('utf-7'), b'hello') 2089 self.assertEqual('hello'.encode('utf-8'), b'hello') 2090 self.assertEqual('hello'.encode('utf-8'), b'hello') 2091 
self.assertEqual('hello'.encode('utf-16-le'), b'h\000e\000l\000l\000o\000') 2092 self.assertEqual('hello'.encode('utf-16-be'), b'\000h\000e\000l\000l\000o') 2093 self.assertEqual('hello'.encode('latin-1'), b'hello') 2094 2095 # Default encoding is utf-8 2096 self.assertEqual('\u2603'.encode(), b'\xe2\x98\x83') 2097 2098 # Roundtrip safety for BMP (just the first 1024 chars) 2099 for c in range(1024): 2100 u = chr(c) 2101 for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le', 2102 'utf-16-be', 'raw_unicode_escape', 2103 'unicode_escape', 'unicode_internal'): 2104 with warnings.catch_warnings(): 2105 # unicode-internal has been deprecated 2106 warnings.simplefilter("ignore", DeprecationWarning) 2107 2108 self.assertEqual(str(u.encode(encoding),encoding), u) 2109 2110 # Roundtrip safety for BMP (just the first 256 chars) 2111 for c in range(256): 2112 u = chr(c) 2113 for encoding in ('latin-1',): 2114 self.assertEqual(str(u.encode(encoding),encoding), u) 2115 2116 # Roundtrip safety for BMP (just the first 128 chars) 2117 for c in range(128): 2118 u = chr(c) 2119 for encoding in ('ascii',): 2120 self.assertEqual(str(u.encode(encoding),encoding), u) 2121 2122 # Roundtrip safety for non-BMP (just a few chars) 2123 with warnings.catch_warnings(): 2124 # unicode-internal has been deprecated 2125 warnings.simplefilter("ignore", DeprecationWarning) 2126 2127 u = '\U00010001\U00020002\U00030003\U00040004\U00050005' 2128 for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be', 2129 'raw_unicode_escape', 2130 'unicode_escape', 'unicode_internal'): 2131 self.assertEqual(str(u.encode(encoding),encoding), u) 2132 2133 # UTF-8 must be roundtrip safe for all code points 2134 # (except surrogates, which are forbidden). 
2135 u = ''.join(map(chr, list(range(0, 0xd800)) + 2136 list(range(0xe000, 0x110000)))) 2137 for encoding in ('utf-8',): 2138 self.assertEqual(str(u.encode(encoding),encoding), u) 2139 2140 def test_codecs_charmap(self): 2141 # 0-127 2142 s = bytes(range(128)) 2143 for encoding in ( 2144 'cp037', 'cp1026', 'cp273', 2145 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850', 2146 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862', 2147 'cp863', 'cp865', 'cp866', 'cp1125', 2148 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15', 2149 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6', 2150 'iso8859_7', 'iso8859_9', 2151 'koi8_r', 'koi8_t', 'koi8_u', 'kz1048', 'latin_1', 2152 'mac_cyrillic', 'mac_latin2', 2153 2154 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255', 2155 'cp1256', 'cp1257', 'cp1258', 2156 'cp856', 'cp857', 'cp864', 'cp869', 'cp874', 2157 2158 'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish', 2159 'cp1006', 'iso8859_8', 2160 2161 ### These have undefined mappings: 2162 #'cp424', 2163 2164 ### These fail the round-trip: 2165 #'cp875' 2166 2167 ): 2168 self.assertEqual(str(s, encoding).encode(encoding), s) 2169 2170 # 128-255 2171 s = bytes(range(128, 256)) 2172 for encoding in ( 2173 'cp037', 'cp1026', 'cp273', 2174 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850', 2175 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862', 2176 'cp863', 'cp865', 'cp866', 'cp1125', 2177 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15', 2178 'iso8859_2', 'iso8859_4', 'iso8859_5', 2179 'iso8859_9', 'koi8_r', 'koi8_u', 'latin_1', 2180 'mac_cyrillic', 'mac_latin2', 2181 2182 ### These have undefined mappings: 2183 #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255', 2184 #'cp1256', 'cp1257', 'cp1258', 2185 #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874', 2186 #'iso8859_3', 'iso8859_6', 'iso8859_7', 'koi8_t', 'kz1048', 2187 #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish', 2188 2189 ### These fail the round-trip: 
2190 #'cp1006', 'cp875', 'iso8859_8', 2191 2192 ): 2193 self.assertEqual(str(s, encoding).encode(encoding), s) 2194 2195 def test_concatenation(self): 2196 self.assertEqual(("abc" "def"), "abcdef") 2197 self.assertEqual(("abc" "def"), "abcdef") 2198 self.assertEqual(("abc" "def"), "abcdef") 2199 self.assertEqual(("abc" "def" "ghi"), "abcdefghi") 2200 self.assertEqual(("abc" "def" "ghi"), "abcdefghi") 2201 2202 def test_printing(self): 2203 class BitBucket: 2204 def write(self, text): 2205 pass 2206 2207 out = BitBucket() 2208 print('abc', file=out) 2209 print('abc', 'def', file=out) 2210 print('abc', 'def', file=out) 2211 print('abc', 'def', file=out) 2212 print('abc\n', file=out) 2213 print('abc\n', end=' ', file=out) 2214 print('abc\n', end=' ', file=out) 2215 print('def\n', file=out) 2216 print('def\n', file=out) 2217 2218 def test_ucs4(self): 2219 x = '\U00100000' 2220 y = x.encode("raw-unicode-escape").decode("raw-unicode-escape") 2221 self.assertEqual(x, y) 2222 2223 y = br'\U00100000' 2224 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape") 2225 self.assertEqual(x, y) 2226 y = br'\U00010000' 2227 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape") 2228 self.assertEqual(x, y) 2229 2230 try: 2231 br'\U11111111'.decode("raw-unicode-escape") 2232 except UnicodeDecodeError as e: 2233 self.assertEqual(e.start, 0) 2234 self.assertEqual(e.end, 10) 2235 else: 2236 self.fail("Should have raised UnicodeDecodeError") 2237 2238 def test_conversion(self): 2239 # Make sure __str__() works properly 2240 class ObjectToStr: 2241 def __str__(self): 2242 return "foo" 2243 2244 class StrSubclassToStr(str): 2245 def __str__(self): 2246 return "foo" 2247 2248 class StrSubclassToStrSubclass(str): 2249 def __new__(cls, content=""): 2250 return str.__new__(cls, 2*content) 2251 def __str__(self): 2252 return self 2253 2254 self.assertEqual(str(ObjectToStr()), "foo") 2255 self.assertEqual(str(StrSubclassToStr("bar")), "foo") 2256 s = 
str(StrSubclassToStrSubclass("foo")) 2257 self.assertEqual(s, "foofoo") 2258 self.assertIs(type(s), StrSubclassToStrSubclass) 2259 s = StrSubclass(StrSubclassToStrSubclass("foo")) 2260 self.assertEqual(s, "foofoo") 2261 self.assertIs(type(s), StrSubclass) 2262 2263 def test_unicode_repr(self): 2264 class s1: 2265 def __repr__(self): 2266 return '\\n' 2267 2268 class s2: 2269 def __repr__(self): 2270 return '\\n' 2271 2272 self.assertEqual(repr(s1()), '\\n') 2273 self.assertEqual(repr(s2()), '\\n') 2274 2275 def test_printable_repr(self): 2276 self.assertEqual(repr('\U00010000'), "'%c'" % (0x10000,)) # printable 2277 self.assertEqual(repr('\U00014000'), "'\\U00014000'") # nonprintable 2278 2279 # This test only affects 32-bit platforms because expandtabs can only take 2280 # an int as the max value, not a 64-bit C long. If expandtabs is changed 2281 # to take a 64-bit long, this test should apply to all platforms. 2282 @unittest.skipIf(sys.maxsize > (1 << 32) or struct.calcsize('P') != 4, 2283 'only applies to 32-bit platforms') 2284 def test_expandtabs_overflows_gracefully(self): 2285 self.assertRaises(OverflowError, 't\tt\t'.expandtabs, sys.maxsize) 2286 2287 @support.cpython_only 2288 def test_expandtabs_optimization(self): 2289 s = 'abc' 2290 self.assertIs(s.expandtabs(), s) 2291 2292 def test_raiseMemError(self): 2293 if struct.calcsize('P') == 8: 2294 # 64 bits pointers 2295 ascii_struct_size = 48 2296 compact_struct_size = 72 2297 else: 2298 # 32 bits pointers 2299 ascii_struct_size = 24 2300 compact_struct_size = 36 2301 2302 for char in ('a', '\xe9', '\u20ac', '\U0010ffff'): 2303 code = ord(char) 2304 if code < 0x100: 2305 char_size = 1 # sizeof(Py_UCS1) 2306 struct_size = ascii_struct_size 2307 elif code < 0x10000: 2308 char_size = 2 # sizeof(Py_UCS2) 2309 struct_size = compact_struct_size 2310 else: 2311 char_size = 4 # sizeof(Py_UCS4) 2312 struct_size = compact_struct_size 2313 # Note: sys.maxsize is half of the actual max allocation because of 2314 # 
def test_compare(self):
    """Check equality and ordering between ASCII, Latin-1, BMP and astral text.

    Regression test for issue #17615.  Two strings are built for each
    character range (a "low" and a "high" one) so that comparisons are
    exercised both inside a range and across ranges.
    """
    N = 10
    ascii_lo = 'a' * N
    ascii_hi = 'z' * N
    latin_lo = '\x80' * N
    latin_hi = '\xff' * N
    bmp_lo = '\u0100' * N
    bmp_hi = '\uffff' * N
    astral_lo = '\U00100000' * N
    astral_hi = '\U0010ffff' * N
    strings = (
        ascii_lo, ascii_hi,
        latin_lo, latin_hi,
        bmp_lo, bmp_hi,
        astral_lo, astral_hi)

    # combinations_with_replacement() also pairs every string with itself.
    # Plain combinations() never does, which would leave the "equal" branch
    # below unreachable.
    for text1, text2 in itertools.combinations_with_replacement(strings, 2):
        equal = (text1 is text2)
        self.assertEqual(text1 == text2, equal)
        self.assertEqual(text1 != text2, not equal)

        if equal:
            self.assertTrue(text1 <= text2)
            self.assertTrue(text1 >= text2)

            # text1 is text2: duplicate strings to skip the "str1 == str2"
            # optimization in unicode_compare_eq() and really compare
            # character per character
            copy1 = duplicate_string(text1)
            copy2 = duplicate_string(text2)
            self.assertIsNot(copy1, copy2)

            self.assertTrue(copy1 == copy2)
            self.assertFalse(copy1 != copy2)

            self.assertTrue(copy1 <= copy2)
            # Compare the two distinct copies, not copy2 with itself.
            self.assertTrue(copy1 >= copy2)

    self.assertTrue(ascii_lo < ascii_hi)
    self.assertTrue(ascii_lo < latin_lo)
    self.assertTrue(ascii_lo < bmp_lo)
    self.assertTrue(ascii_lo < astral_lo)
    self.assertFalse(ascii_lo >= ascii_hi)
    self.assertFalse(ascii_lo >= latin_lo)
    self.assertFalse(ascii_lo >= bmp_lo)
    self.assertFalse(ascii_lo >= astral_lo)

    self.assertFalse(latin_lo < ascii_lo)
    self.assertTrue(latin_lo < latin_hi)
    self.assertTrue(latin_lo < bmp_lo)
    self.assertTrue(latin_lo < astral_lo)
    self.assertTrue(latin_lo >= ascii_lo)
    self.assertFalse(latin_lo >= latin_hi)
    self.assertFalse(latin_lo >= bmp_lo)
    self.assertFalse(latin_lo >= astral_lo)

    self.assertFalse(bmp_lo < ascii_lo)
    self.assertFalse(bmp_lo < latin_lo)
    self.assertTrue(bmp_lo < bmp_hi)
    self.assertTrue(bmp_lo < astral_lo)
    self.assertTrue(bmp_lo >= ascii_lo)
    self.assertTrue(bmp_lo >= latin_lo)
    self.assertFalse(bmp_lo >= bmp_hi)
    self.assertFalse(bmp_lo >= astral_lo)

    self.assertFalse(astral_lo < ascii_lo)
    self.assertFalse(astral_lo < latin_lo)
    self.assertFalse(astral_lo < bmp_hi)
    self.assertTrue(astral_lo < astral_hi)
    self.assertTrue(astral_lo >= ascii_lo)
    self.assertTrue(astral_lo >= latin_lo)
    self.assertTrue(astral_lo >= bmp_hi)
    self.assertFalse(astral_lo >= astral_hi)
support.check_free_after_iterating(self, iter, str) 2441 support.check_free_after_iterating(self, reversed, str) 2442 2443 2444class CAPITest(unittest.TestCase): 2445 2446 # Test PyUnicode_FromFormat() 2447 def test_from_format(self): 2448 support.import_module('ctypes') 2449 from ctypes import ( 2450 pythonapi, py_object, sizeof, 2451 c_int, c_long, c_longlong, c_ssize_t, 2452 c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p) 2453 name = "PyUnicode_FromFormat" 2454 _PyUnicode_FromFormat = getattr(pythonapi, name) 2455 _PyUnicode_FromFormat.restype = py_object 2456 2457 def PyUnicode_FromFormat(format, *args): 2458 cargs = tuple( 2459 py_object(arg) if isinstance(arg, str) else arg 2460 for arg in args) 2461 return _PyUnicode_FromFormat(format, *cargs) 2462 2463 def check_format(expected, format, *args): 2464 text = PyUnicode_FromFormat(format, *args) 2465 self.assertEqual(expected, text) 2466 2467 # ascii format, non-ascii argument 2468 check_format('ascii\x7f=unicode\xe9', 2469 b'ascii\x7f=%U', 'unicode\xe9') 2470 2471 # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV() 2472 # raises an error 2473 self.assertRaisesRegex(ValueError, 2474 r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format ' 2475 'string, got a non-ASCII byte: 0xe9$', 2476 PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii') 2477 2478 # test "%c" 2479 check_format('\uabcd', 2480 b'%c', c_int(0xabcd)) 2481 check_format('\U0010ffff', 2482 b'%c', c_int(0x10ffff)) 2483 with self.assertRaises(OverflowError): 2484 PyUnicode_FromFormat(b'%c', c_int(0x110000)) 2485 # Issue #18183 2486 check_format('\U00010000\U00100000', 2487 b'%c%c', c_int(0x10000), c_int(0x100000)) 2488 2489 # test "%" 2490 check_format('%', 2491 b'%') 2492 check_format('%', 2493 b'%%') 2494 check_format('%s', 2495 b'%%s') 2496 check_format('[%]', 2497 b'[%%]') 2498 check_format('%abc', 2499 b'%%%s', b'abc') 2500 2501 # truncated string 2502 check_format('abc', 2503 b'%.3s', b'abcdef') 2504 
check_format('abc[\ufffd', 2505 b'%.5s', 'abc[\u20ac]'.encode('utf8')) 2506 check_format("'\\u20acABC'", 2507 b'%A', '\u20acABC') 2508 check_format("'\\u20", 2509 b'%.5A', '\u20acABCDEF') 2510 check_format("'\u20acABC'", 2511 b'%R', '\u20acABC') 2512 check_format("'\u20acA", 2513 b'%.3R', '\u20acABCDEF') 2514 check_format('\u20acAB', 2515 b'%.3S', '\u20acABCDEF') 2516 check_format('\u20acAB', 2517 b'%.3U', '\u20acABCDEF') 2518 check_format('\u20acAB', 2519 b'%.3V', '\u20acABCDEF', None) 2520 check_format('abc[\ufffd', 2521 b'%.5V', None, 'abc[\u20ac]'.encode('utf8')) 2522 2523 # following tests comes from #7330 2524 # test width modifier and precision modifier with %S 2525 check_format("repr= abc", 2526 b'repr=%5S', 'abc') 2527 check_format("repr=ab", 2528 b'repr=%.2S', 'abc') 2529 check_format("repr= ab", 2530 b'repr=%5.2S', 'abc') 2531 2532 # test width modifier and precision modifier with %R 2533 check_format("repr= 'abc'", 2534 b'repr=%8R', 'abc') 2535 check_format("repr='ab", 2536 b'repr=%.3R', 'abc') 2537 check_format("repr= 'ab", 2538 b'repr=%5.3R', 'abc') 2539 2540 # test width modifier and precision modifier with %A 2541 check_format("repr= 'abc'", 2542 b'repr=%8A', 'abc') 2543 check_format("repr='ab", 2544 b'repr=%.3A', 'abc') 2545 check_format("repr= 'ab", 2546 b'repr=%5.3A', 'abc') 2547 2548 # test width modifier and precision modifier with %s 2549 check_format("repr= abc", 2550 b'repr=%5s', b'abc') 2551 check_format("repr=ab", 2552 b'repr=%.2s', b'abc') 2553 check_format("repr= ab", 2554 b'repr=%5.2s', b'abc') 2555 2556 # test width modifier and precision modifier with %U 2557 check_format("repr= abc", 2558 b'repr=%5U', 'abc') 2559 check_format("repr=ab", 2560 b'repr=%.2U', 'abc') 2561 check_format("repr= ab", 2562 b'repr=%5.2U', 'abc') 2563 2564 # test width modifier and precision modifier with %V 2565 check_format("repr= abc", 2566 b'repr=%5V', 'abc', b'123') 2567 check_format("repr=ab", 2568 b'repr=%.2V', 'abc', b'123') 2569 check_format("repr= ab", 
2570 b'repr=%5.2V', 'abc', b'123') 2571 check_format("repr= 123", 2572 b'repr=%5V', None, b'123') 2573 check_format("repr=12", 2574 b'repr=%.2V', None, b'123') 2575 check_format("repr= 12", 2576 b'repr=%5.2V', None, b'123') 2577 2578 # test integer formats (%i, %d, %u) 2579 check_format('010', 2580 b'%03i', c_int(10)) 2581 check_format('0010', 2582 b'%0.4i', c_int(10)) 2583 check_format('-123', 2584 b'%i', c_int(-123)) 2585 check_format('-123', 2586 b'%li', c_long(-123)) 2587 check_format('-123', 2588 b'%lli', c_longlong(-123)) 2589 check_format('-123', 2590 b'%zi', c_ssize_t(-123)) 2591 2592 check_format('-123', 2593 b'%d', c_int(-123)) 2594 check_format('-123', 2595 b'%ld', c_long(-123)) 2596 check_format('-123', 2597 b'%lld', c_longlong(-123)) 2598 check_format('-123', 2599 b'%zd', c_ssize_t(-123)) 2600 2601 check_format('123', 2602 b'%u', c_uint(123)) 2603 check_format('123', 2604 b'%lu', c_ulong(123)) 2605 check_format('123', 2606 b'%llu', c_ulonglong(123)) 2607 check_format('123', 2608 b'%zu', c_size_t(123)) 2609 2610 # test long output 2611 min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1)) 2612 max_longlong = -min_longlong - 1 2613 check_format(str(min_longlong), 2614 b'%lld', c_longlong(min_longlong)) 2615 check_format(str(max_longlong), 2616 b'%lld', c_longlong(max_longlong)) 2617 max_ulonglong = 2 ** (8 * sizeof(c_ulonglong)) - 1 2618 check_format(str(max_ulonglong), 2619 b'%llu', c_ulonglong(max_ulonglong)) 2620 PyUnicode_FromFormat(b'%p', c_void_p(-1)) 2621 2622 # test padding (width and/or precision) 2623 check_format('123'.rjust(10, '0'), 2624 b'%010i', c_int(123)) 2625 check_format('123'.rjust(100), 2626 b'%100i', c_int(123)) 2627 check_format('123'.rjust(100, '0'), 2628 b'%.100i', c_int(123)) 2629 check_format('123'.rjust(80, '0').rjust(100), 2630 b'%100.80i', c_int(123)) 2631 2632 check_format('123'.rjust(10, '0'), 2633 b'%010u', c_uint(123)) 2634 check_format('123'.rjust(100), 2635 b'%100u', c_uint(123)) 2636 check_format('123'.rjust(100, '0'), 
2637 b'%.100u', c_uint(123)) 2638 check_format('123'.rjust(80, '0').rjust(100), 2639 b'%100.80u', c_uint(123)) 2640 2641 check_format('123'.rjust(10, '0'), 2642 b'%010x', c_int(0x123)) 2643 check_format('123'.rjust(100), 2644 b'%100x', c_int(0x123)) 2645 check_format('123'.rjust(100, '0'), 2646 b'%.100x', c_int(0x123)) 2647 check_format('123'.rjust(80, '0').rjust(100), 2648 b'%100.80x', c_int(0x123)) 2649 2650 # test %A 2651 check_format(r"%A:'abc\xe9\uabcd\U0010ffff'", 2652 b'%%A:%A', 'abc\xe9\uabcd\U0010ffff') 2653 2654 # test %V 2655 check_format('repr=abc', 2656 b'repr=%V', 'abc', b'xyz') 2657 2658 # Test string decode from parameter of %s using utf-8. 2659 # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of 2660 # '\u4eba\u6c11' 2661 check_format('repr=\u4eba\u6c11', 2662 b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91') 2663 2664 #Test replace error handler. 2665 check_format('repr=abc\ufffd', 2666 b'repr=%V', None, b'abc\xff') 2667 2668 # not supported: copy the raw format string. 
# Test PyUnicode_AsWideChar()
@support.cpython_only
def test_aswidechar(self):
    """Check _testcapi.unicode_aswidechar() for several buffer lengths.

    The helper is called with a string and a buffer length and returns a
    (wchar, size) pair; both members are compared with expected values.
    """
    from _testcapi import unicode_aswidechar
    support.import_module('ctypes')
    from ctypes import c_wchar, sizeof

    # Each entry: (text, buffer length, expected size, expected buffer).
    cases = [
        ('abcdef', 2, 2, 'ab'),
        ('abc', 3, 3, 'abc'),
        ('abc', 4, 3, 'abc\0'),
        ('abc', 10, 3, 'abc\0'),
        ('abc\0def', 20, 7, 'abc\0def\0'),
    ]

    # The buffer length and reported size for a non-BMP character depend
    # on the platform's wchar_t width.
    nonbmp = chr(0x10ffff)
    if sizeof(c_wchar) == 2:
        cases.append((nonbmp, 3, 2, nonbmp + '\0'))
    else:  # sizeof(c_wchar) == 4
        cases.append((nonbmp, 2, 1, nonbmp + '\0'))

    for text, buflen, expected_size, expected_wchar in cases:
        wchar, size = unicode_aswidechar(text, buflen)
        self.assertEqual(size, expected_size)
        self.assertEqual(wchar, expected_wchar)
# Test PyUnicode_FindChar()
@support.cpython_only
def test_findchar(self):
    """Exercise PyUnicode_FindChar() through _testcapi.unicode_findchar().

    The helper is called as unicode_findchar(text, ch, start, end,
    direction) with direction 1 or -1.  Each assertion compares the
    returned value with the expected index, or with -1 when no match is
    expected.
    """
    from _testcapi import unicode_findchar

    # Every character is unique within its string, so the reported index
    # must be i for both directions.
    for text in "\xa1", "\u8000\u8080", "\ud800\udc02", "\U0001f100\U0001f1f1":
        for i, ch in enumerate(text):
            self.assertEqual(unicode_findchar(text, ord(ch), 0, len(text), 1), i)
            self.assertEqual(unicode_findchar(text, ord(ch), 0, len(text), -1), i)

    text = "!>_<!"
    # 0x110000 is not a character of the string: nothing to find.
    self.assertEqual(unicode_findchar(text, 0x110000, 0, len(text), 1), -1)
    self.assertEqual(unicode_findchar(text, 0x110000, 0, len(text), -1), -1)
    # start < end
    self.assertEqual(unicode_findchar(text, ord('!'), 1, len(text)+1, 1), 4)
    self.assertEqual(unicode_findchar(text, ord('!'), 1, len(text)+1, -1), 4)
    # start >= end
    self.assertEqual(unicode_findchar(text, ord('!'), 0, 0, 1), -1)
    self.assertEqual(unicode_findchar(text, ord('!'), len(text), 0, 1), -1)
    # negative
    self.assertEqual(unicode_findchar(text, ord('!'), -len(text), -1, 1), 0)
    self.assertEqual(unicode_findchar(text, ord('!'), -len(text), -1, -1), 0)
@support.cpython_only
def test_encode_decimal(self):
    """Check _testcapi.unicode_encodedecimal() on valid input and on errors."""
    from _testcapi import unicode_encodedecimal

    # Inputs the helper must convert, with the exact bytes expected back.
    convertible = (
        ('123', b'123'),
        ('\u0663.\u0661\u0664', b'3.14'),
        ("\N{EM SPACE}3.14\N{EN SPACE}", b' 3.14 '),
    )
    for text, expected in convertible:
        self.assertEqual(unicode_encodedecimal(text), expected)

    # With the "strict" handler this input must raise UnicodeEncodeError.
    with self.assertRaises(UnicodeEncodeError):
        unicode_encodedecimal("123\u20ac", "strict")

    # With the "replace" handler the same input must raise ValueError.
    with self.assertRaisesRegex(
            ValueError, "^'decimal' codec can't encode character"):
        unicode_encodedecimal("123\u20ac", "replace")
2867 s += chr(k) 2868 # Parsing with the "s#" format code calls indirectly 2869 # PyUnicode_AsUTF8AndSize() which creates the UTF-8 2870 # encoded string cached in the Unicode object. 2871 self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1)) 2872 # Check that the second call returns the same result 2873 self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1)) 2874 2875class StringModuleTest(unittest.TestCase): 2876 def test_formatter_parser(self): 2877 def parse(format): 2878 return list(_string.formatter_parser(format)) 2879 2880 formatter = parse("prefix {2!s}xxx{0:^+10.3f}{obj.attr!s} {z[0]!s:10}") 2881 self.assertEqual(formatter, [ 2882 ('prefix ', '2', '', 's'), 2883 ('xxx', '0', '^+10.3f', None), 2884 ('', 'obj.attr', '', 's'), 2885 (' ', 'z[0]', '10', 's'), 2886 ]) 2887 2888 formatter = parse("prefix {} suffix") 2889 self.assertEqual(formatter, [ 2890 ('prefix ', '', '', None), 2891 (' suffix', None, None, None), 2892 ]) 2893 2894 formatter = parse("str") 2895 self.assertEqual(formatter, [ 2896 ('str', None, None, None), 2897 ]) 2898 2899 formatter = parse("") 2900 self.assertEqual(formatter, []) 2901 2902 formatter = parse("{0}") 2903 self.assertEqual(formatter, [ 2904 ('', '0', '', None), 2905 ]) 2906 2907 self.assertRaises(TypeError, _string.formatter_parser, 1) 2908 2909 def test_formatter_field_name_split(self): 2910 def split(name): 2911 items = list(_string.formatter_field_name_split(name)) 2912 items[1] = list(items[1]) 2913 return items 2914 self.assertEqual(split("obj"), ["obj", []]) 2915 self.assertEqual(split("obj.arg"), ["obj", [(True, 'arg')]]) 2916 self.assertEqual(split("obj[key]"), ["obj", [(False, 'key')]]) 2917 self.assertEqual(split("obj.arg[key1][key2]"), [ 2918 "obj", 2919 [(True, 'arg'), 2920 (False, 'key1'), 2921 (False, 'key2'), 2922 ]]) 2923 self.assertRaises(TypeError, _string.formatter_field_name_split, 1) 2924 2925 2926if __name__ == "__main__": 2927 unittest.main() 2928