• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1""" Test script for the Unicode implementation.
2
3Written by Marc-Andre Lemburg (mal@lemburg.com).
4
5(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6
7"""
8import _string
9import codecs
10import itertools
11import operator
12import struct
13import sys
14import textwrap
15import unicodedata
16import unittest
17import warnings
18from test.support import import_helper
19from test.support import warnings_helper
20from test import support, string_tests
21from test.support.script_helper import assert_python_failure
22
23# Error handling (bad decoder return)
def search_function(encoding):
    """Return a deliberately broken codec tuple for the test encodings.

    ``test.unicode1`` yields an encoder/decoder pair that return a bare
    int instead of a tuple; ``test.unicode2`` yields a pair that return
    a tuple of two ints.  The last two slots of the returned tuple are
    always None.  Any other encoding name gives None.
    """
    if encoding == "test.unicode1":
        def scalar_encode(input, errors="strict"):
            return 42  # not a tuple

        def scalar_decode(input, errors="strict"):
            return 42  # not a tuple

        return (scalar_encode, scalar_decode, None, None)

    if encoding == "test.unicode2":
        def int_pair_encode(input, errors="strict"):
            return (42, 42)  # no unicode

        def int_pair_decode(input, errors="strict"):
            return (42, 42)  # no unicode

        return (int_pair_encode, int_pair_decode, None, None)

    return None
39
def duplicate_string(text):
    """
    Return a copy of *text* built by an encode/decode round trip.

    The goal is a fresh object with a reference count of 1.  This is
    only best-effort: latin1 single letters and the empty string ('')
    are singletons and cannot be cloned.
    """
    encoded = text.encode()
    return encoded.decode()
49
class StrSubclass(str):
    """Plain str subclass that adds no behaviour of its own."""
52
53class UnicodeTest(string_tests.CommonTest,
54        string_tests.MixinStrUnicodeUserStringTest,
55        string_tests.MixinStrUnicodeTest,
56        unittest.TestCase):
57
58    type2test = str
59
60    def setUp(self):
61        codecs.register(search_function)
62        self.addCleanup(codecs.unregister, search_function)
63
64    def checkequalnofix(self, result, object, methodname, *args):
65        method = getattr(object, methodname)
66        realresult = method(*args)
67        self.assertEqual(realresult, result)
68        self.assertTrue(type(realresult) is type(result))
69
70        # if the original is returned make sure that
71        # this doesn't happen with subclasses
72        if realresult is object:
73            class usub(str):
74                def __repr__(self):
75                    return 'usub(%r)' % str.__repr__(self)
76            object = usub(object)
77            method = getattr(object, methodname)
78            realresult = method(*args)
79            self.assertEqual(realresult, result)
80            self.assertTrue(object is not realresult)
81
82    def test_literals(self):
83        self.assertEqual('\xff', '\u00ff')
84        self.assertEqual('\uffff', '\U0000ffff')
85        self.assertRaises(SyntaxError, eval, '\'\\Ufffffffe\'')
86        self.assertRaises(SyntaxError, eval, '\'\\Uffffffff\'')
87        self.assertRaises(SyntaxError, eval, '\'\\U%08x\'' % 0x110000)
88        # raw strings should not have unicode escapes
89        self.assertNotEqual(r"\u0020", " ")
90
91    def test_ascii(self):
92        if not sys.platform.startswith('java'):
93            # Test basic sanity of repr()
94            self.assertEqual(ascii('abc'), "'abc'")
95            self.assertEqual(ascii('ab\\c'), "'ab\\\\c'")
96            self.assertEqual(ascii('ab\\'), "'ab\\\\'")
97            self.assertEqual(ascii('\\c'), "'\\\\c'")
98            self.assertEqual(ascii('\\'), "'\\\\'")
99            self.assertEqual(ascii('\n'), "'\\n'")
100            self.assertEqual(ascii('\r'), "'\\r'")
101            self.assertEqual(ascii('\t'), "'\\t'")
102            self.assertEqual(ascii('\b'), "'\\x08'")
103            self.assertEqual(ascii("'\""), """'\\'"'""")
104            self.assertEqual(ascii("'\""), """'\\'"'""")
105            self.assertEqual(ascii("'"), '''"'"''')
106            self.assertEqual(ascii('"'), """'"'""")
107            latin1repr = (
108                "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
109                "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
110                "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
111                "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
112                "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
113                "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
114                "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
115                "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
116                "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
117                "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
118                "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
119                "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
120                "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
121                "\\xfe\\xff'")
122            testrepr = ascii(''.join(map(chr, range(256))))
123            self.assertEqual(testrepr, latin1repr)
124            # Test ascii works on wide unicode escapes without overflow.
125            self.assertEqual(ascii("\U00010000" * 39 + "\uffff" * 4096),
126                             ascii("\U00010000" * 39 + "\uffff" * 4096))
127
128            class WrongRepr:
129                def __repr__(self):
130                    return b'byte-repr'
131            self.assertRaises(TypeError, ascii, WrongRepr())
132
133    def test_repr(self):
134        if not sys.platform.startswith('java'):
135            # Test basic sanity of repr()
136            self.assertEqual(repr('abc'), "'abc'")
137            self.assertEqual(repr('ab\\c'), "'ab\\\\c'")
138            self.assertEqual(repr('ab\\'), "'ab\\\\'")
139            self.assertEqual(repr('\\c'), "'\\\\c'")
140            self.assertEqual(repr('\\'), "'\\\\'")
141            self.assertEqual(repr('\n'), "'\\n'")
142            self.assertEqual(repr('\r'), "'\\r'")
143            self.assertEqual(repr('\t'), "'\\t'")
144            self.assertEqual(repr('\b'), "'\\x08'")
145            self.assertEqual(repr("'\""), """'\\'"'""")
146            self.assertEqual(repr("'\""), """'\\'"'""")
147            self.assertEqual(repr("'"), '''"'"''')
148            self.assertEqual(repr('"'), """'"'""")
149            latin1repr = (
150                "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
151                "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
152                "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
153                "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
154                "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
155                "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
156                "\\x9c\\x9d\\x9e\\x9f\\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9"
157                "\xaa\xab\xac\\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7"
158                "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5"
159                "\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3"
160                "\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1"
161                "\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
162                "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd"
163                "\xfe\xff'")
164            testrepr = repr(''.join(map(chr, range(256))))
165            self.assertEqual(testrepr, latin1repr)
166            # Test repr works on wide unicode escapes without overflow.
167            self.assertEqual(repr("\U00010000" * 39 + "\uffff" * 4096),
168                             repr("\U00010000" * 39 + "\uffff" * 4096))
169
170            class WrongRepr:
171                def __repr__(self):
172                    return b'byte-repr'
173            self.assertRaises(TypeError, repr, WrongRepr())
174
175    def test_iterators(self):
176        # Make sure unicode objects have an __iter__ method
177        it = "\u1111\u2222\u3333".__iter__()
178        self.assertEqual(next(it), "\u1111")
179        self.assertEqual(next(it), "\u2222")
180        self.assertEqual(next(it), "\u3333")
181        self.assertRaises(StopIteration, next, it)
182
183    def test_count(self):
184        string_tests.CommonTest.test_count(self)
185        # check mixed argument types
186        self.checkequalnofix(3,  'aaa', 'count', 'a')
187        self.checkequalnofix(0,  'aaa', 'count', 'b')
188        self.checkequalnofix(3, 'aaa', 'count',  'a')
189        self.checkequalnofix(0, 'aaa', 'count',  'b')
190        self.checkequalnofix(0, 'aaa', 'count',  'b')
191        self.checkequalnofix(1, 'aaa', 'count',  'a', -1)
192        self.checkequalnofix(3, 'aaa', 'count',  'a', -10)
193        self.checkequalnofix(2, 'aaa', 'count',  'a', 0, -1)
194        self.checkequalnofix(0, 'aaa', 'count',  'a', 0, -10)
195        # test mixed kinds
196        self.checkequal(10, '\u0102' + 'a' * 10, 'count', 'a')
197        self.checkequal(10, '\U00100304' + 'a' * 10, 'count', 'a')
198        self.checkequal(10, '\U00100304' + '\u0102' * 10, 'count', '\u0102')
199        self.checkequal(0, 'a' * 10, 'count', '\u0102')
200        self.checkequal(0, 'a' * 10, 'count', '\U00100304')
201        self.checkequal(0, '\u0102' * 10, 'count', '\U00100304')
202        self.checkequal(10, '\u0102' + 'a_' * 10, 'count', 'a_')
203        self.checkequal(10, '\U00100304' + 'a_' * 10, 'count', 'a_')
204        self.checkequal(10, '\U00100304' + '\u0102_' * 10, 'count', '\u0102_')
205        self.checkequal(0, 'a' * 10, 'count', 'a\u0102')
206        self.checkequal(0, 'a' * 10, 'count', 'a\U00100304')
207        self.checkequal(0, '\u0102' * 10, 'count', '\u0102\U00100304')
208
209    def test_find(self):
210        string_tests.CommonTest.test_find(self)
211        # test implementation details of the memchr fast path
212        self.checkequal(100, 'a' * 100 + '\u0102', 'find', '\u0102')
213        self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0201')
214        self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0120')
215        self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0220')
216        self.checkequal(100, 'a' * 100 + '\U00100304', 'find', '\U00100304')
217        self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00100204')
218        self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00102004')
219        # check mixed argument types
220        self.checkequalnofix(0,  'abcdefghiabc', 'find', 'abc')
221        self.checkequalnofix(9,  'abcdefghiabc', 'find', 'abc', 1)
222        self.checkequalnofix(-1, 'abcdefghiabc', 'find', 'def', 4)
223
224        self.assertRaises(TypeError, 'hello'.find)
225        self.assertRaises(TypeError, 'hello'.find, 42)
226        # test mixed kinds
227        self.checkequal(100, '\u0102' * 100 + 'a', 'find', 'a')
228        self.checkequal(100, '\U00100304' * 100 + 'a', 'find', 'a')
229        self.checkequal(100, '\U00100304' * 100 + '\u0102', 'find', '\u0102')
230        self.checkequal(-1, 'a' * 100, 'find', '\u0102')
231        self.checkequal(-1, 'a' * 100, 'find', '\U00100304')
232        self.checkequal(-1, '\u0102' * 100, 'find', '\U00100304')
233        self.checkequal(100, '\u0102' * 100 + 'a_', 'find', 'a_')
234        self.checkequal(100, '\U00100304' * 100 + 'a_', 'find', 'a_')
235        self.checkequal(100, '\U00100304' * 100 + '\u0102_', 'find', '\u0102_')
236        self.checkequal(-1, 'a' * 100, 'find', 'a\u0102')
237        self.checkequal(-1, 'a' * 100, 'find', 'a\U00100304')
238        self.checkequal(-1, '\u0102' * 100, 'find', '\u0102\U00100304')
239
240    def test_rfind(self):
241        string_tests.CommonTest.test_rfind(self)
242        # test implementation details of the memrchr fast path
243        self.checkequal(0, '\u0102' + 'a' * 100 , 'rfind', '\u0102')
244        self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0201')
245        self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0120')
246        self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0220')
247        self.checkequal(0, '\U00100304' + 'a' * 100, 'rfind', '\U00100304')
248        self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00100204')
249        self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00102004')
250        # check mixed argument types
251        self.checkequalnofix(9,   'abcdefghiabc', 'rfind', 'abc')
252        self.checkequalnofix(12,  'abcdefghiabc', 'rfind', '')
253        self.checkequalnofix(12, 'abcdefghiabc', 'rfind',  '')
254        # test mixed kinds
255        self.checkequal(0, 'a' + '\u0102' * 100, 'rfind', 'a')
256        self.checkequal(0, 'a' + '\U00100304' * 100, 'rfind', 'a')
257        self.checkequal(0, '\u0102' + '\U00100304' * 100, 'rfind', '\u0102')
258        self.checkequal(-1, 'a' * 100, 'rfind', '\u0102')
259        self.checkequal(-1, 'a' * 100, 'rfind', '\U00100304')
260        self.checkequal(-1, '\u0102' * 100, 'rfind', '\U00100304')
261        self.checkequal(0, '_a' + '\u0102' * 100, 'rfind', '_a')
262        self.checkequal(0, '_a' + '\U00100304' * 100, 'rfind', '_a')
263        self.checkequal(0, '_\u0102' + '\U00100304' * 100, 'rfind', '_\u0102')
264        self.checkequal(-1, 'a' * 100, 'rfind', '\u0102a')
265        self.checkequal(-1, 'a' * 100, 'rfind', '\U00100304a')
266        self.checkequal(-1, '\u0102' * 100, 'rfind', '\U00100304\u0102')
267
268    def test_index(self):
269        string_tests.CommonTest.test_index(self)
270        self.checkequalnofix(0, 'abcdefghiabc', 'index',  '')
271        self.checkequalnofix(3, 'abcdefghiabc', 'index',  'def')
272        self.checkequalnofix(0, 'abcdefghiabc', 'index',  'abc')
273        self.checkequalnofix(9, 'abcdefghiabc', 'index',  'abc', 1)
274        self.assertRaises(ValueError, 'abcdefghiabc'.index, 'hib')
275        self.assertRaises(ValueError, 'abcdefghiab'.index,  'abc', 1)
276        self.assertRaises(ValueError, 'abcdefghi'.index,  'ghi', 8)
277        self.assertRaises(ValueError, 'abcdefghi'.index,  'ghi', -1)
278        # test mixed kinds
279        self.checkequal(100, '\u0102' * 100 + 'a', 'index', 'a')
280        self.checkequal(100, '\U00100304' * 100 + 'a', 'index', 'a')
281        self.checkequal(100, '\U00100304' * 100 + '\u0102', 'index', '\u0102')
282        self.assertRaises(ValueError, ('a' * 100).index, '\u0102')
283        self.assertRaises(ValueError, ('a' * 100).index, '\U00100304')
284        self.assertRaises(ValueError, ('\u0102' * 100).index, '\U00100304')
285        self.checkequal(100, '\u0102' * 100 + 'a_', 'index', 'a_')
286        self.checkequal(100, '\U00100304' * 100 + 'a_', 'index', 'a_')
287        self.checkequal(100, '\U00100304' * 100 + '\u0102_', 'index', '\u0102_')
288        self.assertRaises(ValueError, ('a' * 100).index, 'a\u0102')
289        self.assertRaises(ValueError, ('a' * 100).index, 'a\U00100304')
290        self.assertRaises(ValueError, ('\u0102' * 100).index, '\u0102\U00100304')
291
292    def test_rindex(self):
293        string_tests.CommonTest.test_rindex(self)
294        self.checkequalnofix(12, 'abcdefghiabc', 'rindex',  '')
295        self.checkequalnofix(3,  'abcdefghiabc', 'rindex',  'def')
296        self.checkequalnofix(9,  'abcdefghiabc', 'rindex',  'abc')
297        self.checkequalnofix(0,  'abcdefghiabc', 'rindex',  'abc', 0, -1)
298
299        self.assertRaises(ValueError, 'abcdefghiabc'.rindex,  'hib')
300        self.assertRaises(ValueError, 'defghiabc'.rindex,  'def', 1)
301        self.assertRaises(ValueError, 'defghiabc'.rindex,  'abc', 0, -1)
302        self.assertRaises(ValueError, 'abcdefghi'.rindex,  'ghi', 0, 8)
303        self.assertRaises(ValueError, 'abcdefghi'.rindex,  'ghi', 0, -1)
304        # test mixed kinds
305        self.checkequal(0, 'a' + '\u0102' * 100, 'rindex', 'a')
306        self.checkequal(0, 'a' + '\U00100304' * 100, 'rindex', 'a')
307        self.checkequal(0, '\u0102' + '\U00100304' * 100, 'rindex', '\u0102')
308        self.assertRaises(ValueError, ('a' * 100).rindex, '\u0102')
309        self.assertRaises(ValueError, ('a' * 100).rindex, '\U00100304')
310        self.assertRaises(ValueError, ('\u0102' * 100).rindex, '\U00100304')
311        self.checkequal(0, '_a' + '\u0102' * 100, 'rindex', '_a')
312        self.checkequal(0, '_a' + '\U00100304' * 100, 'rindex', '_a')
313        self.checkequal(0, '_\u0102' + '\U00100304' * 100, 'rindex', '_\u0102')
314        self.assertRaises(ValueError, ('a' * 100).rindex, '\u0102a')
315        self.assertRaises(ValueError, ('a' * 100).rindex, '\U00100304a')
316        self.assertRaises(ValueError, ('\u0102' * 100).rindex, '\U00100304\u0102')
317
318    def test_maketrans_translate(self):
319        # these work with plain translate()
320        self.checkequalnofix('bbbc', 'abababc', 'translate',
321                             {ord('a'): None})
322        self.checkequalnofix('iiic', 'abababc', 'translate',
323                             {ord('a'): None, ord('b'): ord('i')})
324        self.checkequalnofix('iiix', 'abababc', 'translate',
325                             {ord('a'): None, ord('b'): ord('i'), ord('c'): 'x'})
326        self.checkequalnofix('c', 'abababc', 'translate',
327                             {ord('a'): None, ord('b'): ''})
328        self.checkequalnofix('xyyx', 'xzx', 'translate',
329                             {ord('z'): 'yy'})
330
331        # this needs maketrans()
332        self.checkequalnofix('abababc', 'abababc', 'translate',
333                             {'b': '<i>'})
334        tbl = self.type2test.maketrans({'a': None, 'b': '<i>'})
335        self.checkequalnofix('<i><i><i>c', 'abababc', 'translate', tbl)
336        # test alternative way of calling maketrans()
337        tbl = self.type2test.maketrans('abc', 'xyz', 'd')
338        self.checkequalnofix('xyzzy', 'abdcdcbdddd', 'translate', tbl)
339
340        # various tests switching from ASCII to latin1 or the opposite;
341        # same length, remove a letter, or replace with a longer string.
342        self.assertEqual("[a]".translate(str.maketrans('a', 'X')),
343                         "[X]")
344        self.assertEqual("[a]".translate(str.maketrans({'a': 'X'})),
345                         "[X]")
346        self.assertEqual("[a]".translate(str.maketrans({'a': None})),
347                         "[]")
348        self.assertEqual("[a]".translate(str.maketrans({'a': 'XXX'})),
349                         "[XXX]")
350        self.assertEqual("[a]".translate(str.maketrans({'a': '\xe9'})),
351                         "[\xe9]")
352        self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '123'})),
353                         "x123")
354        self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '\xe9'})),
355                         "x\xe9")
356
357        # test non-ASCII (don't take the fast-path)
358        self.assertEqual("[a]".translate(str.maketrans({'a': '<\xe9>'})),
359                         "[<\xe9>]")
360        self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': 'a'})),
361                         "[a]")
362        self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': None})),
363                         "[]")
364        self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': '123'})),
365                         "[123]")
366        self.assertEqual("[a\xe9]".translate(str.maketrans({'a': '<\u20ac>'})),
367                         "[<\u20ac>\xe9]")
368
369        # invalid Unicode characters
370        invalid_char = 0x10ffff+1
371        for before in "a\xe9\u20ac\U0010ffff":
372            mapping = str.maketrans({before: invalid_char})
373            text = "[%s]" % before
374            self.assertRaises(ValueError, text.translate, mapping)
375
376        # errors
377        self.assertRaises(TypeError, self.type2test.maketrans)
378        self.assertRaises(ValueError, self.type2test.maketrans, 'abc', 'defg')
379        self.assertRaises(TypeError, self.type2test.maketrans, 2, 'def')
380        self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 2)
381        self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 'def', 2)
382        self.assertRaises(ValueError, self.type2test.maketrans, {'xy': 2})
383        self.assertRaises(TypeError, self.type2test.maketrans, {(1,): 2})
384
385        self.assertRaises(TypeError, 'hello'.translate)
386        self.assertRaises(TypeError, 'abababc'.translate, 'abc', 'xyz')
387
388    def test_split(self):
389        string_tests.CommonTest.test_split(self)
390
391        # test mixed kinds
392        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
393            left *= 9
394            right *= 9
395            for delim in ('c', '\u0102', '\U00010302'):
396                self.checkequal([left + right],
397                                left + right, 'split', delim)
398                self.checkequal([left, right],
399                                left + delim + right, 'split', delim)
400                self.checkequal([left + right],
401                                left + right, 'split', delim * 2)
402                self.checkequal([left, right],
403                                left + delim * 2 + right, 'split', delim *2)
404
405    def test_rsplit(self):
406        string_tests.CommonTest.test_rsplit(self)
407        # test mixed kinds
408        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
409            left *= 9
410            right *= 9
411            for delim in ('c', '\u0102', '\U00010302'):
412                self.checkequal([left + right],
413                                left + right, 'rsplit', delim)
414                self.checkequal([left, right],
415                                left + delim + right, 'rsplit', delim)
416                self.checkequal([left + right],
417                                left + right, 'rsplit', delim * 2)
418                self.checkequal([left, right],
419                                left + delim * 2 + right, 'rsplit', delim *2)
420
421    def test_partition(self):
422        string_tests.MixinStrUnicodeUserStringTest.test_partition(self)
423        # test mixed kinds
424        self.checkequal(('ABCDEFGH', '', ''), 'ABCDEFGH', 'partition', '\u4200')
425        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
426            left *= 9
427            right *= 9
428            for delim in ('c', '\u0102', '\U00010302'):
429                self.checkequal((left + right, '', ''),
430                                left + right, 'partition', delim)
431                self.checkequal((left, delim, right),
432                                left + delim + right, 'partition', delim)
433                self.checkequal((left + right, '', ''),
434                                left + right, 'partition', delim * 2)
435                self.checkequal((left, delim * 2, right),
436                                left + delim * 2 + right, 'partition', delim * 2)
437
438    def test_rpartition(self):
439        string_tests.MixinStrUnicodeUserStringTest.test_rpartition(self)
440        # test mixed kinds
441        self.checkequal(('', '', 'ABCDEFGH'), 'ABCDEFGH', 'rpartition', '\u4200')
442        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
443            left *= 9
444            right *= 9
445            for delim in ('c', '\u0102', '\U00010302'):
446                self.checkequal(('', '', left + right),
447                                left + right, 'rpartition', delim)
448                self.checkequal((left, delim, right),
449                                left + delim + right, 'rpartition', delim)
450                self.checkequal(('', '', left + right),
451                                left + right, 'rpartition', delim * 2)
452                self.checkequal((left, delim * 2, right),
453                                left + delim * 2 + right, 'rpartition', delim * 2)
454
455    def test_join(self):
456        string_tests.MixinStrUnicodeUserStringTest.test_join(self)
457
458        class MyWrapper:
459            def __init__(self, sval): self.sval = sval
460            def __str__(self): return self.sval
461
462        # mixed arguments
463        self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
464        self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
465        self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
466        self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
467        self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
468        self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
469        self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
470        self.checkraises(TypeError, ' ', 'join', ['1', '2', MyWrapper('foo')])
471        self.checkraises(TypeError, ' ', 'join', ['1', '2', '3', bytes()])
472        self.checkraises(TypeError, ' ', 'join', [1, 2, 3])
473        self.checkraises(TypeError, ' ', 'join', ['1', '2', 3])
474
475    @unittest.skipIf(sys.maxsize > 2**32,
476        'needs too much memory on a 64-bit platform')
477    def test_join_overflow(self):
478        size = int(sys.maxsize**0.5) + 1
479        seq = ('A' * size,) * size
480        self.assertRaises(OverflowError, ''.join, seq)
481
482    def test_replace(self):
483        string_tests.CommonTest.test_replace(self)
484
485        # method call forwarded from str implementation because of unicode argument
486        self.checkequalnofix('one@two!three!', 'one!two!three!', 'replace', '!', '@', 1)
487        self.assertRaises(TypeError, 'replace'.replace, "r", 42)
488        # test mixed kinds
489        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
490            left *= 9
491            right *= 9
492            for delim in ('c', '\u0102', '\U00010302'):
493                for repl in ('d', '\u0103', '\U00010303'):
494                    self.checkequal(left + right,
495                                    left + right, 'replace', delim, repl)
496                    self.checkequal(left + repl + right,
497                                    left + delim + right,
498                                    'replace', delim, repl)
499                    self.checkequal(left + right,
500                                    left + right, 'replace', delim * 2, repl)
501                    self.checkequal(left + repl + right,
502                                    left + delim * 2 + right,
503                                    'replace', delim * 2, repl)
504
505    @support.cpython_only
506    def test_replace_id(self):
507        pattern = 'abc'
508        text = 'abc def'
509        self.assertIs(text.replace(pattern, pattern), text)
510
511    def test_repeat_id_preserving(self):
512        a = '123abc1@'
513        b = '456zyx-+'
514        self.assertEqual(id(a), id(a))
515        self.assertNotEqual(id(a), id(b))
516        self.assertNotEqual(id(a), id(a * -4))
517        self.assertNotEqual(id(a), id(a * 0))
518        self.assertEqual(id(a), id(a * 1))
519        self.assertEqual(id(a), id(1 * a))
520        self.assertNotEqual(id(a), id(a * 2))
521
522        class SubStr(str):
523            pass
524
525        s = SubStr('qwerty()')
526        self.assertEqual(id(s), id(s))
527        self.assertNotEqual(id(s), id(s * -4))
528        self.assertNotEqual(id(s), id(s * 0))
529        self.assertNotEqual(id(s), id(s * 1))
530        self.assertNotEqual(id(s), id(1 * s))
531        self.assertNotEqual(id(s), id(s * 2))
532
533    def test_bytes_comparison(self):
534        with warnings_helper.check_warnings():
535            warnings.simplefilter('ignore', BytesWarning)
536            self.assertEqual('abc' == b'abc', False)
537            self.assertEqual('abc' != b'abc', True)
538            self.assertEqual('abc' == bytearray(b'abc'), False)
539            self.assertEqual('abc' != bytearray(b'abc'), True)
540
    def test_comparison(self):
        # Basic equality and ordering checks between str objects.  Only the
        # assertions outside the ``if 0:`` block below are ever executed.
        # Comparisons:
        self.assertEqual('abc', 'abc')
        self.assertTrue('abcd' > 'abc')
        self.assertTrue('abc' < 'abcd')

        # Disabled on purpose: ``if 0:`` keeps this code from running.
        if 0:
            # Move these tests to a Unicode collation module test...
            # Testing UTF-16 code point order comparisons...

            # No surrogates, no fixup required.
            self.assertTrue('\u0061' < '\u20ac')
            # Non surrogate below surrogate value, no fixup required
            self.assertTrue('\u0061' < '\ud800\udc02')

            # Non surrogate above surrogate value, fixup required
            def test_lecmp(s, s2):
                self.assertTrue(s < s2)

            # Compare *s* against a series of surrogate-pair strings.
            def test_fixup(s):
                s2 = '\ud800\udc01'
                test_lecmp(s, s2)
                s2 = '\ud900\udc01'
                test_lecmp(s, s2)
                s2 = '\uda00\udc01'
                test_lecmp(s, s2)
                s2 = '\udb00\udc01'
                test_lecmp(s, s2)
                s2 = '\ud800\udd01'
                test_lecmp(s, s2)
                s2 = '\ud900\udd01'
                test_lecmp(s, s2)
                s2 = '\uda00\udd01'
                test_lecmp(s, s2)
                s2 = '\udb00\udd01'
                test_lecmp(s, s2)
                s2 = '\ud800\ude01'
                test_lecmp(s, s2)
                s2 = '\ud900\ude01'
                test_lecmp(s, s2)
                s2 = '\uda00\ude01'
                test_lecmp(s, s2)
                s2 = '\udb00\ude01'
                test_lecmp(s, s2)
                s2 = '\ud800\udfff'
                test_lecmp(s, s2)
                s2 = '\ud900\udfff'
                test_lecmp(s, s2)
                s2 = '\uda00\udfff'
                test_lecmp(s, s2)
                s2 = '\udb00\udfff'
                test_lecmp(s, s2)

                # NOTE(review): these two calls are indented inside
                # test_fixup itself, so nothing ever invokes test_fixup from
                # outside; they presumably belong one level out -- confirm
                # before re-enabling this block.
                test_fixup('\ue000')
                test_fixup('\uff61')

        # Surrogates on both sides, no fixup required
        self.assertTrue('\ud800\udc02' < '\ud84d\udc56')
599
600    def test_islower(self):
601        super().test_islower()
602        self.checkequalnofix(False, '\u1FFc', 'islower')
603        self.assertFalse('\u2167'.islower())
604        self.assertTrue('\u2177'.islower())
605        # non-BMP, uppercase
606        self.assertFalse('\U00010401'.islower())
607        self.assertFalse('\U00010427'.islower())
608        # non-BMP, lowercase
609        self.assertTrue('\U00010429'.islower())
610        self.assertTrue('\U0001044E'.islower())
611        # non-BMP, non-cased
612        self.assertFalse('\U0001F40D'.islower())
613        self.assertFalse('\U0001F46F'.islower())
614
615    def test_isupper(self):
616        super().test_isupper()
617        if not sys.platform.startswith('java'):
618            self.checkequalnofix(False, '\u1FFc', 'isupper')
619        self.assertTrue('\u2167'.isupper())
620        self.assertFalse('\u2177'.isupper())
621        # non-BMP, uppercase
622        self.assertTrue('\U00010401'.isupper())
623        self.assertTrue('\U00010427'.isupper())
624        # non-BMP, lowercase
625        self.assertFalse('\U00010429'.isupper())
626        self.assertFalse('\U0001044E'.isupper())
627        # non-BMP, non-cased
628        self.assertFalse('\U0001F40D'.isupper())
629        self.assertFalse('\U0001F46F'.isupper())
630
631    def test_istitle(self):
632        super().test_istitle()
633        self.checkequalnofix(True, '\u1FFc', 'istitle')
634        self.checkequalnofix(True, 'Greek \u1FFcitlecases ...', 'istitle')
635
636        # non-BMP, uppercase + lowercase
637        self.assertTrue('\U00010401\U00010429'.istitle())
638        self.assertTrue('\U00010427\U0001044E'.istitle())
639        # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6
640        for ch in ['\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F']:
641            self.assertFalse(ch.istitle(), '{!a} is not title'.format(ch))
642
643    def test_isspace(self):
644        super().test_isspace()
645        self.checkequalnofix(True, '\u2000', 'isspace')
646        self.checkequalnofix(True, '\u200a', 'isspace')
647        self.checkequalnofix(False, '\u2014', 'isspace')
648        # There are no non-BMP whitespace chars as of Unicode 12.
649        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
650                   '\U0001F40D', '\U0001F46F']:
651            self.assertFalse(ch.isspace(), '{!a} is not space.'.format(ch))
652
653    @support.requires_resource('cpu')
654    def test_isspace_invariant(self):
655        for codepoint in range(sys.maxunicode + 1):
656            char = chr(codepoint)
657            bidirectional = unicodedata.bidirectional(char)
658            category = unicodedata.category(char)
659            self.assertEqual(char.isspace(),
660                             (bidirectional in ('WS', 'B', 'S')
661                              or category == 'Zs'))
662
663    def test_isalnum(self):
664        super().test_isalnum()
665        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
666                   '\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']:
667            self.assertTrue(ch.isalnum(), '{!a} is alnum.'.format(ch))
668
669    def test_isalpha(self):
670        super().test_isalpha()
671        self.checkequalnofix(True, '\u1FFc', 'isalpha')
672        # non-BMP, cased
673        self.assertTrue('\U00010401'.isalpha())
674        self.assertTrue('\U00010427'.isalpha())
675        self.assertTrue('\U00010429'.isalpha())
676        self.assertTrue('\U0001044E'.isalpha())
677        # non-BMP, non-cased
678        self.assertFalse('\U0001F40D'.isalpha())
679        self.assertFalse('\U0001F46F'.isalpha())
680
681    def test_isascii(self):
682        super().test_isascii()
683        self.assertFalse("\u20ac".isascii())
684        self.assertFalse("\U0010ffff".isascii())
685
686    def test_isdecimal(self):
687        self.checkequalnofix(False, '', 'isdecimal')
688        self.checkequalnofix(False, 'a', 'isdecimal')
689        self.checkequalnofix(True, '0', 'isdecimal')
690        self.checkequalnofix(False, '\u2460', 'isdecimal') # CIRCLED DIGIT ONE
691        self.checkequalnofix(False, '\xbc', 'isdecimal') # VULGAR FRACTION ONE QUARTER
692        self.checkequalnofix(True, '\u0660', 'isdecimal') # ARABIC-INDIC DIGIT ZERO
693        self.checkequalnofix(True, '0123456789', 'isdecimal')
694        self.checkequalnofix(False, '0123456789a', 'isdecimal')
695
696        self.checkraises(TypeError, 'abc', 'isdecimal', 42)
697
698        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
699                   '\U0001F40D', '\U0001F46F', '\U00011065', '\U0001F107']:
700            self.assertFalse(ch.isdecimal(), '{!a} is not decimal.'.format(ch))
701        for ch in ['\U0001D7F6', '\U00011066', '\U000104A0']:
702            self.assertTrue(ch.isdecimal(), '{!a} is decimal.'.format(ch))
703
704    def test_isdigit(self):
705        super().test_isdigit()
706        self.checkequalnofix(True, '\u2460', 'isdigit')
707        self.checkequalnofix(False, '\xbc', 'isdigit')
708        self.checkequalnofix(True, '\u0660', 'isdigit')
709
710        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
711                   '\U0001F40D', '\U0001F46F', '\U00011065']:
712            self.assertFalse(ch.isdigit(), '{!a} is not a digit.'.format(ch))
713        for ch in ['\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']:
714            self.assertTrue(ch.isdigit(), '{!a} is a digit.'.format(ch))
715
716    def test_isnumeric(self):
717        self.checkequalnofix(False, '', 'isnumeric')
718        self.checkequalnofix(False, 'a', 'isnumeric')
719        self.checkequalnofix(True, '0', 'isnumeric')
720        self.checkequalnofix(True, '\u2460', 'isnumeric')
721        self.checkequalnofix(True, '\xbc', 'isnumeric')
722        self.checkequalnofix(True, '\u0660', 'isnumeric')
723        self.checkequalnofix(True, '0123456789', 'isnumeric')
724        self.checkequalnofix(False, '0123456789a', 'isnumeric')
725
726        self.assertRaises(TypeError, "abc".isnumeric, 42)
727
728        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
729                   '\U0001F40D', '\U0001F46F']:
730            self.assertFalse(ch.isnumeric(), '{!a} is not numeric.'.format(ch))
731        for ch in ['\U00011065', '\U0001D7F6', '\U00011066',
732                   '\U000104A0', '\U0001F107']:
733            self.assertTrue(ch.isnumeric(), '{!a} is numeric.'.format(ch))
734
735    def test_isidentifier(self):
736        self.assertTrue("a".isidentifier())
737        self.assertTrue("Z".isidentifier())
738        self.assertTrue("_".isidentifier())
739        self.assertTrue("b0".isidentifier())
740        self.assertTrue("bc".isidentifier())
741        self.assertTrue("b_".isidentifier())
742        self.assertTrue("µ".isidentifier())
743        self.assertTrue("��������������".isidentifier())
744
745        self.assertFalse(" ".isidentifier())
746        self.assertFalse("[".isidentifier())
747        self.assertFalse("©".isidentifier())
748        self.assertFalse("0".isidentifier())
749
750    @support.cpython_only
751    @support.requires_legacy_unicode_capi
752    def test_isidentifier_legacy(self):
753        import _testcapi
754        u = '��������������'
755        self.assertTrue(u.isidentifier())
756        with warnings_helper.check_warnings():
757            warnings.simplefilter('ignore', DeprecationWarning)
758            self.assertTrue(_testcapi.unicode_legacy_string(u).isidentifier())
759
760    def test_isprintable(self):
761        self.assertTrue("".isprintable())
762        self.assertTrue(" ".isprintable())
763        self.assertTrue("abcdefg".isprintable())
764        self.assertFalse("abcdefg\n".isprintable())
765        # some defined Unicode character
766        self.assertTrue("\u0374".isprintable())
767        # undefined character
768        self.assertFalse("\u0378".isprintable())
769        # single surrogate character
770        self.assertFalse("\ud800".isprintable())
771
772        self.assertTrue('\U0001F46F'.isprintable())
773        self.assertFalse('\U000E0020'.isprintable())
774
775    def test_surrogates(self):
776        for s in ('a\uD800b\uDFFF', 'a\uDFFFb\uD800',
777                  'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
778            self.assertTrue(s.islower())
779            self.assertFalse(s.isupper())
780            self.assertFalse(s.istitle())
781        for s in ('A\uD800B\uDFFF', 'A\uDFFFB\uD800',
782                  'A\uD800B\uDFFFA', 'A\uDFFFB\uD800A'):
783            self.assertFalse(s.islower())
784            self.assertTrue(s.isupper())
785            self.assertTrue(s.istitle())
786
787        for meth_name in ('islower', 'isupper', 'istitle'):
788            meth = getattr(str, meth_name)
789            for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF'):
790                self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
791
792        for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace',
793                          'isdecimal', 'isnumeric',
794                          'isidentifier', 'isprintable'):
795            meth = getattr(str, meth_name)
796            for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF',
797                      'a\uD800b\uDFFF', 'a\uDFFFb\uD800',
798                      'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
799                self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
800
801
802    def test_lower(self):
803        string_tests.CommonTest.test_lower(self)
804        self.assertEqual('\U00010427'.lower(), '\U0001044F')
805        self.assertEqual('\U00010427\U00010427'.lower(),
806                         '\U0001044F\U0001044F')
807        self.assertEqual('\U00010427\U0001044F'.lower(),
808                         '\U0001044F\U0001044F')
809        self.assertEqual('X\U00010427x\U0001044F'.lower(),
810                         'x\U0001044Fx\U0001044F')
811        self.assertEqual('fi'.lower(), 'fi')
812        self.assertEqual('\u0130'.lower(), '\u0069\u0307')
813        # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
814        self.assertEqual('\u03a3'.lower(), '\u03c3')
815        self.assertEqual('\u0345\u03a3'.lower(), '\u0345\u03c3')
816        self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
817        self.assertEqual('A\u0345\u03a3a'.lower(), 'a\u0345\u03c3a')
818        self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
819        self.assertEqual('A\u03a3\u0345'.lower(), 'a\u03c2\u0345')
820        self.assertEqual('\u03a3\u0345 '.lower(), '\u03c3\u0345 ')
821        self.assertEqual('\U0008fffe'.lower(), '\U0008fffe')
822        self.assertEqual('\u2177'.lower(), '\u2177')
823
824    def test_casefold(self):
825        self.assertEqual('hello'.casefold(), 'hello')
826        self.assertEqual('hELlo'.casefold(), 'hello')
827        self.assertEqual('ß'.casefold(), 'ss')
828        self.assertEqual('fi'.casefold(), 'fi')
829        self.assertEqual('\u03a3'.casefold(), '\u03c3')
830        self.assertEqual('A\u0345\u03a3'.casefold(), 'a\u03b9\u03c3')
831        self.assertEqual('\u00b5'.casefold(), '\u03bc')
832
833    def test_upper(self):
834        string_tests.CommonTest.test_upper(self)
835        self.assertEqual('\U0001044F'.upper(), '\U00010427')
836        self.assertEqual('\U0001044F\U0001044F'.upper(),
837                         '\U00010427\U00010427')
838        self.assertEqual('\U00010427\U0001044F'.upper(),
839                         '\U00010427\U00010427')
840        self.assertEqual('X\U00010427x\U0001044F'.upper(),
841                         'X\U00010427X\U00010427')
842        self.assertEqual('fi'.upper(), 'FI')
843        self.assertEqual('\u0130'.upper(), '\u0130')
844        self.assertEqual('\u03a3'.upper(), '\u03a3')
845        self.assertEqual('ß'.upper(), 'SS')
846        self.assertEqual('\u1fd2'.upper(), '\u0399\u0308\u0300')
847        self.assertEqual('\U0008fffe'.upper(), '\U0008fffe')
848        self.assertEqual('\u2177'.upper(), '\u2167')
849
850    def test_capitalize(self):
851        string_tests.CommonTest.test_capitalize(self)
852        self.assertEqual('\U0001044F'.capitalize(), '\U00010427')
853        self.assertEqual('\U0001044F\U0001044F'.capitalize(),
854                         '\U00010427\U0001044F')
855        self.assertEqual('\U00010427\U0001044F'.capitalize(),
856                         '\U00010427\U0001044F')
857        self.assertEqual('\U0001044F\U00010427'.capitalize(),
858                         '\U00010427\U0001044F')
859        self.assertEqual('X\U00010427x\U0001044F'.capitalize(),
860                         'X\U0001044Fx\U0001044F')
861        self.assertEqual('h\u0130'.capitalize(), 'H\u0069\u0307')
862        exp = '\u0399\u0308\u0300\u0069\u0307'
863        self.assertEqual('\u1fd2\u0130'.capitalize(), exp)
864        self.assertEqual('finnish'.capitalize(), 'Finnish')
865        self.assertEqual('A\u0345\u03a3'.capitalize(), 'A\u0345\u03c2')
866
867    def test_title(self):
868        super().test_title()
869        self.assertEqual('\U0001044F'.title(), '\U00010427')
870        self.assertEqual('\U0001044F\U0001044F'.title(),
871                         '\U00010427\U0001044F')
872        self.assertEqual('\U0001044F\U0001044F \U0001044F\U0001044F'.title(),
873                         '\U00010427\U0001044F \U00010427\U0001044F')
874        self.assertEqual('\U00010427\U0001044F \U00010427\U0001044F'.title(),
875                         '\U00010427\U0001044F \U00010427\U0001044F')
876        self.assertEqual('\U0001044F\U00010427 \U0001044F\U00010427'.title(),
877                         '\U00010427\U0001044F \U00010427\U0001044F')
878        self.assertEqual('X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(),
879                         'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F')
880        self.assertEqual('fiNNISH'.title(), 'Finnish')
881        self.assertEqual('A\u03a3 \u1fa1xy'.title(), 'A\u03c2 \u1fa9xy')
882        self.assertEqual('A\u03a3A'.title(), 'A\u03c3a')
883
884    def test_swapcase(self):
885        string_tests.CommonTest.test_swapcase(self)
886        self.assertEqual('\U0001044F'.swapcase(), '\U00010427')
887        self.assertEqual('\U00010427'.swapcase(), '\U0001044F')
888        self.assertEqual('\U0001044F\U0001044F'.swapcase(),
889                         '\U00010427\U00010427')
890        self.assertEqual('\U00010427\U0001044F'.swapcase(),
891                         '\U0001044F\U00010427')
892        self.assertEqual('\U0001044F\U00010427'.swapcase(),
893                         '\U00010427\U0001044F')
894        self.assertEqual('X\U00010427x\U0001044F'.swapcase(),
895                         'x\U0001044FX\U00010427')
896        self.assertEqual('fi'.swapcase(), 'FI')
897        self.assertEqual('\u0130'.swapcase(), '\u0069\u0307')
898        # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
899        self.assertEqual('\u03a3'.swapcase(), '\u03c3')
900        self.assertEqual('\u0345\u03a3'.swapcase(), '\u0399\u03c3')
901        self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
902        self.assertEqual('A\u0345\u03a3a'.swapcase(), 'a\u0399\u03c3A')
903        self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
904        self.assertEqual('A\u03a3\u0345'.swapcase(), 'a\u03c2\u0399')
905        self.assertEqual('\u03a3\u0345 '.swapcase(), '\u03c3\u0399 ')
906        self.assertEqual('\u03a3'.swapcase(), '\u03c3')
907        self.assertEqual('ß'.swapcase(), 'SS')
908        self.assertEqual('\u1fd2'.swapcase(), '\u0399\u0308\u0300')
909
910    def test_center(self):
911        string_tests.CommonTest.test_center(self)
912        self.assertEqual('x'.center(2, '\U0010FFFF'),
913                         'x\U0010FFFF')
914        self.assertEqual('x'.center(3, '\U0010FFFF'),
915                         '\U0010FFFFx\U0010FFFF')
916        self.assertEqual('x'.center(4, '\U0010FFFF'),
917                         '\U0010FFFFx\U0010FFFF\U0010FFFF')
918
919    @unittest.skipUnless(sys.maxsize == 2**31 - 1, "requires 32-bit system")
920    @support.cpython_only
921    def test_case_operation_overflow(self):
922        # Issue #22643
923        size = 2**32//12 + 1
924        try:
925            s = "ü" * size
926        except MemoryError:
927            self.skipTest('no enough memory (%.0f MiB required)' % (size / 2**20))
928        try:
929            self.assertRaises(OverflowError, s.upper)
930        finally:
931            del s
932
933    def test_contains(self):
934        # Testing Unicode contains method
935        self.assertIn('a', 'abdb')
936        self.assertIn('a', 'bdab')
937        self.assertIn('a', 'bdaba')
938        self.assertIn('a', 'bdba')
939        self.assertNotIn('a', 'bdb')
940        self.assertIn('a', 'bdba')
941        self.assertIn('a', ('a',1,None))
942        self.assertIn('a', (1,None,'a'))
943        self.assertIn('a', ('a',1,None))
944        self.assertIn('a', (1,None,'a'))
945        self.assertNotIn('a', ('x',1,'y'))
946        self.assertNotIn('a', ('x',1,None))
947        self.assertNotIn('abcd', 'abcxxxx')
948        self.assertIn('ab', 'abcd')
949        self.assertIn('ab', 'abc')
950        self.assertIn('ab', (1,None,'ab'))
951        self.assertIn('', 'abc')
952        self.assertIn('', '')
953        self.assertIn('', 'abc')
954        self.assertNotIn('\0', 'abc')
955        self.assertIn('\0', '\0abc')
956        self.assertIn('\0', 'abc\0')
957        self.assertIn('a', '\0abc')
958        self.assertIn('asdf', 'asdf')
959        self.assertNotIn('asdf', 'asd')
960        self.assertNotIn('asdf', '')
961
962        self.assertRaises(TypeError, "abc".__contains__)
963        # test mixed kinds
964        for fill in ('a', '\u0100', '\U00010300'):
965            fill *= 9
966            for delim in ('c', '\u0102', '\U00010302'):
967                self.assertNotIn(delim, fill)
968                self.assertIn(delim, fill + delim)
969                self.assertNotIn(delim * 2, fill)
970                self.assertIn(delim * 2, fill + delim * 2)
971
972    def test_issue18183(self):
973        '\U00010000\U00100000'.lower()
974        '\U00010000\U00100000'.casefold()
975        '\U00010000\U00100000'.upper()
976        '\U00010000\U00100000'.capitalize()
977        '\U00010000\U00100000'.title()
978        '\U00010000\U00100000'.swapcase()
979        '\U00100000'.center(3, '\U00010000')
980        '\U00100000'.ljust(3, '\U00010000')
981        '\U00100000'.rjust(3, '\U00010000')
982
    def test_format(self):
        # Exercise str.format(): literal text and brace escaping, positional,
        # keyword, attribute and index lookups, string format specs, fill
        # characters, !s/!r/!a conversions, user-defined __format__, nested
        # (computed) specs, and the exceptions raised for malformed input.

        # Literal text and doubled-brace escapes, no replacement fields.
        self.assertEqual(''.format(), '')
        self.assertEqual('a'.format(), 'a')
        self.assertEqual('ab'.format(), 'ab')
        self.assertEqual('a{{'.format(), 'a{')
        self.assertEqual('a}}'.format(), 'a}')
        self.assertEqual('{{b'.format(), '{b')
        self.assertEqual('}}b'.format(), '}b')
        self.assertEqual('a{{b'.format(), 'a{b')

        # examples from the PEP:
        import datetime
        self.assertEqual("My name is {0}".format('Fred'), "My name is Fred")
        self.assertEqual("My name is {0[name]}".format(dict(name='Fred')),
                         "My name is Fred")
        self.assertEqual("My name is {0} :-{{}}".format('Fred'),
                         "My name is Fred :-{}")

        d = datetime.date(2007, 8, 18)
        self.assertEqual("The year is {0.year}".format(d),
                         "The year is 2007")

        # classes we'll use for testing

        # __format__ echoes the format spec back unchanged.
        class C:
            def __init__(self, x=100):
                self._x = x
            def __format__(self, spec):
                return spec

        # __format__ ignores the spec and returns str(self.x).
        class D:
            def __init__(self, x):
                self.x = x
            def __format__(self, spec):
                return str(self.x)

        # class with __str__, but no __format__
        class E:
            def __init__(self, x):
                self.x = x
            def __str__(self):
                return 'E(' + self.x + ')'

        # class with __repr__, but no __format__ or __str__
        class F:
            def __init__(self, x):
                self.x = x
            def __repr__(self):
                return 'F(' + self.x + ')'

        # class with __format__ that forwards to string, for some format_spec's
        class G:
            def __init__(self, x):
                self.x = x
            def __str__(self):
                return "string is " + self.x
            def __format__(self, format_spec):
                if format_spec == 'd':
                    return 'G(' + self.x + ')'
                return object.__format__(self, format_spec)

        # date subclass whose __format__ passes the spec to strftime().
        class I(datetime.date):
            def __format__(self, format_spec):
                return self.strftime(format_spec)

        # int subclass whose __format__ formats twice its value.
        class J(int):
            def __format__(self, format_spec):
                return int.__format__(self * 2, format_spec)

        # class with __repr__ whose __str__ is explicitly set to None
        class M:
            def __init__(self, x):
                self.x = x
            def __repr__(self):
                return 'M(' + self.x + ')'
            __str__ = None

        # class with __repr__ whose __format__ is explicitly set to None
        class N:
            def __init__(self, x):
                self.x = x
            def __repr__(self):
                return 'N(' + self.x + ')'
            __format__ = None

        # Positional fields mixed with literal text and escaped braces.
        self.assertEqual(''.format(), '')
        self.assertEqual('abc'.format(), 'abc')
        self.assertEqual('{0}'.format('abc'), 'abc')
        self.assertEqual('{0:}'.format('abc'), 'abc')
#        self.assertEqual('{ 0 }'.format('abc'), 'abc')
        self.assertEqual('X{0}'.format('abc'), 'Xabc')
        self.assertEqual('{0}X'.format('abc'), 'abcX')
        self.assertEqual('X{0}Y'.format('abc'), 'XabcY')
        self.assertEqual('{1}'.format(1, 'abc'), 'abc')
        self.assertEqual('X{1}'.format(1, 'abc'), 'Xabc')
        self.assertEqual('{1}X'.format(1, 'abc'), 'abcX')
        self.assertEqual('X{1}Y'.format(1, 'abc'), 'XabcY')
        self.assertEqual('{0}'.format(-15), '-15')
        self.assertEqual('{0}{1}'.format(-15, 'abc'), '-15abc')
        self.assertEqual('{0}X{1}'.format(-15, 'abc'), '-15Xabc')
        self.assertEqual('{{'.format(), '{')
        self.assertEqual('}}'.format(), '}')
        self.assertEqual('{{}}'.format(), '{}')
        self.assertEqual('{{x}}'.format(), '{x}')
        self.assertEqual('{{{0}}}'.format(123), '{123}')
        self.assertEqual('{{{{0}}}}'.format(), '{{0}}')
        self.assertEqual('}}{{'.format(), '}{')
        self.assertEqual('}}x{{'.format(), '}x{')

        # weird field names
        self.assertEqual("{0[foo-bar]}".format({'foo-bar':'baz'}), 'baz')
        self.assertEqual("{0[foo bar]}".format({'foo bar':'baz'}), 'baz')
        self.assertEqual("{0[ ]}".format({' ':3}), '3')

        # Attribute and index lookups, including chained ones.
        self.assertEqual('{foo._x}'.format(foo=C(20)), '20')
        self.assertEqual('{1}{0}'.format(D(10), D(20)), '2010')
        self.assertEqual('{0._x.x}'.format(C(D('abc'))), 'abc')
        self.assertEqual('{0[0]}'.format(['abc', 'def']), 'abc')
        self.assertEqual('{0[1]}'.format(['abc', 'def']), 'def')
        self.assertEqual('{0[1][0]}'.format(['abc', ['def']]), 'def')
        self.assertEqual('{0[1][0].x}'.format(['abc', [D('def')]]), 'def')

        # strings
        self.assertEqual('{0:.3s}'.format('abc'), 'abc')
        self.assertEqual('{0:.3s}'.format('ab'), 'ab')
        self.assertEqual('{0:.3s}'.format('abcdef'), 'abc')
        self.assertEqual('{0:.0s}'.format('abcdef'), '')
        self.assertEqual('{0:3.3s}'.format('abc'), 'abc')
        self.assertEqual('{0:2.3s}'.format('abc'), 'abc')
        self.assertEqual('{0:2.2s}'.format('abc'), 'ab')
        self.assertEqual('{0:3.2s}'.format('abc'), 'ab ')
        self.assertEqual('{0:x<0s}'.format('result'), 'result')
        self.assertEqual('{0:x<5s}'.format('result'), 'result')
        self.assertEqual('{0:x<6s}'.format('result'), 'result')
        self.assertEqual('{0:x<7s}'.format('result'), 'resultx')
        self.assertEqual('{0:x<8s}'.format('result'), 'resultxx')
        self.assertEqual('{0: <7s}'.format('result'), 'result ')
        self.assertEqual('{0:<7s}'.format('result'), 'result ')
        self.assertEqual('{0:>7s}'.format('result'), ' result')
        self.assertEqual('{0:>8s}'.format('result'), '  result')
        self.assertEqual('{0:^8s}'.format('result'), ' result ')
        self.assertEqual('{0:^9s}'.format('result'), ' result  ')
        self.assertEqual('{0:^10s}'.format('result'), '  result  ')
        self.assertEqual('{0:8s}'.format('result'), 'result  ')
        self.assertEqual('{0:0s}'.format('result'), 'result')
        self.assertEqual('{0:08s}'.format('result'), 'result00')
        self.assertEqual('{0:<08s}'.format('result'), 'result00')
        self.assertEqual('{0:>08s}'.format('result'), '00result')
        self.assertEqual('{0:^08s}'.format('result'), '0result0')
        self.assertEqual('{0:10000}'.format('a'), 'a' + ' ' * 9999)
        self.assertEqual('{0:10000}'.format(''), ' ' * 10000)
        self.assertEqual('{0:10000000}'.format(''), ' ' * 10000000)

        # issue 12546: use \x00 as a fill character
        self.assertEqual('{0:\x00<6s}'.format('foo'), 'foo\x00\x00\x00')
        self.assertEqual('{0:\x01<6s}'.format('foo'), 'foo\x01\x01\x01')
        self.assertEqual('{0:\x00^6s}'.format('foo'), '\x00foo\x00\x00')
        self.assertEqual('{0:^6s}'.format('foo'), ' foo  ')

        self.assertEqual('{0:\x00<6}'.format(3), '3\x00\x00\x00\x00\x00')
        self.assertEqual('{0:\x01<6}'.format(3), '3\x01\x01\x01\x01\x01')
        self.assertEqual('{0:\x00^6}'.format(3), '\x00\x003\x00\x00\x00')
        self.assertEqual('{0:<6}'.format(3), '3     ')

        self.assertEqual('{0:\x00<6}'.format(3.14), '3.14\x00\x00')
        self.assertEqual('{0:\x01<6}'.format(3.14), '3.14\x01\x01')
        self.assertEqual('{0:\x00^6}'.format(3.14), '\x003.14\x00')
        self.assertEqual('{0:^6}'.format(3.14), ' 3.14 ')

        self.assertEqual('{0:\x00<12}'.format(3+2.0j), '(3+2j)\x00\x00\x00\x00\x00\x00')
        self.assertEqual('{0:\x01<12}'.format(3+2.0j), '(3+2j)\x01\x01\x01\x01\x01\x01')
        self.assertEqual('{0:\x00^12}'.format(3+2.0j), '\x00\x00\x00(3+2j)\x00\x00\x00')
        self.assertEqual('{0:^12}'.format(3+2.0j), '   (3+2j)   ')

        # format specifiers for user defined type
        self.assertEqual('{0:abc}'.format(C()), 'abc')

        # !r, !s and !a coercions
        self.assertEqual('{0!s}'.format('Hello'), 'Hello')
        self.assertEqual('{0!s:}'.format('Hello'), 'Hello')
        self.assertEqual('{0!s:15}'.format('Hello'), 'Hello          ')
        self.assertEqual('{0!s:15s}'.format('Hello'), 'Hello          ')
        self.assertEqual('{0!r}'.format('Hello'), "'Hello'")
        self.assertEqual('{0!r:}'.format('Hello'), "'Hello'")
        self.assertEqual('{0!r}'.format(F('Hello')), 'F(Hello)')
        self.assertEqual('{0!r}'.format('\u0378'), "'\\u0378'") # nonprintable
        self.assertEqual('{0!r}'.format('\u0374'), "'\u0374'")  # printable
        self.assertEqual('{0!r}'.format(F('\u0374')), 'F(\u0374)')
        self.assertEqual('{0!a}'.format('Hello'), "'Hello'")
        self.assertEqual('{0!a}'.format('\u0378'), "'\\u0378'") # nonprintable
        self.assertEqual('{0!a}'.format('\u0374'), "'\\u0374'") # printable
        self.assertEqual('{0!a:}'.format('Hello'), "'Hello'")
        self.assertEqual('{0!a}'.format(F('Hello')), 'F(Hello)')
        self.assertEqual('{0!a}'.format(F('\u0374')), 'F(\\u0374)')

        # test fallback to object.__format__
        self.assertEqual('{0}'.format({}), '{}')
        self.assertEqual('{0}'.format([]), '[]')
        self.assertEqual('{0}'.format([1]), '[1]')

        self.assertEqual('{0:d}'.format(G('data')), 'G(data)')
        self.assertEqual('{0!s}'.format(G('data')), 'string is data')

        # A non-empty spec reaching object.__format__ raises TypeError.
        self.assertRaises(TypeError, '{0:^10}'.format, E('data'))
        self.assertRaises(TypeError, '{0:^10s}'.format, E('data'))
        self.assertRaises(TypeError, '{0:>15s}'.format, G('data'))

        self.assertEqual("{0:date: %Y-%m-%d}".format(I(year=2007,
                                                       month=8,
                                                       day=27)),
                         "date: 2007-08-27")

        # test deriving from a builtin type and overriding __format__
        self.assertEqual("{0}".format(J(10)), "20")


        # string format specifiers
        self.assertEqual('{0:}'.format('a'), 'a')

        # computed format specifiers
        self.assertEqual("{0:.{1}}".format('hello world', 5), 'hello')
        self.assertEqual("{0:.{1}s}".format('hello world', 5), 'hello')
        self.assertEqual("{0:.{precision}s}".format('hello world', precision=5), 'hello')
        self.assertEqual("{0:{width}.{precision}s}".format('hello world', width=10, precision=5), 'hello     ')
        self.assertEqual("{0:{width}.{precision}s}".format('hello world', width='10', precision='5'), 'hello     ')

        # test various errors
        self.assertRaises(ValueError, '{'.format)
        self.assertRaises(ValueError, '}'.format)
        self.assertRaises(ValueError, 'a{'.format)
        self.assertRaises(ValueError, 'a}'.format)
        self.assertRaises(ValueError, '{a'.format)
        self.assertRaises(ValueError, '}a'.format)
        self.assertRaises(IndexError, '{0}'.format)
        self.assertRaises(IndexError, '{1}'.format, 'abc')
        self.assertRaises(KeyError,   '{x}'.format)
        self.assertRaises(ValueError, "}{".format)
        self.assertRaises(ValueError, "abc{0:{}".format)
        self.assertRaises(ValueError, "{0".format)
        self.assertRaises(IndexError, "{0.}".format)
        self.assertRaises(ValueError, "{0.}".format, 0)
        self.assertRaises(ValueError, "{0[}".format)
        self.assertRaises(ValueError, "{0[}".format, [])
        self.assertRaises(KeyError,   "{0]}".format)
        self.assertRaises(ValueError, "{0.[]}".format, 0)
        self.assertRaises(ValueError, "{0..foo}".format, 0)
        self.assertRaises(ValueError, "{0[0}".format, 0)
        self.assertRaises(ValueError, "{0[0:foo}".format, 0)
        self.assertRaises(KeyError,   "{c]}".format)
        self.assertRaises(ValueError, "{{ {{{0}}".format, 0)
        self.assertRaises(ValueError, "{0}}".format, 0)
        self.assertRaises(KeyError,   "{foo}".format, bar=3)
        self.assertRaises(ValueError, "{0!x}".format, 3)
        self.assertRaises(ValueError, "{0!}".format, 0)
        self.assertRaises(ValueError, "{0!rs}".format, 0)
        self.assertRaises(ValueError, "{!}".format)
        self.assertRaises(IndexError, "{:}".format)
        self.assertRaises(IndexError, "{:s}".format)
        self.assertRaises(IndexError, "{}".format)
        # A very long digit string used as a field index / element index.
        big = "23098475029384702983476098230754973209482573"
        self.assertRaises(ValueError, ("{" + big + "}").format)
        self.assertRaises(ValueError, ("{[" + big + "]}").format, [0])

        # issue 6089
        self.assertRaises(ValueError, "{0[0]x}".format, [None])
        self.assertRaises(ValueError, "{0[0](10)}".format, [None])

        # can't have a replacement on the field name portion
        self.assertRaises(TypeError, '{0[{1}]}'.format, 'abcdefg', 4)

        # exceed maximum recursion depth
        self.assertRaises(ValueError, "{0:{1:{2}}}".format, 'abc', 's', '')
        self.assertRaises(ValueError, "{0:{1:{2:{3:{4:{5:{6}}}}}}}".format,
                          0, 1, 2, 3, 4, 5, 6, 7)

        # string format spec errors
        sign_msg = "Sign not allowed in string format specifier"
        self.assertRaisesRegex(ValueError, sign_msg, "{0:-s}".format, '')
        self.assertRaisesRegex(ValueError, sign_msg, format, "", "-")
        space_msg = "Space not allowed in string format specifier"
        self.assertRaisesRegex(ValueError, space_msg, "{: }".format, '')
        self.assertRaises(ValueError, "{0:=s}".format, '')

        # Alternate formatting is not supported
        self.assertRaises(ValueError, format, '', '#')
        self.assertRaises(ValueError, format, '', '#20')

        # Non-ASCII
        self.assertEqual("{0:s}{1:s}".format("ABC", "\u0410\u0411\u0412"),
                         'ABC\u0410\u0411\u0412')
        self.assertEqual("{0:.3s}".format("ABC\u0410\u0411\u0412"),
                         'ABC')
        self.assertEqual("{0:.0s}".format("ABC\u0410\u0411\u0412"),
                         '')

        # Braces and other special characters inside an index key.
        self.assertEqual("{[{}]}".format({"{}": 5}), "5")
        self.assertEqual("{[{}]}".format({"{}" : "a"}), "a")
        self.assertEqual("{[{]}".format({"{" : "a"}), "a")
        self.assertEqual("{[}]}".format({"}" : "a"}), "a")
        self.assertEqual("{[[]}".format({"[" : "a"}), "a")
        self.assertEqual("{[!]}".format({"!" : "a"}), "a")
        self.assertRaises(ValueError, "{a{}b}".format, 42)
        self.assertRaises(ValueError, "{a{b}".format, 42)
        self.assertRaises(ValueError, "{[}".format, 42)

        # Auto-numbered fields, one of them nested inside the format spec.
        self.assertEqual("0x{:0{:d}X}".format(0x0,16), "0x0000000000000000")

        # Blocking fallback
        m = M('data')
        self.assertEqual("{!r}".format(m), 'M(data)')
        self.assertRaises(TypeError, "{!s}".format, m)
        self.assertRaises(TypeError, "{}".format, m)
        n = N('data')
        self.assertEqual("{!r}".format(n), 'N(data)')
        self.assertEqual("{!s}".format(n), 'N(data)')
        self.assertRaises(TypeError, "{}".format, n)
1296
1297    def test_format_map(self):
1298        self.assertEqual(''.format_map({}), '')
1299        self.assertEqual('a'.format_map({}), 'a')
1300        self.assertEqual('ab'.format_map({}), 'ab')
1301        self.assertEqual('a{{'.format_map({}), 'a{')
1302        self.assertEqual('a}}'.format_map({}), 'a}')
1303        self.assertEqual('{{b'.format_map({}), '{b')
1304        self.assertEqual('}}b'.format_map({}), '}b')
1305        self.assertEqual('a{{b'.format_map({}), 'a{b')
1306
1307        # using mappings
1308        class Mapping(dict):
1309            def __missing__(self, key):
1310                return key
1311        self.assertEqual('{hello}'.format_map(Mapping()), 'hello')
1312        self.assertEqual('{a} {world}'.format_map(Mapping(a='hello')), 'hello world')
1313
1314        class InternalMapping:
1315            def __init__(self):
1316                self.mapping = {'a': 'hello'}
1317            def __getitem__(self, key):
1318                return self.mapping[key]
1319        self.assertEqual('{a}'.format_map(InternalMapping()), 'hello')
1320
1321
1322        class C:
1323            def __init__(self, x=100):
1324                self._x = x
1325            def __format__(self, spec):
1326                return spec
1327        self.assertEqual('{foo._x}'.format_map({'foo': C(20)}), '20')
1328
1329        # test various errors
1330        self.assertRaises(TypeError, ''.format_map)
1331        self.assertRaises(TypeError, 'a'.format_map)
1332
1333        self.assertRaises(ValueError, '{'.format_map, {})
1334        self.assertRaises(ValueError, '}'.format_map, {})
1335        self.assertRaises(ValueError, 'a{'.format_map, {})
1336        self.assertRaises(ValueError, 'a}'.format_map, {})
1337        self.assertRaises(ValueError, '{a'.format_map, {})
1338        self.assertRaises(ValueError, '}a'.format_map, {})
1339
1340        # issue #12579: can't supply positional params to format_map
1341        self.assertRaises(ValueError, '{}'.format_map, {'a' : 2})
1342        self.assertRaises(ValueError, '{}'.format_map, 'a')
1343        self.assertRaises(ValueError, '{a} {}'.format_map, {"a" : 2, "b" : 1})
1344
1345        class BadMapping:
1346            def __getitem__(self, key):
1347                return 1/0
1348        self.assertRaises(KeyError, '{a}'.format_map, {})
1349        self.assertRaises(TypeError, '{a}'.format_map, [])
1350        self.assertRaises(ZeroDivisionError, '{a}'.format_map, BadMapping())
1351
1352    def test_format_huge_precision(self):
1353        format_string = ".{}f".format(sys.maxsize + 1)
1354        with self.assertRaises(ValueError):
1355            result = format(2.34, format_string)
1356
1357    def test_format_huge_width(self):
1358        format_string = "{}f".format(sys.maxsize + 1)
1359        with self.assertRaises(ValueError):
1360            result = format(2.34, format_string)
1361
1362    def test_format_huge_item_number(self):
1363        format_string = "{{{}:.6f}}".format(sys.maxsize + 1)
1364        with self.assertRaises(ValueError):
1365            result = format_string.format(2.34)
1366
1367    def test_format_auto_numbering(self):
1368        class C:
1369            def __init__(self, x=100):
1370                self._x = x
1371            def __format__(self, spec):
1372                return spec
1373
1374        self.assertEqual('{}'.format(10), '10')
1375        self.assertEqual('{:5}'.format('s'), 's    ')
1376        self.assertEqual('{!r}'.format('s'), "'s'")
1377        self.assertEqual('{._x}'.format(C(10)), '10')
1378        self.assertEqual('{[1]}'.format([1, 2]), '2')
1379        self.assertEqual('{[a]}'.format({'a':4, 'b':2}), '4')
1380        self.assertEqual('a{}b{}c'.format(0, 1), 'a0b1c')
1381
1382        self.assertEqual('a{:{}}b'.format('x', '^10'), 'a    x     b')
1383        self.assertEqual('a{:{}x}b'.format(20, '#'), 'a0x14b')
1384
1385        # can't mix and match numbering and auto-numbering
1386        self.assertRaises(ValueError, '{}{1}'.format, 1, 2)
1387        self.assertRaises(ValueError, '{1}{}'.format, 1, 2)
1388        self.assertRaises(ValueError, '{:{1}}'.format, 1, 2)
1389        self.assertRaises(ValueError, '{0:{}}'.format, 1, 2)
1390
1391        # can mix and match auto-numbering and named
1392        self.assertEqual('{f}{}'.format(4, f='test'), 'test4')
1393        self.assertEqual('{}{f}'.format(4, f='test'), '4test')
1394        self.assertEqual('{:{f}}{g}{}'.format(1, 3, g='g', f=2), ' 1g3')
1395        self.assertEqual('{f:{}}{}{g}'.format(2, 4, f=1, g='g'), ' 14g')
1396
1397    def test_formatting(self):
1398        string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
1399        # Testing Unicode formatting strings...
1400        self.assertEqual("%s, %s" % ("abc", "abc"), 'abc, abc')
1401        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, 2, 3), 'abc, abc, 1, 2.000000,  3.00')
1402        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, -2, 3), 'abc, abc, 1, -2.000000,  3.00')
1403        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.5), 'abc, abc, -1, -2.000000,  3.50')
1404        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.57), 'abc, abc, -1, -2.000000,  3.57')
1405        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 1003.57), 'abc, abc, -1, -2.000000, 1003.57')
1406        if not sys.platform.startswith('java'):
1407            self.assertEqual("%r, %r" % (b"abc", "abc"), "b'abc', 'abc'")
1408            self.assertEqual("%r" % ("\u1234",), "'\u1234'")
1409            self.assertEqual("%a" % ("\u1234",), "'\\u1234'")
1410        self.assertEqual("%(x)s, %(y)s" % {'x':"abc", 'y':"def"}, 'abc, def')
1411        self.assertEqual("%(x)s, %(\xfc)s" % {'x':"abc", '\xfc':"def"}, 'abc, def')
1412
1413        self.assertEqual('%c' % 0x1234, '\u1234')
1414        self.assertEqual('%c' % 0x21483, '\U00021483')
1415        self.assertRaises(OverflowError, "%c".__mod__, (0x110000,))
1416        self.assertEqual('%c' % '\U00021483', '\U00021483')
1417        self.assertRaises(TypeError, "%c".__mod__, "aa")
1418        self.assertRaises(ValueError, "%.1\u1032f".__mod__, (1.0/3))
1419        self.assertRaises(TypeError, "%i".__mod__, "aa")
1420
1421        # formatting jobs delegated from the string implementation:
1422        self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1423        self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1424        self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1425        self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1426        self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123},  '...abc...')
1427        self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...')
1428        self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,"abc"), '...1...2...3...abc...')
1429        self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,"abc"), '...%...%s...1...2...3...abc...')
1430        self.assertEqual('...%s...' % "abc", '...abc...')
1431        self.assertEqual('%*s' % (5,'abc',), '  abc')
1432        self.assertEqual('%*s' % (-5,'abc',), 'abc  ')
1433        self.assertEqual('%*.*s' % (5,2,'abc',), '   ab')
1434        self.assertEqual('%*.*s' % (5,3,'abc',), '  abc')
1435        self.assertEqual('%i %*.*s' % (10, 5,3,'abc',), '10   abc')
1436        self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, 'abc',), '103   abc')
1437        self.assertEqual('%c' % 'a', 'a')
1438        class Wrapper:
1439            def __str__(self):
1440                return '\u1234'
1441        self.assertEqual('%s' % Wrapper(), '\u1234')
1442
1443        # issue 3382
1444        NAN = float('nan')
1445        INF = float('inf')
1446        self.assertEqual('%f' % NAN, 'nan')
1447        self.assertEqual('%F' % NAN, 'NAN')
1448        self.assertEqual('%f' % INF, 'inf')
1449        self.assertEqual('%F' % INF, 'INF')
1450
1451        # PEP 393
1452        self.assertEqual('%.1s' % "a\xe9\u20ac", 'a')
1453        self.assertEqual('%.2s' % "a\xe9\u20ac", 'a\xe9')
1454
1455        #issue 19995
1456        class PseudoInt:
1457            def __init__(self, value):
1458                self.value = int(value)
1459            def __int__(self):
1460                return self.value
1461            def __index__(self):
1462                return self.value
1463        class PseudoFloat:
1464            def __init__(self, value):
1465                self.value = float(value)
1466            def __int__(self):
1467                return int(self.value)
1468        pi = PseudoFloat(3.1415)
1469        letter_m = PseudoInt(109)
1470        self.assertEqual('%x' % 42, '2a')
1471        self.assertEqual('%X' % 15, 'F')
1472        self.assertEqual('%o' % 9, '11')
1473        self.assertEqual('%c' % 109, 'm')
1474        self.assertEqual('%x' % letter_m, '6d')
1475        self.assertEqual('%X' % letter_m, '6D')
1476        self.assertEqual('%o' % letter_m, '155')
1477        self.assertEqual('%c' % letter_m, 'm')
1478        self.assertRaisesRegex(TypeError, '%x format: an integer is required, not float', operator.mod, '%x', 3.14),
1479        self.assertRaisesRegex(TypeError, '%X format: an integer is required, not float', operator.mod, '%X', 2.11),
1480        self.assertRaisesRegex(TypeError, '%o format: an integer is required, not float', operator.mod, '%o', 1.79),
1481        self.assertRaisesRegex(TypeError, '%x format: an integer is required, not PseudoFloat', operator.mod, '%x', pi),
1482        self.assertRaises(TypeError, operator.mod, '%c', pi),
1483
1484    def test_formatting_with_enum(self):
1485        # issue18780
1486        import enum
1487        class Float(float, enum.Enum):
1488            PI = 3.1415926
1489        class Int(enum.IntEnum):
1490            IDES = 15
1491        class Str(str, enum.Enum):
1492            ABC = 'abc'
1493        # Testing Unicode formatting strings...
1494        self.assertEqual("%s, %s" % (Str.ABC, Str.ABC),
1495                         'Str.ABC, Str.ABC')
1496        self.assertEqual("%s, %s, %d, %i, %u, %f, %5.2f" %
1497                        (Str.ABC, Str.ABC,
1498                         Int.IDES, Int.IDES, Int.IDES,
1499                         Float.PI, Float.PI),
1500                         'Str.ABC, Str.ABC, 15, 15, 15, 3.141593,  3.14')
1501
1502        # formatting jobs delegated from the string implementation:
1503        self.assertEqual('...%(foo)s...' % {'foo':Str.ABC},
1504                         '...Str.ABC...')
1505        self.assertEqual('...%(foo)s...' % {'foo':Int.IDES},
1506                         '...Int.IDES...')
1507        self.assertEqual('...%(foo)i...' % {'foo':Int.IDES},
1508                         '...15...')
1509        self.assertEqual('...%(foo)d...' % {'foo':Int.IDES},
1510                         '...15...')
1511        self.assertEqual('...%(foo)u...' % {'foo':Int.IDES, 'def':Float.PI},
1512                         '...15...')
1513        self.assertEqual('...%(foo)f...' % {'foo':Float.PI,'def':123},
1514                         '...3.141593...')
1515
1516    def test_formatting_huge_precision(self):
1517        format_string = "%.{}f".format(sys.maxsize + 1)
1518        with self.assertRaises(ValueError):
1519            result = format_string % 2.34
1520
1521    def test_issue28598_strsubclass_rhs(self):
1522        # A subclass of str with an __rmod__ method should be able to hook
1523        # into the % operator
1524        class SubclassedStr(str):
1525            def __rmod__(self, other):
1526                return 'Success, self.__rmod__({!r}) was called'.format(other)
1527        self.assertEqual('lhs %% %r' % SubclassedStr('rhs'),
1528                         "Success, self.__rmod__('lhs %% %r') was called")
1529
1530    @support.cpython_only
1531    def test_formatting_huge_precision_c_limits(self):
1532        from _testcapi import INT_MAX
1533        format_string = "%.{}f".format(INT_MAX + 1)
1534        with self.assertRaises(ValueError):
1535            result = format_string % 2.34
1536
1537    def test_formatting_huge_width(self):
1538        format_string = "%{}f".format(sys.maxsize + 1)
1539        with self.assertRaises(ValueError):
1540            result = format_string % 2.34
1541
1542    def test_startswith_endswith_errors(self):
1543        for meth in ('foo'.startswith, 'foo'.endswith):
1544            with self.assertRaises(TypeError) as cm:
1545                meth(['f'])
1546            exc = str(cm.exception)
1547            self.assertIn('str', exc)
1548            self.assertIn('tuple', exc)
1549
1550    @support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
1551    def test_format_float(self):
1552        # should not format with a comma, but always with C locale
1553        self.assertEqual('1.0', '%.1f' % 1.0)
1554
1555    def test_constructor(self):
1556        # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
1557
1558        self.assertEqual(
1559            str('unicode remains unicode'),
1560            'unicode remains unicode'
1561        )
1562
1563        for text in ('ascii', '\xe9', '\u20ac', '\U0010FFFF'):
1564            subclass = StrSubclass(text)
1565            self.assertEqual(str(subclass), text)
1566            self.assertEqual(len(subclass), len(text))
1567            if text == 'ascii':
1568                self.assertEqual(subclass.encode('ascii'), b'ascii')
1569                self.assertEqual(subclass.encode('utf-8'), b'ascii')
1570
1571        self.assertEqual(
1572            str('strings are converted to unicode'),
1573            'strings are converted to unicode'
1574        )
1575
1576        class StringCompat:
1577            def __init__(self, x):
1578                self.x = x
1579            def __str__(self):
1580                return self.x
1581
1582        self.assertEqual(
1583            str(StringCompat('__str__ compatible objects are recognized')),
1584            '__str__ compatible objects are recognized'
1585        )
1586
1587        # unicode(obj) is compatible to str():
1588
1589        o = StringCompat('unicode(obj) is compatible to str()')
1590        self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
1591        self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
1592
1593        for obj in (123, 123.45, 123):
1594            self.assertEqual(str(obj), str(str(obj)))
1595
1596        # unicode(obj, encoding, error) tests (this maps to
1597        # PyUnicode_FromEncodedObject() at C level)
1598
1599        if not sys.platform.startswith('java'):
1600            self.assertRaises(
1601                TypeError,
1602                str,
1603                'decoding unicode is not supported',
1604                'utf-8',
1605                'strict'
1606            )
1607
1608        self.assertEqual(
1609            str(b'strings are decoded to unicode', 'utf-8', 'strict'),
1610            'strings are decoded to unicode'
1611        )
1612
1613        if not sys.platform.startswith('java'):
1614            self.assertEqual(
1615                str(
1616                    memoryview(b'character buffers are decoded to unicode'),
1617                    'utf-8',
1618                    'strict'
1619                ),
1620                'character buffers are decoded to unicode'
1621            )
1622
1623        self.assertRaises(TypeError, str, 42, 42, 42)
1624
1625    def test_constructor_keyword_args(self):
1626        """Pass various keyword argument combinations to the constructor."""
1627        # The object argument can be passed as a keyword.
1628        self.assertEqual(str(object='foo'), 'foo')
1629        self.assertEqual(str(object=b'foo', encoding='utf-8'), 'foo')
1630        # The errors argument without encoding triggers "decode" mode.
1631        self.assertEqual(str(b'foo', errors='strict'), 'foo')  # not "b'foo'"
1632        self.assertEqual(str(object=b'foo', errors='strict'), 'foo')
1633
1634    def test_constructor_defaults(self):
1635        """Check the constructor argument defaults."""
1636        # The object argument defaults to '' or b''.
1637        self.assertEqual(str(), '')
1638        self.assertEqual(str(errors='strict'), '')
1639        utf8_cent = '¢'.encode('utf-8')
1640        # The encoding argument defaults to utf-8.
1641        self.assertEqual(str(utf8_cent, errors='strict'), '¢')
1642        # The errors argument defaults to strict.
1643        self.assertRaises(UnicodeDecodeError, str, utf8_cent, encoding='ascii')
1644
1645    def test_codecs_utf7(self):
1646        utfTests = [
1647            ('A\u2262\u0391.', b'A+ImIDkQ.'),             # RFC2152 example
1648            ('Hi Mom -\u263a-!', b'Hi Mom -+Jjo--!'),     # RFC2152 example
1649            ('\u65E5\u672C\u8A9E', b'+ZeVnLIqe-'),        # RFC2152 example
1650            ('Item 3 is \u00a31.', b'Item 3 is +AKM-1.'), # RFC2152 example
1651            ('+', b'+-'),
1652            ('+-', b'+--'),
1653            ('+?', b'+-?'),
1654            (r'\?', b'+AFw?'),
1655            ('+?', b'+-?'),
1656            (r'\\?', b'+AFwAXA?'),
1657            (r'\\\?', b'+AFwAXABc?'),
1658            (r'++--', b'+-+---'),
1659            ('\U000abcde', b'+2m/c3g-'),                  # surrogate pairs
1660            ('/', b'/'),
1661        ]
1662
1663        for (x, y) in utfTests:
1664            self.assertEqual(x.encode('utf-7'), y)
1665
1666        # Unpaired surrogates are passed through
1667        self.assertEqual('\uD801'.encode('utf-7'), b'+2AE-')
1668        self.assertEqual('\uD801x'.encode('utf-7'), b'+2AE-x')
1669        self.assertEqual('\uDC01'.encode('utf-7'), b'+3AE-')
1670        self.assertEqual('\uDC01x'.encode('utf-7'), b'+3AE-x')
1671        self.assertEqual(b'+2AE-'.decode('utf-7'), '\uD801')
1672        self.assertEqual(b'+2AE-x'.decode('utf-7'), '\uD801x')
1673        self.assertEqual(b'+3AE-'.decode('utf-7'), '\uDC01')
1674        self.assertEqual(b'+3AE-x'.decode('utf-7'), '\uDC01x')
1675
1676        self.assertEqual('\uD801\U000abcde'.encode('utf-7'), b'+2AHab9ze-')
1677        self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde')
1678
1679        # Issue #2242: crash on some Windows/MSVC versions
1680        self.assertEqual(b'+\xc1'.decode('utf-7', 'ignore'), '')
1681
1682        # Direct encoded characters
1683        set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
1684        # Optional direct characters
1685        set_o = '!"#$%&*;<=>@[]^_`{|}'
1686        for c in set_d:
1687            self.assertEqual(c.encode('utf7'), c.encode('ascii'))
1688            self.assertEqual(c.encode('ascii').decode('utf7'), c)
1689        for c in set_o:
1690            self.assertEqual(c.encode('ascii').decode('utf7'), c)
1691
1692        with self.assertRaisesRegex(UnicodeDecodeError,
1693                                    'ill-formed sequence'):
1694            b'+@'.decode('utf-7')
1695
1696    def test_codecs_utf8(self):
1697        self.assertEqual(''.encode('utf-8'), b'')
1698        self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac')
1699        self.assertEqual('\U00010002'.encode('utf-8'), b'\xf0\x90\x80\x82')
1700        self.assertEqual('\U00023456'.encode('utf-8'), b'\xf0\xa3\x91\x96')
1701        self.assertEqual('\ud800'.encode('utf-8', 'surrogatepass'), b'\xed\xa0\x80')
1702        self.assertEqual('\udc00'.encode('utf-8', 'surrogatepass'), b'\xed\xb0\x80')
1703        self.assertEqual(('\U00010002'*10).encode('utf-8'),
1704                         b'\xf0\x90\x80\x82'*10)
1705        self.assertEqual(
1706            '\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
1707            '\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
1708            '\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
1709            '\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
1710            '\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
1711            ' Nunstuck git und'.encode('utf-8'),
1712            b'\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
1713            b'\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
1714            b'\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
1715            b'\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
1716            b'\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
1717            b'\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
1718            b'\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
1719            b'\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
1720            b'\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
1721            b'\xe3\x80\x8cWenn ist das Nunstuck git und'
1722        )
1723
1724        # UTF-8 specific decoding tests
1725        self.assertEqual(str(b'\xf0\xa3\x91\x96', 'utf-8'), '\U00023456' )
1726        self.assertEqual(str(b'\xf0\x90\x80\x82', 'utf-8'), '\U00010002' )
1727        self.assertEqual(str(b'\xe2\x82\xac', 'utf-8'), '\u20ac' )
1728
1729        # Other possible utf-8 test cases:
1730        # * strict decoding testing for all of the
1731        #   UTF8_ERROR cases in PyUnicode_DecodeUTF8
1732
1733    def test_utf8_decode_valid_sequences(self):
1734        sequences = [
1735            # single byte
1736            (b'\x00', '\x00'), (b'a', 'a'), (b'\x7f', '\x7f'),
1737            # 2 bytes
1738            (b'\xc2\x80', '\x80'), (b'\xdf\xbf', '\u07ff'),
1739            # 3 bytes
1740            (b'\xe0\xa0\x80', '\u0800'), (b'\xed\x9f\xbf', '\ud7ff'),
1741            (b'\xee\x80\x80', '\uE000'), (b'\xef\xbf\xbf', '\uffff'),
1742            # 4 bytes
1743            (b'\xF0\x90\x80\x80', '\U00010000'),
1744            (b'\xf4\x8f\xbf\xbf', '\U0010FFFF')
1745        ]
1746        for seq, res in sequences:
1747            self.assertEqual(seq.decode('utf-8'), res)
1748
1749
1750    def test_utf8_decode_invalid_sequences(self):
1751        # continuation bytes in a sequence of 2, 3, or 4 bytes
1752        continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
1753        # start bytes of a 2-byte sequence equivalent to code points < 0x7F
1754        invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)]
1755        # start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF
1756        invalid_4B_seq_start_bytes = [bytes([x]) for x in range(0xF5, 0xF8)]
1757        invalid_start_bytes = (
1758            continuation_bytes + invalid_2B_seq_start_bytes +
1759            invalid_4B_seq_start_bytes + [bytes([x]) for x in range(0xF7, 0x100)]
1760        )
1761
1762        for byte in invalid_start_bytes:
1763            self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8')
1764
1765        for sb in invalid_2B_seq_start_bytes:
1766            for cb in continuation_bytes:
1767                self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8')
1768
1769        for sb in invalid_4B_seq_start_bytes:
1770            for cb1 in continuation_bytes[:3]:
1771                for cb3 in continuation_bytes[:3]:
1772                    self.assertRaises(UnicodeDecodeError,
1773                                      (sb+cb1+b'\x80'+cb3).decode, 'utf-8')
1774
1775        for cb in [bytes([x]) for x in range(0x80, 0xA0)]:
1776            self.assertRaises(UnicodeDecodeError,
1777                              (b'\xE0'+cb+b'\x80').decode, 'utf-8')
1778            self.assertRaises(UnicodeDecodeError,
1779                              (b'\xE0'+cb+b'\xBF').decode, 'utf-8')
1780        # surrogates
1781        for cb in [bytes([x]) for x in range(0xA0, 0xC0)]:
1782            self.assertRaises(UnicodeDecodeError,
1783                              (b'\xED'+cb+b'\x80').decode, 'utf-8')
1784            self.assertRaises(UnicodeDecodeError,
1785                              (b'\xED'+cb+b'\xBF').decode, 'utf-8')
1786        for cb in [bytes([x]) for x in range(0x80, 0x90)]:
1787            self.assertRaises(UnicodeDecodeError,
1788                              (b'\xF0'+cb+b'\x80\x80').decode, 'utf-8')
1789            self.assertRaises(UnicodeDecodeError,
1790                              (b'\xF0'+cb+b'\xBF\xBF').decode, 'utf-8')
1791        for cb in [bytes([x]) for x in range(0x90, 0xC0)]:
1792            self.assertRaises(UnicodeDecodeError,
1793                              (b'\xF4'+cb+b'\x80\x80').decode, 'utf-8')
1794            self.assertRaises(UnicodeDecodeError,
1795                              (b'\xF4'+cb+b'\xBF\xBF').decode, 'utf-8')
1796
1797    def test_issue8271(self):
1798        # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
1799        # only the start byte and the continuation byte(s) are now considered
1800        # invalid, instead of the number of bytes specified by the start byte.
1801        # See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
1802        # table 3-8, Row 2) for more information about the algorithm used.
1803        FFFD = '\ufffd'
1804        sequences = [
1805            # invalid start bytes
1806            (b'\x80', FFFD), # continuation byte
1807            (b'\x80\x80', FFFD*2), # 2 continuation bytes
1808            (b'\xc0', FFFD),
1809            (b'\xc0\xc0', FFFD*2),
1810            (b'\xc1', FFFD),
1811            (b'\xc1\xc0', FFFD*2),
1812            (b'\xc0\xc1', FFFD*2),
1813            # with start byte of a 2-byte sequence
1814            (b'\xc2', FFFD), # only the start byte
1815            (b'\xc2\xc2', FFFD*2), # 2 start bytes
1816            (b'\xc2\xc2\xc2', FFFD*3), # 3 start bytes
1817            (b'\xc2\x41', FFFD+'A'), # invalid continuation byte
1818            # with start byte of a 3-byte sequence
1819            (b'\xe1', FFFD), # only the start byte
1820            (b'\xe1\xe1', FFFD*2), # 2 start bytes
1821            (b'\xe1\xe1\xe1', FFFD*3), # 3 start bytes
1822            (b'\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
1823            (b'\xe1\x80', FFFD), # only 1 continuation byte
1824            (b'\xe1\x41', FFFD+'A'), # invalid continuation byte
1825            (b'\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
1826            (b'\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
1827            (b'\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
1828            (b'\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
1829            (b'\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
1830            # with start byte of a 4-byte sequence
1831            (b'\xf1', FFFD), # only the start byte
1832            (b'\xf1\xf1', FFFD*2), # 2 start bytes
1833            (b'\xf1\xf1\xf1', FFFD*3), # 3 start bytes
1834            (b'\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
1835            (b'\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
1836            (b'\xf1\x80', FFFD), # only 1 continuation bytes
1837            (b'\xf1\x80\x80', FFFD), # only 2 continuation bytes
1838            (b'\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
1839            (b'\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 1 invalid
1840            (b'\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
1841            (b'\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cv and 1 valid
1842            (b'\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 invalid
1843            (b'\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 invalid
1844            (b'\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 1 valid cb and 1 invalid
1845            (b'\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
1846            (b'\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
1847            (b'\xf1\xf1\x80\x41', FFFD*2+'A'),
1848            (b'\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
1849            # with invalid start byte of a 4-byte sequence (rfc2279)
1850            (b'\xf5', FFFD), # only the start byte
1851            (b'\xf5\xf5', FFFD*2), # 2 start bytes
1852            (b'\xf5\x80', FFFD*2), # only 1 continuation byte
1853            (b'\xf5\x80\x80', FFFD*3), # only 2 continuation byte
1854            (b'\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
1855            (b'\xf5\x80\x41', FFFD*2+'A'), #  1 valid cb and 1 invalid
1856            (b'\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
1857            (b'\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
1858            # with invalid start byte of a 5-byte sequence (rfc2279)
1859            (b'\xf8', FFFD), # only the start byte
1860            (b'\xf8\xf8', FFFD*2), # 2 start bytes
1861            (b'\xf8\x80', FFFD*2), # only one continuation byte
1862            (b'\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
1863            (b'\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
1864            # with invalid start byte of a 6-byte sequence (rfc2279)
1865            (b'\xfc', FFFD), # only the start byte
1866            (b'\xfc\xfc', FFFD*2), # 2 start bytes
1867            (b'\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
1868            (b'\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes
1869            # invalid start byte
1870            (b'\xfe', FFFD),
1871            (b'\xfe\x80\x80', FFFD*3),
1872            # other sequences
1873            (b'\xf1\x80\x41\x42\x43', '\ufffd\x41\x42\x43'),
1874            (b'\xf1\x80\xff\x42\x43', '\ufffd\ufffd\x42\x43'),
1875            (b'\xf1\x80\xc2\x81\x43', '\ufffd\x81\x43'),
1876            (b'\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
1877             '\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
1878        ]
1879        for n, (seq, res) in enumerate(sequences):
1880            self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
1881            self.assertEqual(seq.decode('utf-8', 'replace'), res)
1882            self.assertEqual((seq+b'b').decode('utf-8', 'replace'), res+'b')
1883            self.assertEqual(seq.decode('utf-8', 'ignore'),
1884                             res.replace('\uFFFD', ''))
1885
1886    def assertCorrectUTF8Decoding(self, seq, res, err):
1887        """
1888        Check that an invalid UTF-8 sequence raises a UnicodeDecodeError when
1889        'strict' is used, returns res when 'replace' is used, and that doesn't
1890        return anything when 'ignore' is used.
1891        """
1892        with self.assertRaises(UnicodeDecodeError) as cm:
1893            seq.decode('utf-8')
1894        exc = cm.exception
1895
1896        self.assertIn(err, str(exc))
1897        self.assertEqual(seq.decode('utf-8', 'replace'), res)
1898        self.assertEqual((b'aaaa' + seq + b'bbbb').decode('utf-8', 'replace'),
1899                         'aaaa' + res + 'bbbb')
1900        res = res.replace('\ufffd', '')
1901        self.assertEqual(seq.decode('utf-8', 'ignore'), res)
1902        self.assertEqual((b'aaaa' + seq + b'bbbb').decode('utf-8', 'ignore'),
1903                          'aaaa' + res + 'bbbb')
1904
1905    def test_invalid_start_byte(self):
1906        """
1907        Test that an 'invalid start byte' error is raised when the first byte
1908        is not in the ASCII range or is not a valid start byte of a 2-, 3-, or
1909        4-bytes sequence. The invalid start byte is replaced with a single
1910        U+FFFD when errors='replace'.
1911        E.g. <80> is a continuation byte and can appear only after a start byte.
1912        """
1913        FFFD = '\ufffd'
1914        for byte in b'\x80\xA0\x9F\xBF\xC0\xC1\xF5\xFF':
1915            self.assertCorrectUTF8Decoding(bytes([byte]), '\ufffd',
1916                                           'invalid start byte')
1917
1918    def test_unexpected_end_of_data(self):
1919        """
1920        Test that an 'unexpected end of data' error is raised when the string
1921        ends after a start byte of a 2-, 3-, or 4-bytes sequence without having
1922        enough continuation bytes.  The incomplete sequence is replaced with a
1923        single U+FFFD when errors='replace'.
1924        E.g. in the sequence <F3 80 80>, F3 is the start byte of a 4-bytes
1925        sequence, but it's followed by only 2 valid continuation bytes and the
1926        last continuation bytes is missing.
1927        Note: the continuation bytes must be all valid, if one of them is
1928        invalid another error will be raised.
1929        """
1930        sequences = [
1931            'C2', 'DF',
1932            'E0 A0', 'E0 BF', 'E1 80', 'E1 BF', 'EC 80', 'EC BF',
1933            'ED 80', 'ED 9F', 'EE 80', 'EE BF', 'EF 80', 'EF BF',
1934            'F0 90', 'F0 BF', 'F0 90 80', 'F0 90 BF', 'F0 BF 80', 'F0 BF BF',
1935            'F1 80', 'F1 BF', 'F1 80 80', 'F1 80 BF', 'F1 BF 80', 'F1 BF BF',
1936            'F3 80', 'F3 BF', 'F3 80 80', 'F3 80 BF', 'F3 BF 80', 'F3 BF BF',
1937            'F4 80', 'F4 8F', 'F4 80 80', 'F4 80 BF', 'F4 8F 80', 'F4 8F BF'
1938        ]
1939        FFFD = '\ufffd'
1940        for seq in sequences:
1941            self.assertCorrectUTF8Decoding(bytes.fromhex(seq), '\ufffd',
1942                                           'unexpected end of data')
1943
1944    def test_invalid_cb_for_2bytes_seq(self):
1945        """
1946        Test that an 'invalid continuation byte' error is raised when the
1947        continuation byte of a 2-bytes sequence is invalid.  The start byte
1948        is replaced by a single U+FFFD and the second byte is handled
1949        separately when errors='replace'.
1950        E.g. in the sequence <C2 41>, C2 is the start byte of a 2-bytes
1951        sequence, but 41 is not a valid continuation byte because it's the
1952        ASCII letter 'A'.
1953        """
1954        FFFD = '\ufffd'
1955        FFFDx2 = FFFD * 2
1956        sequences = [
1957            ('C2 00', FFFD+'\x00'), ('C2 7F', FFFD+'\x7f'),
1958            ('C2 C0', FFFDx2), ('C2 FF', FFFDx2),
1959            ('DF 00', FFFD+'\x00'), ('DF 7F', FFFD+'\x7f'),
1960            ('DF C0', FFFDx2), ('DF FF', FFFDx2),
1961        ]
1962        for seq, res in sequences:
1963            self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res,
1964                                           'invalid continuation byte')
1965
    def test_invalid_cb_for_3bytes_seq(self):
        """
        Test that an 'invalid continuation byte' error is raised when the
        continuation byte(s) of a 3-bytes sequence are invalid.  When
        errors='replace', if the first continuation byte is valid, the first
        two bytes (start byte + 1st cb) are replaced by a single U+FFFD and the
        third byte is handled separately, otherwise only the start byte is
        replaced with a U+FFFD and the other continuation bytes are handled
        separately.
        E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
        sequence, 80 is a valid continuation byte, but 41 is not a valid cb
        because it's the ASCII letter 'A'.
        Note: when the start byte is E0 or ED, the valid ranges for the first
        continuation byte are limited to A0..BF and 80..9F respectively.
        Python 2 used to consider all the bytes in range 80..BF valid when the
        start byte was ED.  This is fixed in Python 3.
        """
        FFFD = '\ufffd'
        FFFDx2 = FFFD * 2
        # Each entry is (bytes in hex, expected text with errors='replace').
        sequences = [
            ('E0 00', FFFD+'\x00'), ('E0 7F', FFFD+'\x7f'), ('E0 80', FFFDx2),
            ('E0 9F', FFFDx2), ('E0 C0', FFFDx2), ('E0 FF', FFFDx2),
            ('E0 A0 00', FFFD+'\x00'), ('E0 A0 7F', FFFD+'\x7f'),
            ('E0 A0 C0', FFFDx2), ('E0 A0 FF', FFFDx2),
            ('E0 BF 00', FFFD+'\x00'), ('E0 BF 7F', FFFD+'\x7f'),
            ('E0 BF C0', FFFDx2), ('E0 BF FF', FFFDx2), ('E1 00', FFFD+'\x00'),
            ('E1 7F', FFFD+'\x7f'), ('E1 C0', FFFDx2), ('E1 FF', FFFDx2),
            ('E1 80 00', FFFD+'\x00'), ('E1 80 7F', FFFD+'\x7f'),
            ('E1 80 C0', FFFDx2), ('E1 80 FF', FFFDx2),
            ('E1 BF 00', FFFD+'\x00'), ('E1 BF 7F', FFFD+'\x7f'),
            ('E1 BF C0', FFFDx2), ('E1 BF FF', FFFDx2), ('EC 00', FFFD+'\x00'),
            ('EC 7F', FFFD+'\x7f'), ('EC C0', FFFDx2), ('EC FF', FFFDx2),
            ('EC 80 00', FFFD+'\x00'), ('EC 80 7F', FFFD+'\x7f'),
            ('EC 80 C0', FFFDx2), ('EC 80 FF', FFFDx2),
            ('EC BF 00', FFFD+'\x00'), ('EC BF 7F', FFFD+'\x7f'),
            ('EC BF C0', FFFDx2), ('EC BF FF', FFFDx2), ('ED 00', FFFD+'\x00'),
            ('ED 7F', FFFD+'\x7f'),
            # A0 and BF are outside the 80..9F range allowed after ED (see
            # the note in the docstring).
            ('ED A0', FFFDx2), ('ED BF', FFFDx2),
            ('ED C0', FFFDx2), ('ED FF', FFFDx2), ('ED 80 00', FFFD+'\x00'),
            ('ED 80 7F', FFFD+'\x7f'), ('ED 80 C0', FFFDx2),
            ('ED 80 FF', FFFDx2), ('ED 9F 00', FFFD+'\x00'),
            ('ED 9F 7F', FFFD+'\x7f'), ('ED 9F C0', FFFDx2),
            ('ED 9F FF', FFFDx2), ('EE 00', FFFD+'\x00'),
            ('EE 7F', FFFD+'\x7f'), ('EE C0', FFFDx2), ('EE FF', FFFDx2),
            ('EE 80 00', FFFD+'\x00'), ('EE 80 7F', FFFD+'\x7f'),
            ('EE 80 C0', FFFDx2), ('EE 80 FF', FFFDx2),
            ('EE BF 00', FFFD+'\x00'), ('EE BF 7F', FFFD+'\x7f'),
            ('EE BF C0', FFFDx2), ('EE BF FF', FFFDx2), ('EF 00', FFFD+'\x00'),
            ('EF 7F', FFFD+'\x7f'), ('EF C0', FFFDx2), ('EF FF', FFFDx2),
            ('EF 80 00', FFFD+'\x00'), ('EF 80 7F', FFFD+'\x7f'),
            ('EF 80 C0', FFFDx2), ('EF 80 FF', FFFDx2),
            ('EF BF 00', FFFD+'\x00'), ('EF BF 7F', FFFD+'\x7f'),
            ('EF BF C0', FFFDx2), ('EF BF FF', FFFDx2),
        ]
        for seq, res in sequences:
            self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res,
                                           'invalid continuation byte')
2023
    def test_invalid_cb_for_4bytes_seq(self):
        """
        Test that an 'invalid continuation byte' error is raised when the
        continuation byte(s) of a 4-bytes sequence are invalid.  When
        errors='replace', the start byte and all the following valid
        continuation bytes are replaced with a single U+FFFD, and all the bytes
        starting from the first invalid continuation bytes (included) are
        handled separately.
        E.g. in the sequence <F1 80 80 7F>, F1 is the start byte of a 4-bytes
        sequence, 80 80 are valid continuation bytes, but 7F is not a valid cb
        because it's an ASCII character: the expected result is a single
        U+FFFD followed by U+007F.
        Note: when the start byte is F0 or F4, the valid ranges for the first
        continuation byte are limited to 90..BF and 80..8F respectively, which
        is why <F0 80>, <F0 8F>, <F4 90> and <F4 BF> are listed below as
        invalid.
        """
        FFFD = '\ufffd'
        FFFDx2 = FFFD * 2
        # Each entry is (bytes in hex, expected text with errors='replace').
        sequences = [
            ('F0 00', FFFD+'\x00'), ('F0 7F', FFFD+'\x7f'), ('F0 80', FFFDx2),
            ('F0 8F', FFFDx2), ('F0 C0', FFFDx2), ('F0 FF', FFFDx2),
            ('F0 90 00', FFFD+'\x00'), ('F0 90 7F', FFFD+'\x7f'),
            ('F0 90 C0', FFFDx2), ('F0 90 FF', FFFDx2),
            ('F0 BF 00', FFFD+'\x00'), ('F0 BF 7F', FFFD+'\x7f'),
            ('F0 BF C0', FFFDx2), ('F0 BF FF', FFFDx2),
            ('F0 90 80 00', FFFD+'\x00'), ('F0 90 80 7F', FFFD+'\x7f'),
            ('F0 90 80 C0', FFFDx2), ('F0 90 80 FF', FFFDx2),
            ('F0 90 BF 00', FFFD+'\x00'), ('F0 90 BF 7F', FFFD+'\x7f'),
            ('F0 90 BF C0', FFFDx2), ('F0 90 BF FF', FFFDx2),
            ('F0 BF 80 00', FFFD+'\x00'), ('F0 BF 80 7F', FFFD+'\x7f'),
            ('F0 BF 80 C0', FFFDx2), ('F0 BF 80 FF', FFFDx2),
            ('F0 BF BF 00', FFFD+'\x00'), ('F0 BF BF 7F', FFFD+'\x7f'),
            ('F0 BF BF C0', FFFDx2), ('F0 BF BF FF', FFFDx2),
            ('F1 00', FFFD+'\x00'), ('F1 7F', FFFD+'\x7f'), ('F1 C0', FFFDx2),
            ('F1 FF', FFFDx2), ('F1 80 00', FFFD+'\x00'),
            ('F1 80 7F', FFFD+'\x7f'), ('F1 80 C0', FFFDx2),
            ('F1 80 FF', FFFDx2), ('F1 BF 00', FFFD+'\x00'),
            ('F1 BF 7F', FFFD+'\x7f'), ('F1 BF C0', FFFDx2),
            ('F1 BF FF', FFFDx2), ('F1 80 80 00', FFFD+'\x00'),
            ('F1 80 80 7F', FFFD+'\x7f'), ('F1 80 80 C0', FFFDx2),
            ('F1 80 80 FF', FFFDx2), ('F1 80 BF 00', FFFD+'\x00'),
            ('F1 80 BF 7F', FFFD+'\x7f'), ('F1 80 BF C0', FFFDx2),
            ('F1 80 BF FF', FFFDx2), ('F1 BF 80 00', FFFD+'\x00'),
            ('F1 BF 80 7F', FFFD+'\x7f'), ('F1 BF 80 C0', FFFDx2),
            ('F1 BF 80 FF', FFFDx2), ('F1 BF BF 00', FFFD+'\x00'),
            ('F1 BF BF 7F', FFFD+'\x7f'), ('F1 BF BF C0', FFFDx2),
            ('F1 BF BF FF', FFFDx2), ('F3 00', FFFD+'\x00'),
            ('F3 7F', FFFD+'\x7f'), ('F3 C0', FFFDx2), ('F3 FF', FFFDx2),
            ('F3 80 00', FFFD+'\x00'), ('F3 80 7F', FFFD+'\x7f'),
            ('F3 80 C0', FFFDx2), ('F3 80 FF', FFFDx2),
            ('F3 BF 00', FFFD+'\x00'), ('F3 BF 7F', FFFD+'\x7f'),
            ('F3 BF C0', FFFDx2), ('F3 BF FF', FFFDx2),
            ('F3 80 80 00', FFFD+'\x00'), ('F3 80 80 7F', FFFD+'\x7f'),
            ('F3 80 80 C0', FFFDx2), ('F3 80 80 FF', FFFDx2),
            ('F3 80 BF 00', FFFD+'\x00'), ('F3 80 BF 7F', FFFD+'\x7f'),
            ('F3 80 BF C0', FFFDx2), ('F3 80 BF FF', FFFDx2),
            ('F3 BF 80 00', FFFD+'\x00'), ('F3 BF 80 7F', FFFD+'\x7f'),
            ('F3 BF 80 C0', FFFDx2), ('F3 BF 80 FF', FFFDx2),
            ('F3 BF BF 00', FFFD+'\x00'), ('F3 BF BF 7F', FFFD+'\x7f'),
            ('F3 BF BF C0', FFFDx2), ('F3 BF BF FF', FFFDx2),
            ('F4 00', FFFD+'\x00'), ('F4 7F', FFFD+'\x7f'), ('F4 90', FFFDx2),
            ('F4 BF', FFFDx2), ('F4 C0', FFFDx2), ('F4 FF', FFFDx2),
            ('F4 80 00', FFFD+'\x00'), ('F4 80 7F', FFFD+'\x7f'),
            ('F4 80 C0', FFFDx2), ('F4 80 FF', FFFDx2),
            ('F4 8F 00', FFFD+'\x00'), ('F4 8F 7F', FFFD+'\x7f'),
            ('F4 8F C0', FFFDx2), ('F4 8F FF', FFFDx2),
            ('F4 80 80 00', FFFD+'\x00'), ('F4 80 80 7F', FFFD+'\x7f'),
            ('F4 80 80 C0', FFFDx2), ('F4 80 80 FF', FFFDx2),
            ('F4 80 BF 00', FFFD+'\x00'), ('F4 80 BF 7F', FFFD+'\x7f'),
            ('F4 80 BF C0', FFFDx2), ('F4 80 BF FF', FFFDx2),
            ('F4 8F 80 00', FFFD+'\x00'), ('F4 8F 80 7F', FFFD+'\x7f'),
            ('F4 8F 80 C0', FFFDx2), ('F4 8F 80 FF', FFFDx2),
            ('F4 8F BF 00', FFFD+'\x00'), ('F4 8F BF 7F', FFFD+'\x7f'),
            ('F4 8F BF C0', FFFDx2), ('F4 8F BF FF', FFFDx2)
        ]
        for seq, res in sequences:
            self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res,
                                           'invalid continuation byte')
2102
2103    def test_codecs_idna(self):
2104        # Test whether trailing dot is preserved
2105        self.assertEqual("www.python.org.".encode("idna"), b"www.python.org.")
2106
2107    def test_codecs_errors(self):
2108        # Error handling (encoding)
2109        self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii')
2110        self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii','strict')
2111        self.assertEqual('Andr\202 x'.encode('ascii','ignore'), b"Andr x")
2112        self.assertEqual('Andr\202 x'.encode('ascii','replace'), b"Andr? x")
2113        self.assertEqual('Andr\202 x'.encode('ascii', 'replace'),
2114                         'Andr\202 x'.encode('ascii', errors='replace'))
2115        self.assertEqual('Andr\202 x'.encode('ascii', 'ignore'),
2116                         'Andr\202 x'.encode(encoding='ascii', errors='ignore'))
2117
2118        # Error handling (decoding)
2119        self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii')
2120        self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii', 'strict')
2121        self.assertEqual(str(b'Andr\202 x', 'ascii', 'ignore'), "Andr x")
2122        self.assertEqual(str(b'Andr\202 x', 'ascii', 'replace'), 'Andr\uFFFD x')
2123        self.assertEqual(str(b'\202 x', 'ascii', 'replace'), '\uFFFD x')
2124
2125        # Error handling (unknown character names)
2126        self.assertEqual(b"\\N{foo}xx".decode("unicode-escape", "ignore"), "xx")
2127
2128        # Error handling (truncated escape sequence)
2129        self.assertRaises(UnicodeError, b"\\".decode, "unicode-escape")
2130
2131        self.assertRaises(TypeError, b"hello".decode, "test.unicode1")
2132        self.assertRaises(TypeError, str, b"hello", "test.unicode2")
2133        self.assertRaises(TypeError, "hello".encode, "test.unicode1")
2134        self.assertRaises(TypeError, "hello".encode, "test.unicode2")
2135
2136        # Error handling (wrong arguments)
2137        self.assertRaises(TypeError, "hello".encode, 42, 42, 42)
2138
2139        # Error handling (lone surrogate in
2140        # _PyUnicode_TransformDecimalAndSpaceToASCII())
2141        self.assertRaises(ValueError, int, "\ud800")
2142        self.assertRaises(ValueError, int, "\udf00")
2143        self.assertRaises(ValueError, float, "\ud800")
2144        self.assertRaises(ValueError, float, "\udf00")
2145        self.assertRaises(ValueError, complex, "\ud800")
2146        self.assertRaises(ValueError, complex, "\udf00")
2147
2148    def test_codecs(self):
2149        # Encoding
2150        self.assertEqual('hello'.encode('ascii'), b'hello')
2151        self.assertEqual('hello'.encode('utf-7'), b'hello')
2152        self.assertEqual('hello'.encode('utf-8'), b'hello')
2153        self.assertEqual('hello'.encode('utf-8'), b'hello')
2154        self.assertEqual('hello'.encode('utf-16-le'), b'h\000e\000l\000l\000o\000')
2155        self.assertEqual('hello'.encode('utf-16-be'), b'\000h\000e\000l\000l\000o')
2156        self.assertEqual('hello'.encode('latin-1'), b'hello')
2157
2158        # Default encoding is utf-8
2159        self.assertEqual('\u2603'.encode(), b'\xe2\x98\x83')
2160
2161        # Roundtrip safety for BMP (just the first 1024 chars)
2162        for c in range(1024):
2163            u = chr(c)
2164            for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
2165                             'utf-16-be', 'raw_unicode_escape',
2166                             'unicode_escape'):
2167                self.assertEqual(str(u.encode(encoding),encoding), u)
2168
2169        # Roundtrip safety for BMP (just the first 256 chars)
2170        for c in range(256):
2171            u = chr(c)
2172            for encoding in ('latin-1',):
2173                self.assertEqual(str(u.encode(encoding),encoding), u)
2174
2175        # Roundtrip safety for BMP (just the first 128 chars)
2176        for c in range(128):
2177            u = chr(c)
2178            for encoding in ('ascii',):
2179                self.assertEqual(str(u.encode(encoding),encoding), u)
2180
2181        # Roundtrip safety for non-BMP (just a few chars)
2182        with warnings.catch_warnings():
2183            u = '\U00010001\U00020002\U00030003\U00040004\U00050005'
2184            for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
2185                             'raw_unicode_escape', 'unicode_escape'):
2186                self.assertEqual(str(u.encode(encoding),encoding), u)
2187
2188        # UTF-8 must be roundtrip safe for all code points
2189        # (except surrogates, which are forbidden).
2190        u = ''.join(map(chr, list(range(0, 0xd800)) +
2191                             list(range(0xe000, 0x110000))))
2192        for encoding in ('utf-8',):
2193            self.assertEqual(str(u.encode(encoding),encoding), u)
2194
2195    def test_codecs_charmap(self):
2196        # 0-127
2197        s = bytes(range(128))
2198        for encoding in (
2199            'cp037', 'cp1026', 'cp273',
2200            'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
2201            'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
2202            'cp863', 'cp865', 'cp866', 'cp1125',
2203            'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
2204            'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
2205            'iso8859_7', 'iso8859_9',
2206            'koi8_r', 'koi8_t', 'koi8_u', 'kz1048', 'latin_1',
2207            'mac_cyrillic', 'mac_latin2',
2208
2209            'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
2210            'cp1256', 'cp1257', 'cp1258',
2211            'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
2212
2213            'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
2214            'cp1006', 'iso8859_8',
2215
2216            ### These have undefined mappings:
2217            #'cp424',
2218
2219            ### These fail the round-trip:
2220            #'cp875'
2221
2222            ):
2223            self.assertEqual(str(s, encoding).encode(encoding), s)
2224
2225        # 128-255
2226        s = bytes(range(128, 256))
2227        for encoding in (
2228            'cp037', 'cp1026', 'cp273',
2229            'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
2230            'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
2231            'cp863', 'cp865', 'cp866', 'cp1125',
2232            'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
2233            'iso8859_2', 'iso8859_4', 'iso8859_5',
2234            'iso8859_9', 'koi8_r', 'koi8_u', 'latin_1',
2235            'mac_cyrillic', 'mac_latin2',
2236
2237            ### These have undefined mappings:
2238            #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
2239            #'cp1256', 'cp1257', 'cp1258',
2240            #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
2241            #'iso8859_3', 'iso8859_6', 'iso8859_7', 'koi8_t', 'kz1048',
2242            #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
2243
2244            ### These fail the round-trip:
2245            #'cp1006', 'cp875', 'iso8859_8',
2246
2247            ):
2248            self.assertEqual(str(s, encoding).encode(encoding), s)
2249
2250    def test_concatenation(self):
2251        self.assertEqual(("abc" "def"), "abcdef")
2252        self.assertEqual(("abc" "def"), "abcdef")
2253        self.assertEqual(("abc" "def"), "abcdef")
2254        self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
2255        self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
2256
2257    def test_ucs4(self):
2258        x = '\U00100000'
2259        y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
2260        self.assertEqual(x, y)
2261
2262        y = br'\U00100000'
2263        x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2264        self.assertEqual(x, y)
2265        y = br'\U00010000'
2266        x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2267        self.assertEqual(x, y)
2268
2269        try:
2270            br'\U11111111'.decode("raw-unicode-escape")
2271        except UnicodeDecodeError as e:
2272            self.assertEqual(e.start, 0)
2273            self.assertEqual(e.end, 10)
2274        else:
2275            self.fail("Should have raised UnicodeDecodeError")
2276
2277    def test_conversion(self):
2278        # Make sure __str__() works properly
2279        class ObjectToStr:
2280            def __str__(self):
2281                return "foo"
2282
2283        class StrSubclassToStr(str):
2284            def __str__(self):
2285                return "foo"
2286
2287        class StrSubclassToStrSubclass(str):
2288            def __new__(cls, content=""):
2289                return str.__new__(cls, 2*content)
2290            def __str__(self):
2291                return self
2292
2293        self.assertEqual(str(ObjectToStr()), "foo")
2294        self.assertEqual(str(StrSubclassToStr("bar")), "foo")
2295        s = str(StrSubclassToStrSubclass("foo"))
2296        self.assertEqual(s, "foofoo")
2297        self.assertIs(type(s), StrSubclassToStrSubclass)
2298        s = StrSubclass(StrSubclassToStrSubclass("foo"))
2299        self.assertEqual(s, "foofoo")
2300        self.assertIs(type(s), StrSubclass)
2301
2302    def test_unicode_repr(self):
2303        class s1:
2304            def __repr__(self):
2305                return '\\n'
2306
2307        class s2:
2308            def __repr__(self):
2309                return '\\n'
2310
2311        self.assertEqual(repr(s1()), '\\n')
2312        self.assertEqual(repr(s2()), '\\n')
2313
2314    def test_printable_repr(self):
2315        self.assertEqual(repr('\U00010000'), "'%c'" % (0x10000,)) # printable
2316        self.assertEqual(repr('\U00014000'), "'\\U00014000'")     # nonprintable
2317
2318    # This test only affects 32-bit platforms because expandtabs can only take
2319    # an int as the max value, not a 64-bit C long.  If expandtabs is changed
2320    # to take a 64-bit long, this test should apply to all platforms.
2321    @unittest.skipIf(sys.maxsize > (1 << 32) or struct.calcsize('P') != 4,
2322                     'only applies to 32-bit platforms')
2323    def test_expandtabs_overflows_gracefully(self):
2324        self.assertRaises(OverflowError, 't\tt\t'.expandtabs, sys.maxsize)
2325
2326    @support.cpython_only
2327    def test_expandtabs_optimization(self):
2328        s = 'abc'
2329        self.assertIs(s.expandtabs(), s)
2330
2331    def test_raiseMemError(self):
2332        if struct.calcsize('P') == 8:
2333            # 64 bits pointers
2334            ascii_struct_size = 48
2335            compact_struct_size = 72
2336        else:
2337            # 32 bits pointers
2338            ascii_struct_size = 24
2339            compact_struct_size = 36
2340
2341        for char in ('a', '\xe9', '\u20ac', '\U0010ffff'):
2342            code = ord(char)
2343            if code < 0x100:
2344                char_size = 1  # sizeof(Py_UCS1)
2345                struct_size = ascii_struct_size
2346            elif code < 0x10000:
2347                char_size = 2  # sizeof(Py_UCS2)
2348                struct_size = compact_struct_size
2349            else:
2350                char_size = 4  # sizeof(Py_UCS4)
2351                struct_size = compact_struct_size
2352            # Note: sys.maxsize is half of the actual max allocation because of
2353            # the signedness of Py_ssize_t. Strings of maxlen-1 should in principle
2354            # be allocatable, given enough memory.
2355            maxlen = ((sys.maxsize - struct_size) // char_size)
2356            alloc = lambda: char * maxlen
2357            self.assertRaises(MemoryError, alloc)
2358            self.assertRaises(MemoryError, alloc)
2359
2360    def test_format_subclass(self):
2361        class S(str):
2362            def __str__(self):
2363                return '__str__ overridden'
2364        s = S('xxx')
2365        self.assertEqual("%s" % s, '__str__ overridden')
2366        self.assertEqual("{}".format(s), '__str__ overridden')
2367
2368    def test_subclass_add(self):
2369        class S(str):
2370            def __add__(self, o):
2371                return "3"
2372        self.assertEqual(S("4") + S("5"), "3")
2373        class S(str):
2374            def __iadd__(self, o):
2375                return "3"
2376        s = S("1")
2377        s += "4"
2378        self.assertEqual(s, "3")
2379
2380    def test_getnewargs(self):
2381        text = 'abc'
2382        args = text.__getnewargs__()
2383        self.assertIsNot(args[0], text)
2384        self.assertEqual(args[0], text)
2385        self.assertEqual(len(args), 1)
2386
    @support.cpython_only
    @support.requires_legacy_unicode_capi
    def test_resize(self):
        """
        Check that _testcapi.getargs_u() returns the up-to-date content of a
        string both before and after the string has been extended with +=.

        NOTE(review): the scenario presumably relies on CPython extending in
        place a string that has a single reference -- confirm; the statement
        pattern below must not be restructured.
        """
        from _testcapi import getargs_u
        for length in range(1, 100, 7):
            # generate a fresh string (refcount=1)
            text = 'a' * length + 'b'

            # fill wstr internal field
            with self.assertWarns(DeprecationWarning):
                abc = getargs_u(text)
            self.assertEqual(abc, text)

            # resize text: wstr field must be cleared and then recomputed
            text += 'c'
            with self.assertWarns(DeprecationWarning):
                abcdef = getargs_u(text)
            # The second conversion must reflect the appended character.
            self.assertNotEqual(abc, abcdef)
            self.assertEqual(abcdef, text)
2406
2407    def test_compare(self):
2408        # Issue #17615
2409        N = 10
2410        ascii = 'a' * N
2411        ascii2 = 'z' * N
2412        latin = '\x80' * N
2413        latin2 = '\xff' * N
2414        bmp = '\u0100' * N
2415        bmp2 = '\uffff' * N
2416        astral = '\U00100000' * N
2417        astral2 = '\U0010ffff' * N
2418        strings = (
2419            ascii, ascii2,
2420            latin, latin2,
2421            bmp, bmp2,
2422            astral, astral2)
2423        for text1, text2 in itertools.combinations(strings, 2):
2424            equal = (text1 is text2)
2425            self.assertEqual(text1 == text2, equal)
2426            self.assertEqual(text1 != text2, not equal)
2427
2428            if equal:
2429                self.assertTrue(text1 <= text2)
2430                self.assertTrue(text1 >= text2)
2431
2432                # text1 is text2: duplicate strings to skip the "str1 == str2"
2433                # optimization in unicode_compare_eq() and really compare
2434                # character per character
2435                copy1 = duplicate_string(text1)
2436                copy2 = duplicate_string(text2)
2437                self.assertIsNot(copy1, copy2)
2438
2439                self.assertTrue(copy1 == copy2)
2440                self.assertFalse(copy1 != copy2)
2441
2442                self.assertTrue(copy1 <= copy2)
2443                self.assertTrue(copy2 >= copy2)
2444
2445        self.assertTrue(ascii < ascii2)
2446        self.assertTrue(ascii < latin)
2447        self.assertTrue(ascii < bmp)
2448        self.assertTrue(ascii < astral)
2449        self.assertFalse(ascii >= ascii2)
2450        self.assertFalse(ascii >= latin)
2451        self.assertFalse(ascii >= bmp)
2452        self.assertFalse(ascii >= astral)
2453
2454        self.assertFalse(latin < ascii)
2455        self.assertTrue(latin < latin2)
2456        self.assertTrue(latin < bmp)
2457        self.assertTrue(latin < astral)
2458        self.assertTrue(latin >= ascii)
2459        self.assertFalse(latin >= latin2)
2460        self.assertFalse(latin >= bmp)
2461        self.assertFalse(latin >= astral)
2462
2463        self.assertFalse(bmp < ascii)
2464        self.assertFalse(bmp < latin)
2465        self.assertTrue(bmp < bmp2)
2466        self.assertTrue(bmp < astral)
2467        self.assertTrue(bmp >= ascii)
2468        self.assertTrue(bmp >= latin)
2469        self.assertFalse(bmp >= bmp2)
2470        self.assertFalse(bmp >= astral)
2471
2472        self.assertFalse(astral < ascii)
2473        self.assertFalse(astral < latin)
2474        self.assertFalse(astral < bmp2)
2475        self.assertTrue(astral < astral2)
2476        self.assertTrue(astral >= ascii)
2477        self.assertTrue(astral >= latin)
2478        self.assertTrue(astral >= bmp2)
2479        self.assertFalse(astral >= astral2)
2480
2481    def test_free_after_iterating(self):
2482        support.check_free_after_iterating(self, iter, str)
2483        support.check_free_after_iterating(self, reversed, str)
2484
    def test_check_encoding_errors(self):
        """
        Run a script in a child interpreter started with '-X dev' and check
        that it exits with status 10.

        The script calls str(bytes, ...) and str.encode() with an unknown
        encoding name and/or an unknown error handler name, on empty and
        non-empty data.  Every such call must raise LookupError; when one
        does not, the script exits early with a status between 21 and 24
        identifying the failing group of calls.
        """
        # bpo-37388: str(bytes) and str.decode() must check encoding and errors
        # arguments in dev mode
        encodings = ('ascii', 'utf8', 'latin1')
        invalid = 'Boom, Shaka Laka, Boom!'
        # The script text is built with an f-string: {encodings!r} and
        # {invalid!r} are substituted here, before the child process starts.
        code = textwrap.dedent(f'''
            import sys
            encodings = {encodings!r}

            for data in (b'', b'short string'):
                try:
                    str(data, encoding={invalid!r})
                except LookupError:
                    pass
                else:
                    sys.exit(21)

                try:
                    str(data, errors={invalid!r})
                except LookupError:
                    pass
                else:
                    sys.exit(22)

                for encoding in encodings:
                    try:
                        str(data, encoding, errors={invalid!r})
                    except LookupError:
                        pass
                    else:
                        sys.exit(22)

            for data in ('', 'short string'):
                try:
                    data.encode(encoding={invalid!r})
                except LookupError:
                    pass
                else:
                    sys.exit(23)

                try:
                    data.encode(errors={invalid!r})
                except LookupError:
                    pass
                else:
                    sys.exit(24)

                for encoding in encodings:
                    try:
                        data.encode(encoding, errors={invalid!r})
                    except LookupError:
                        pass
                    else:
                        sys.exit(24)

            sys.exit(10)
        ''')
        # A non-zero status is expected, hence assert_python_failure(); 10 is
        # the status the script uses when every check passed.
        proc = assert_python_failure('-X', 'dev', '-c', code)
        self.assertEqual(proc.rc, 10, proc)
2544
2545
2546class CAPITest(unittest.TestCase):
2547
2548    # Test PyUnicode_FromFormat()
2549    def test_from_format(self):
2550        import_helper.import_module('ctypes')
2551        from ctypes import (
2552            c_char_p,
2553            pythonapi, py_object, sizeof,
2554            c_int, c_long, c_longlong, c_ssize_t,
2555            c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p)
2556        name = "PyUnicode_FromFormat"
2557        _PyUnicode_FromFormat = getattr(pythonapi, name)
2558        _PyUnicode_FromFormat.argtypes = (c_char_p,)
2559        _PyUnicode_FromFormat.restype = py_object
2560
2561        def PyUnicode_FromFormat(format, *args):
2562            cargs = tuple(
2563                py_object(arg) if isinstance(arg, str) else arg
2564                for arg in args)
2565            return _PyUnicode_FromFormat(format, *cargs)
2566
2567        def check_format(expected, format, *args):
2568            text = PyUnicode_FromFormat(format, *args)
2569            self.assertEqual(expected, text)
2570
2571        # ascii format, non-ascii argument
2572        check_format('ascii\x7f=unicode\xe9',
2573                     b'ascii\x7f=%U', 'unicode\xe9')
2574
2575        # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
2576        # raises an error
2577        self.assertRaisesRegex(ValueError,
2578            r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
2579            'string, got a non-ASCII byte: 0xe9$',
2580            PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')
2581
2582        # test "%c"
2583        check_format('\uabcd',
2584                     b'%c', c_int(0xabcd))
2585        check_format('\U0010ffff',
2586                     b'%c', c_int(0x10ffff))
2587        with self.assertRaises(OverflowError):
2588            PyUnicode_FromFormat(b'%c', c_int(0x110000))
2589        # Issue #18183
2590        check_format('\U00010000\U00100000',
2591                     b'%c%c', c_int(0x10000), c_int(0x100000))
2592
2593        # test "%"
2594        check_format('%',
2595                     b'%')
2596        check_format('%',
2597                     b'%%')
2598        check_format('%s',
2599                     b'%%s')
2600        check_format('[%]',
2601                     b'[%%]')
2602        check_format('%abc',
2603                     b'%%%s', b'abc')
2604
2605        # truncated string
2606        check_format('abc',
2607                     b'%.3s', b'abcdef')
2608        check_format('abc[\ufffd',
2609                     b'%.5s', 'abc[\u20ac]'.encode('utf8'))
2610        check_format("'\\u20acABC'",
2611                     b'%A', '\u20acABC')
2612        check_format("'\\u20",
2613                     b'%.5A', '\u20acABCDEF')
2614        check_format("'\u20acABC'",
2615                     b'%R', '\u20acABC')
2616        check_format("'\u20acA",
2617                     b'%.3R', '\u20acABCDEF')
2618        check_format('\u20acAB',
2619                     b'%.3S', '\u20acABCDEF')
2620        check_format('\u20acAB',
2621                     b'%.3U', '\u20acABCDEF')
2622        check_format('\u20acAB',
2623                     b'%.3V', '\u20acABCDEF', None)
2624        check_format('abc[\ufffd',
2625                     b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))
2626
2627        # following tests comes from #7330
2628        # test width modifier and precision modifier with %S
2629        check_format("repr=  abc",
2630                     b'repr=%5S', 'abc')
2631        check_format("repr=ab",
2632                     b'repr=%.2S', 'abc')
2633        check_format("repr=   ab",
2634                     b'repr=%5.2S', 'abc')
2635
2636        # test width modifier and precision modifier with %R
2637        check_format("repr=   'abc'",
2638                     b'repr=%8R', 'abc')
2639        check_format("repr='ab",
2640                     b'repr=%.3R', 'abc')
2641        check_format("repr=  'ab",
2642                     b'repr=%5.3R', 'abc')
2643
2644        # test width modifier and precision modifier with %A
2645        check_format("repr=   'abc'",
2646                     b'repr=%8A', 'abc')
2647        check_format("repr='ab",
2648                     b'repr=%.3A', 'abc')
2649        check_format("repr=  'ab",
2650                     b'repr=%5.3A', 'abc')
2651
2652        # test width modifier and precision modifier with %s
2653        check_format("repr=  abc",
2654                     b'repr=%5s', b'abc')
2655        check_format("repr=ab",
2656                     b'repr=%.2s', b'abc')
2657        check_format("repr=   ab",
2658                     b'repr=%5.2s', b'abc')
2659
2660        # test width modifier and precision modifier with %U
2661        check_format("repr=  abc",
2662                     b'repr=%5U', 'abc')
2663        check_format("repr=ab",
2664                     b'repr=%.2U', 'abc')
2665        check_format("repr=   ab",
2666                     b'repr=%5.2U', 'abc')
2667
2668        # test width modifier and precision modifier with %V
2669        check_format("repr=  abc",
2670                     b'repr=%5V', 'abc', b'123')
2671        check_format("repr=ab",
2672                     b'repr=%.2V', 'abc', b'123')
2673        check_format("repr=   ab",
2674                     b'repr=%5.2V', 'abc', b'123')
2675        check_format("repr=  123",
2676                     b'repr=%5V', None, b'123')
2677        check_format("repr=12",
2678                     b'repr=%.2V', None, b'123')
2679        check_format("repr=   12",
2680                     b'repr=%5.2V', None, b'123')
2681
2682        # test integer formats (%i, %d, %u)
2683        check_format('010',
2684                     b'%03i', c_int(10))
2685        check_format('0010',
2686                     b'%0.4i', c_int(10))
2687        check_format('-123',
2688                     b'%i', c_int(-123))
2689        check_format('-123',
2690                     b'%li', c_long(-123))
2691        check_format('-123',
2692                     b'%lli', c_longlong(-123))
2693        check_format('-123',
2694                     b'%zi', c_ssize_t(-123))
2695
2696        check_format('-123',
2697                     b'%d', c_int(-123))
2698        check_format('-123',
2699                     b'%ld', c_long(-123))
2700        check_format('-123',
2701                     b'%lld', c_longlong(-123))
2702        check_format('-123',
2703                     b'%zd', c_ssize_t(-123))
2704
2705        check_format('123',
2706                     b'%u', c_uint(123))
2707        check_format('123',
2708                     b'%lu', c_ulong(123))
2709        check_format('123',
2710                     b'%llu', c_ulonglong(123))
2711        check_format('123',
2712                     b'%zu', c_size_t(123))
2713
2714        # test long output
2715        min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1))
2716        max_longlong = -min_longlong - 1
2717        check_format(str(min_longlong),
2718                     b'%lld', c_longlong(min_longlong))
2719        check_format(str(max_longlong),
2720                     b'%lld', c_longlong(max_longlong))
2721        max_ulonglong = 2 ** (8 * sizeof(c_ulonglong)) - 1
2722        check_format(str(max_ulonglong),
2723                     b'%llu', c_ulonglong(max_ulonglong))
2724        PyUnicode_FromFormat(b'%p', c_void_p(-1))
2725
2726        # test padding (width and/or precision)
2727        check_format('123'.rjust(10, '0'),
2728                     b'%010i', c_int(123))
2729        check_format('123'.rjust(100),
2730                     b'%100i', c_int(123))
2731        check_format('123'.rjust(100, '0'),
2732                     b'%.100i', c_int(123))
2733        check_format('123'.rjust(80, '0').rjust(100),
2734                     b'%100.80i', c_int(123))
2735
2736        check_format('123'.rjust(10, '0'),
2737                     b'%010u', c_uint(123))
2738        check_format('123'.rjust(100),
2739                     b'%100u', c_uint(123))
2740        check_format('123'.rjust(100, '0'),
2741                     b'%.100u', c_uint(123))
2742        check_format('123'.rjust(80, '0').rjust(100),
2743                     b'%100.80u', c_uint(123))
2744
2745        check_format('123'.rjust(10, '0'),
2746                     b'%010x', c_int(0x123))
2747        check_format('123'.rjust(100),
2748                     b'%100x', c_int(0x123))
2749        check_format('123'.rjust(100, '0'),
2750                     b'%.100x', c_int(0x123))
2751        check_format('123'.rjust(80, '0').rjust(100),
2752                     b'%100.80x', c_int(0x123))
2753
2754        # test %A
2755        check_format(r"%A:'abc\xe9\uabcd\U0010ffff'",
2756                     b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')
2757
2758        # test %V
2759        check_format('repr=abc',
2760                     b'repr=%V', 'abc', b'xyz')
2761
2762        # Test string decode from parameter of %s using utf-8.
2763        # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of
2764        # '\u4eba\u6c11'
2765        check_format('repr=\u4eba\u6c11',
2766                     b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')
2767
2768        #Test replace error handler.
2769        check_format('repr=abc\ufffd',
2770                     b'repr=%V', None, b'abc\xff')
2771
2772        # not supported: copy the raw format string. these tests are just here
2773        # to check for crashes and should not be considered as specifications
2774        check_format('%s',
2775                     b'%1%s', b'abc')
2776        check_format('%1abc',
2777                     b'%1abc')
2778        check_format('%+i',
2779                     b'%+i', c_int(10))
2780        check_format('%.%s',
2781                     b'%.%s', b'abc')
2782
2783        # Issue #33817: empty strings
2784        check_format('',
2785                     b'')
2786        check_format('',
2787                     b'%s', b'')
2788
2789    # Test PyUnicode_AsWideChar()
2790    @support.cpython_only
2791    def test_aswidechar(self):
2792        from _testcapi import unicode_aswidechar
2793        import_helper.import_module('ctypes')
2794        from ctypes import c_wchar, sizeof
2795
2796        wchar, size = unicode_aswidechar('abcdef', 2)
2797        self.assertEqual(size, 2)
2798        self.assertEqual(wchar, 'ab')
2799
2800        wchar, size = unicode_aswidechar('abc', 3)
2801        self.assertEqual(size, 3)
2802        self.assertEqual(wchar, 'abc')
2803
2804        wchar, size = unicode_aswidechar('abc', 4)
2805        self.assertEqual(size, 3)
2806        self.assertEqual(wchar, 'abc\0')
2807
2808        wchar, size = unicode_aswidechar('abc', 10)
2809        self.assertEqual(size, 3)
2810        self.assertEqual(wchar, 'abc\0')
2811
2812        wchar, size = unicode_aswidechar('abc\0def', 20)
2813        self.assertEqual(size, 7)
2814        self.assertEqual(wchar, 'abc\0def\0')
2815
2816        nonbmp = chr(0x10ffff)
2817        if sizeof(c_wchar) == 2:
2818            buflen = 3
2819            nchar = 2
2820        else: # sizeof(c_wchar) == 4
2821            buflen = 2
2822            nchar = 1
2823        wchar, size = unicode_aswidechar(nonbmp, buflen)
2824        self.assertEqual(size, nchar)
2825        self.assertEqual(wchar, nonbmp + '\0')
2826
2827    # Test PyUnicode_AsWideCharString()
2828    @support.cpython_only
2829    def test_aswidecharstring(self):
2830        from _testcapi import unicode_aswidecharstring
2831        import_helper.import_module('ctypes')
2832        from ctypes import c_wchar, sizeof
2833
2834        wchar, size = unicode_aswidecharstring('abc')
2835        self.assertEqual(size, 3)
2836        self.assertEqual(wchar, 'abc\0')
2837
2838        wchar, size = unicode_aswidecharstring('abc\0def')
2839        self.assertEqual(size, 7)
2840        self.assertEqual(wchar, 'abc\0def\0')
2841
2842        nonbmp = chr(0x10ffff)
2843        if sizeof(c_wchar) == 2:
2844            nchar = 2
2845        else: # sizeof(c_wchar) == 4
2846            nchar = 1
2847        wchar, size = unicode_aswidecharstring(nonbmp)
2848        self.assertEqual(size, nchar)
2849        self.assertEqual(wchar, nonbmp + '\0')
2850
2851    # Test PyUnicode_AsUCS4()
2852    @support.cpython_only
2853    def test_asucs4(self):
2854        from _testcapi import unicode_asucs4
2855        for s in ['abc', '\xa1\xa2', '\u4f60\u597d', 'a\U0001f600',
2856                  'a\ud800b\udfffc', '\ud834\udd1e']:
2857            l = len(s)
2858            self.assertEqual(unicode_asucs4(s, l, True), s+'\0')
2859            self.assertEqual(unicode_asucs4(s, l, False), s+'\uffff')
2860            self.assertEqual(unicode_asucs4(s, l+1, True), s+'\0\uffff')
2861            self.assertEqual(unicode_asucs4(s, l+1, False), s+'\0\uffff')
2862            self.assertRaises(SystemError, unicode_asucs4, s, l-1, True)
2863            self.assertRaises(SystemError, unicode_asucs4, s, l-2, False)
2864            s = '\0'.join([s, s])
2865            self.assertEqual(unicode_asucs4(s, len(s), True), s+'\0')
2866            self.assertEqual(unicode_asucs4(s, len(s), False), s+'\uffff')
2867
2868    # Test PyUnicode_AsUTF8()
2869    @support.cpython_only
2870    def test_asutf8(self):
2871        from _testcapi import unicode_asutf8
2872
2873        bmp = '\u0100'
2874        bmp2 = '\uffff'
2875        nonbmp = chr(0x10ffff)
2876
2877        self.assertEqual(unicode_asutf8(bmp), b'\xc4\x80')
2878        self.assertEqual(unicode_asutf8(bmp2), b'\xef\xbf\xbf')
2879        self.assertEqual(unicode_asutf8(nonbmp), b'\xf4\x8f\xbf\xbf')
2880        self.assertRaises(UnicodeEncodeError, unicode_asutf8, 'a\ud800b\udfffc')
2881
2882    # Test PyUnicode_AsUTF8AndSize()
2883    @support.cpython_only
2884    def test_asutf8andsize(self):
2885        from _testcapi import unicode_asutf8andsize
2886
2887        bmp = '\u0100'
2888        bmp2 = '\uffff'
2889        nonbmp = chr(0x10ffff)
2890
2891        self.assertEqual(unicode_asutf8andsize(bmp), (b'\xc4\x80', 2))
2892        self.assertEqual(unicode_asutf8andsize(bmp2), (b'\xef\xbf\xbf', 3))
2893        self.assertEqual(unicode_asutf8andsize(nonbmp), (b'\xf4\x8f\xbf\xbf', 4))
2894        self.assertRaises(UnicodeEncodeError, unicode_asutf8andsize, 'a\ud800b\udfffc')
2895
2896    # Test PyUnicode_FindChar()
2897    @support.cpython_only
2898    def test_findchar(self):
2899        from _testcapi import unicode_findchar
2900
2901        for str in "\xa1", "\u8000\u8080", "\ud800\udc02", "\U0001f100\U0001f1f1":
2902            for i, ch in enumerate(str):
2903                self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), 1), i)
2904                self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), -1), i)
2905
2906        str = "!>_<!"
2907        self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), 1), -1)
2908        self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), -1), -1)
2909        # start < end
2910        self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, 1), 4)
2911        self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, -1), 4)
2912        # start >= end
2913        self.assertEqual(unicode_findchar(str, ord('!'), 0, 0, 1), -1)
2914        self.assertEqual(unicode_findchar(str, ord('!'), len(str), 0, 1), -1)
2915        # negative
2916        self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, 1), 0)
2917        self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, -1), 0)
2918
2919    # Test PyUnicode_CopyCharacters()
2920    @support.cpython_only
2921    def test_copycharacters(self):
2922        from _testcapi import unicode_copycharacters
2923
2924        strings = [
2925            'abcde', '\xa1\xa2\xa3\xa4\xa5',
2926            '\u4f60\u597d\u4e16\u754c\uff01',
2927            '\U0001f600\U0001f601\U0001f602\U0001f603\U0001f604'
2928        ]
2929
2930        for idx, from_ in enumerate(strings):
2931            # wide -> narrow: exceed maxchar limitation
2932            for to in strings[:idx]:
2933                self.assertRaises(
2934                    SystemError,
2935                    unicode_copycharacters, to, 0, from_, 0, 5
2936                )
2937            # same kind
2938            for from_start in range(5):
2939                self.assertEqual(
2940                    unicode_copycharacters(from_, 0, from_, from_start, 5),
2941                    (from_[from_start:from_start+5].ljust(5, '\0'),
2942                     5-from_start)
2943                )
2944            for to_start in range(5):
2945                self.assertEqual(
2946                    unicode_copycharacters(from_, to_start, from_, to_start, 5),
2947                    (from_[to_start:to_start+5].rjust(5, '\0'),
2948                     5-to_start)
2949                )
2950            # narrow -> wide
2951            # Tests omitted since this creates invalid strings.
2952
2953        s = strings[0]
2954        self.assertRaises(IndexError, unicode_copycharacters, s, 6, s, 0, 5)
2955        self.assertRaises(IndexError, unicode_copycharacters, s, -1, s, 0, 5)
2956        self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, 6, 5)
2957        self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, -1, 5)
2958        self.assertRaises(SystemError, unicode_copycharacters, s, 1, s, 0, 5)
2959        self.assertRaises(SystemError, unicode_copycharacters, s, 0, s, 0, -1)
2960        self.assertRaises(SystemError, unicode_copycharacters, s, 0, b'', 0, 0)
2961
2962    @support.cpython_only
2963    @support.requires_legacy_unicode_capi
2964    def test_encode_decimal(self):
2965        from _testcapi import unicode_encodedecimal
2966        with warnings_helper.check_warnings():
2967            warnings.simplefilter('ignore', DeprecationWarning)
2968            self.assertEqual(unicode_encodedecimal('123'),
2969                             b'123')
2970            self.assertEqual(unicode_encodedecimal('\u0663.\u0661\u0664'),
2971                             b'3.14')
2972            self.assertEqual(unicode_encodedecimal(
2973                             "\N{EM SPACE}3.14\N{EN SPACE}"), b' 3.14 ')
2974            self.assertRaises(UnicodeEncodeError,
2975                              unicode_encodedecimal, "123\u20ac", "strict")
2976            self.assertRaisesRegex(
2977                ValueError,
2978                "^'decimal' codec can't encode character",
2979                unicode_encodedecimal, "123\u20ac", "replace")
2980
2981    @support.cpython_only
2982    @support.requires_legacy_unicode_capi
2983    def test_transform_decimal(self):
2984        from _testcapi import unicode_transformdecimaltoascii as transform_decimal
2985        with warnings_helper.check_warnings():
2986            warnings.simplefilter('ignore', DeprecationWarning)
2987            self.assertEqual(transform_decimal('123'),
2988                             '123')
2989            self.assertEqual(transform_decimal('\u0663.\u0661\u0664'),
2990                             '3.14')
2991            self.assertEqual(transform_decimal("\N{EM SPACE}3.14\N{EN SPACE}"),
2992                             "\N{EM SPACE}3.14\N{EN SPACE}")
2993            self.assertEqual(transform_decimal('123\u20ac'),
2994                             '123\u20ac')
2995
2996    @support.cpython_only
2997    def test_pep393_utf8_caching_bug(self):
2998        # Issue #25709: Problem with string concatenation and utf-8 cache
2999        from _testcapi import getargs_s_hash
3000        for k in 0x24, 0xa4, 0x20ac, 0x1f40d:
3001            s = ''
3002            for i in range(5):
3003                # Due to CPython specific optimization the 's' string can be
3004                # resized in-place.
3005                s += chr(k)
3006                # Parsing with the "s#" format code calls indirectly
3007                # PyUnicode_AsUTF8AndSize() which creates the UTF-8
3008                # encoded string cached in the Unicode object.
3009                self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
3010                # Check that the second call returns the same result
3011                self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
3012
class StringModuleTest(unittest.TestCase):
    def test_formatter_parser(self):
        # Pairs of (format string, expected list of 4-tuples obtained by
        # exhausting _string.formatter_parser()).
        expectations = [
            ("prefix {2!s}xxx{0:^+10.3f}{obj.attr!s} {z[0]!s:10}", [
                ('prefix ', '2', '', 's'),
                ('xxx', '0', '^+10.3f', None),
                ('', 'obj.attr', '', 's'),
                (' ', 'z[0]', '10', 's'),
            ]),
            ("prefix {} suffix", [
                ('prefix ', '', '', None),
                (' suffix', None, None, None),
            ]),
            ("str", [
                ('str', None, None, None),
            ]),
            ("", []),
            ("{0}", [
                ('', '0', '', None),
            ]),
        ]
        for template, expected in expectations:
            parsed = list(_string.formatter_parser(template))
            self.assertEqual(parsed, expected)

        # A non-str argument is rejected.
        with self.assertRaises(TypeError):
            _string.formatter_parser(1)

    def test_formatter_field_name_split(self):
        def split(name):
            # Turn the (first, iterator) result into plain lists so it
            # can be compared with assertEqual().
            first, rest = _string.formatter_field_name_split(name)
            return [first, list(rest)]

        # (field name, expected [first, [(is_attribute, name), ...]])
        expectations = [
            ("obj", ["obj", []]),
            ("obj.arg", ["obj", [(True, 'arg')]]),
            ("obj[key]", ["obj", [(False, 'key')]]),
            ("obj.arg[key1][key2]", [
                "obj",
                [(True, 'arg'),
                 (False, 'key1'),
                 (False, 'key2'),
                ]]),
        ]
        for field_name, expected in expectations:
            self.assertEqual(split(field_name), expected)

        # A non-str argument is rejected.
        with self.assertRaises(TypeError):
            _string.formatter_field_name_split(1)
3062
3063
# Run the tests of this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
3066