• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1""" Test script for the Unicode implementation.
2
3Written by Marc-Andre Lemburg (mal@lemburg.com).
4
5(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6
7"""
8import _string
9import codecs
10import itertools
11import operator
12import struct
13import sys
14import textwrap
15import unicodedata
16import unittest
17import warnings
18from test import support, string_tests
19from test.support.script_helper import assert_python_failure
20
21# Error handling (bad decoder return)
def search_function(encoding):
    """Codec search hook serving two deliberately broken test codecs.

    "test.unicode1" yields an encoder/decoder pair that returns a bare
    integer instead of a tuple; "test.unicode2" yields a pair that returns
    a tuple whose first item is not a string.  Any other name yields None.
    """
    def encode1(input, errors="strict"):
        return 42 # not a tuple
    def decode1(input, errors="strict"):
        return 42 # not a tuple
    def encode2(input, errors="strict"):
        return (42, 42) # no unicode
    def decode2(input, errors="strict"):
        return (42, 42) # no unicode

    if encoding == "test.unicode1":
        codec = (encode1, decode1, None, None)
    elif encoding == "test.unicode2":
        codec = (encode2, decode2, None, None)
    else:
        codec = None
    return codec
codecs.register(search_function)
38
def duplicate_string(text):
    """
    Return a copy of *text* produced by an encode/decode round trip.

    The aim is a new object with a reference count of 1.  This is only
    a best effort: latin1 single letters and the empty string ('') are
    singletons and cannot be cloned.
    """
    encoded = text.encode()
    return encoded.decode()
48
class StrSubclass(str):
    """Trivial str subclass that adds no behaviour of its own."""
51
52class UnicodeTest(string_tests.CommonTest,
53        string_tests.MixinStrUnicodeUserStringTest,
54        string_tests.MixinStrUnicodeTest,
55        unittest.TestCase):
56
57    type2test = str
58
59    def checkequalnofix(self, result, object, methodname, *args):
60        method = getattr(object, methodname)
61        realresult = method(*args)
62        self.assertEqual(realresult, result)
63        self.assertTrue(type(realresult) is type(result))
64
65        # if the original is returned make sure that
66        # this doesn't happen with subclasses
67        if realresult is object:
68            class usub(str):
69                def __repr__(self):
70                    return 'usub(%r)' % str.__repr__(self)
71            object = usub(object)
72            method = getattr(object, methodname)
73            realresult = method(*args)
74            self.assertEqual(realresult, result)
75            self.assertTrue(object is not realresult)
76
77    def test_literals(self):
78        self.assertEqual('\xff', '\u00ff')
79        self.assertEqual('\uffff', '\U0000ffff')
80        self.assertRaises(SyntaxError, eval, '\'\\Ufffffffe\'')
81        self.assertRaises(SyntaxError, eval, '\'\\Uffffffff\'')
82        self.assertRaises(SyntaxError, eval, '\'\\U%08x\'' % 0x110000)
83        # raw strings should not have unicode escapes
84        self.assertNotEqual(r"\u0020", " ")
85
86    def test_ascii(self):
87        if not sys.platform.startswith('java'):
88            # Test basic sanity of repr()
89            self.assertEqual(ascii('abc'), "'abc'")
90            self.assertEqual(ascii('ab\\c'), "'ab\\\\c'")
91            self.assertEqual(ascii('ab\\'), "'ab\\\\'")
92            self.assertEqual(ascii('\\c'), "'\\\\c'")
93            self.assertEqual(ascii('\\'), "'\\\\'")
94            self.assertEqual(ascii('\n'), "'\\n'")
95            self.assertEqual(ascii('\r'), "'\\r'")
96            self.assertEqual(ascii('\t'), "'\\t'")
97            self.assertEqual(ascii('\b'), "'\\x08'")
98            self.assertEqual(ascii("'\""), """'\\'"'""")
99            self.assertEqual(ascii("'\""), """'\\'"'""")
100            self.assertEqual(ascii("'"), '''"'"''')
101            self.assertEqual(ascii('"'), """'"'""")
102            latin1repr = (
103                "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
104                "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
105                "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
106                "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
107                "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
108                "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
109                "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
110                "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
111                "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
112                "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
113                "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
114                "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
115                "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
116                "\\xfe\\xff'")
117            testrepr = ascii(''.join(map(chr, range(256))))
118            self.assertEqual(testrepr, latin1repr)
119            # Test ascii works on wide unicode escapes without overflow.
120            self.assertEqual(ascii("\U00010000" * 39 + "\uffff" * 4096),
121                             ascii("\U00010000" * 39 + "\uffff" * 4096))
122
123            class WrongRepr:
124                def __repr__(self):
125                    return b'byte-repr'
126            self.assertRaises(TypeError, ascii, WrongRepr())
127
128    def test_repr(self):
129        if not sys.platform.startswith('java'):
130            # Test basic sanity of repr()
131            self.assertEqual(repr('abc'), "'abc'")
132            self.assertEqual(repr('ab\\c'), "'ab\\\\c'")
133            self.assertEqual(repr('ab\\'), "'ab\\\\'")
134            self.assertEqual(repr('\\c'), "'\\\\c'")
135            self.assertEqual(repr('\\'), "'\\\\'")
136            self.assertEqual(repr('\n'), "'\\n'")
137            self.assertEqual(repr('\r'), "'\\r'")
138            self.assertEqual(repr('\t'), "'\\t'")
139            self.assertEqual(repr('\b'), "'\\x08'")
140            self.assertEqual(repr("'\""), """'\\'"'""")
141            self.assertEqual(repr("'\""), """'\\'"'""")
142            self.assertEqual(repr("'"), '''"'"''')
143            self.assertEqual(repr('"'), """'"'""")
144            latin1repr = (
145                "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
146                "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
147                "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
148                "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
149                "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
150                "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
151                "\\x9c\\x9d\\x9e\\x9f\\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9"
152                "\xaa\xab\xac\\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7"
153                "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5"
154                "\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3"
155                "\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1"
156                "\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
157                "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd"
158                "\xfe\xff'")
159            testrepr = repr(''.join(map(chr, range(256))))
160            self.assertEqual(testrepr, latin1repr)
161            # Test repr works on wide unicode escapes without overflow.
162            self.assertEqual(repr("\U00010000" * 39 + "\uffff" * 4096),
163                             repr("\U00010000" * 39 + "\uffff" * 4096))
164
165            class WrongRepr:
166                def __repr__(self):
167                    return b'byte-repr'
168            self.assertRaises(TypeError, repr, WrongRepr())
169
170    def test_iterators(self):
171        # Make sure unicode objects have an __iter__ method
172        it = "\u1111\u2222\u3333".__iter__()
173        self.assertEqual(next(it), "\u1111")
174        self.assertEqual(next(it), "\u2222")
175        self.assertEqual(next(it), "\u3333")
176        self.assertRaises(StopIteration, next, it)
177
178    def test_count(self):
179        string_tests.CommonTest.test_count(self)
180        # check mixed argument types
181        self.checkequalnofix(3,  'aaa', 'count', 'a')
182        self.checkequalnofix(0,  'aaa', 'count', 'b')
183        self.checkequalnofix(3, 'aaa', 'count',  'a')
184        self.checkequalnofix(0, 'aaa', 'count',  'b')
185        self.checkequalnofix(0, 'aaa', 'count',  'b')
186        self.checkequalnofix(1, 'aaa', 'count',  'a', -1)
187        self.checkequalnofix(3, 'aaa', 'count',  'a', -10)
188        self.checkequalnofix(2, 'aaa', 'count',  'a', 0, -1)
189        self.checkequalnofix(0, 'aaa', 'count',  'a', 0, -10)
190        # test mixed kinds
191        self.checkequal(10, '\u0102' + 'a' * 10, 'count', 'a')
192        self.checkequal(10, '\U00100304' + 'a' * 10, 'count', 'a')
193        self.checkequal(10, '\U00100304' + '\u0102' * 10, 'count', '\u0102')
194        self.checkequal(0, 'a' * 10, 'count', '\u0102')
195        self.checkequal(0, 'a' * 10, 'count', '\U00100304')
196        self.checkequal(0, '\u0102' * 10, 'count', '\U00100304')
197        self.checkequal(10, '\u0102' + 'a_' * 10, 'count', 'a_')
198        self.checkequal(10, '\U00100304' + 'a_' * 10, 'count', 'a_')
199        self.checkequal(10, '\U00100304' + '\u0102_' * 10, 'count', '\u0102_')
200        self.checkequal(0, 'a' * 10, 'count', 'a\u0102')
201        self.checkequal(0, 'a' * 10, 'count', 'a\U00100304')
202        self.checkequal(0, '\u0102' * 10, 'count', '\u0102\U00100304')
203
204    def test_find(self):
205        string_tests.CommonTest.test_find(self)
206        # test implementation details of the memchr fast path
207        self.checkequal(100, 'a' * 100 + '\u0102', 'find', '\u0102')
208        self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0201')
209        self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0120')
210        self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0220')
211        self.checkequal(100, 'a' * 100 + '\U00100304', 'find', '\U00100304')
212        self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00100204')
213        self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00102004')
214        # check mixed argument types
215        self.checkequalnofix(0,  'abcdefghiabc', 'find', 'abc')
216        self.checkequalnofix(9,  'abcdefghiabc', 'find', 'abc', 1)
217        self.checkequalnofix(-1, 'abcdefghiabc', 'find', 'def', 4)
218
219        self.assertRaises(TypeError, 'hello'.find)
220        self.assertRaises(TypeError, 'hello'.find, 42)
221        # test mixed kinds
222        self.checkequal(100, '\u0102' * 100 + 'a', 'find', 'a')
223        self.checkequal(100, '\U00100304' * 100 + 'a', 'find', 'a')
224        self.checkequal(100, '\U00100304' * 100 + '\u0102', 'find', '\u0102')
225        self.checkequal(-1, 'a' * 100, 'find', '\u0102')
226        self.checkequal(-1, 'a' * 100, 'find', '\U00100304')
227        self.checkequal(-1, '\u0102' * 100, 'find', '\U00100304')
228        self.checkequal(100, '\u0102' * 100 + 'a_', 'find', 'a_')
229        self.checkequal(100, '\U00100304' * 100 + 'a_', 'find', 'a_')
230        self.checkequal(100, '\U00100304' * 100 + '\u0102_', 'find', '\u0102_')
231        self.checkequal(-1, 'a' * 100, 'find', 'a\u0102')
232        self.checkequal(-1, 'a' * 100, 'find', 'a\U00100304')
233        self.checkequal(-1, '\u0102' * 100, 'find', '\u0102\U00100304')
234
235    def test_rfind(self):
236        string_tests.CommonTest.test_rfind(self)
237        # test implementation details of the memrchr fast path
238        self.checkequal(0, '\u0102' + 'a' * 100 , 'rfind', '\u0102')
239        self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0201')
240        self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0120')
241        self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0220')
242        self.checkequal(0, '\U00100304' + 'a' * 100, 'rfind', '\U00100304')
243        self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00100204')
244        self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00102004')
245        # check mixed argument types
246        self.checkequalnofix(9,   'abcdefghiabc', 'rfind', 'abc')
247        self.checkequalnofix(12,  'abcdefghiabc', 'rfind', '')
248        self.checkequalnofix(12, 'abcdefghiabc', 'rfind',  '')
249        # test mixed kinds
250        self.checkequal(0, 'a' + '\u0102' * 100, 'rfind', 'a')
251        self.checkequal(0, 'a' + '\U00100304' * 100, 'rfind', 'a')
252        self.checkequal(0, '\u0102' + '\U00100304' * 100, 'rfind', '\u0102')
253        self.checkequal(-1, 'a' * 100, 'rfind', '\u0102')
254        self.checkequal(-1, 'a' * 100, 'rfind', '\U00100304')
255        self.checkequal(-1, '\u0102' * 100, 'rfind', '\U00100304')
256        self.checkequal(0, '_a' + '\u0102' * 100, 'rfind', '_a')
257        self.checkequal(0, '_a' + '\U00100304' * 100, 'rfind', '_a')
258        self.checkequal(0, '_\u0102' + '\U00100304' * 100, 'rfind', '_\u0102')
259        self.checkequal(-1, 'a' * 100, 'rfind', '\u0102a')
260        self.checkequal(-1, 'a' * 100, 'rfind', '\U00100304a')
261        self.checkequal(-1, '\u0102' * 100, 'rfind', '\U00100304\u0102')
262
263    def test_index(self):
264        string_tests.CommonTest.test_index(self)
265        self.checkequalnofix(0, 'abcdefghiabc', 'index',  '')
266        self.checkequalnofix(3, 'abcdefghiabc', 'index',  'def')
267        self.checkequalnofix(0, 'abcdefghiabc', 'index',  'abc')
268        self.checkequalnofix(9, 'abcdefghiabc', 'index',  'abc', 1)
269        self.assertRaises(ValueError, 'abcdefghiabc'.index, 'hib')
270        self.assertRaises(ValueError, 'abcdefghiab'.index,  'abc', 1)
271        self.assertRaises(ValueError, 'abcdefghi'.index,  'ghi', 8)
272        self.assertRaises(ValueError, 'abcdefghi'.index,  'ghi', -1)
273        # test mixed kinds
274        self.checkequal(100, '\u0102' * 100 + 'a', 'index', 'a')
275        self.checkequal(100, '\U00100304' * 100 + 'a', 'index', 'a')
276        self.checkequal(100, '\U00100304' * 100 + '\u0102', 'index', '\u0102')
277        self.assertRaises(ValueError, ('a' * 100).index, '\u0102')
278        self.assertRaises(ValueError, ('a' * 100).index, '\U00100304')
279        self.assertRaises(ValueError, ('\u0102' * 100).index, '\U00100304')
280        self.checkequal(100, '\u0102' * 100 + 'a_', 'index', 'a_')
281        self.checkequal(100, '\U00100304' * 100 + 'a_', 'index', 'a_')
282        self.checkequal(100, '\U00100304' * 100 + '\u0102_', 'index', '\u0102_')
283        self.assertRaises(ValueError, ('a' * 100).index, 'a\u0102')
284        self.assertRaises(ValueError, ('a' * 100).index, 'a\U00100304')
285        self.assertRaises(ValueError, ('\u0102' * 100).index, '\u0102\U00100304')
286
287    def test_rindex(self):
288        string_tests.CommonTest.test_rindex(self)
289        self.checkequalnofix(12, 'abcdefghiabc', 'rindex',  '')
290        self.checkequalnofix(3,  'abcdefghiabc', 'rindex',  'def')
291        self.checkequalnofix(9,  'abcdefghiabc', 'rindex',  'abc')
292        self.checkequalnofix(0,  'abcdefghiabc', 'rindex',  'abc', 0, -1)
293
294        self.assertRaises(ValueError, 'abcdefghiabc'.rindex,  'hib')
295        self.assertRaises(ValueError, 'defghiabc'.rindex,  'def', 1)
296        self.assertRaises(ValueError, 'defghiabc'.rindex,  'abc', 0, -1)
297        self.assertRaises(ValueError, 'abcdefghi'.rindex,  'ghi', 0, 8)
298        self.assertRaises(ValueError, 'abcdefghi'.rindex,  'ghi', 0, -1)
299        # test mixed kinds
300        self.checkequal(0, 'a' + '\u0102' * 100, 'rindex', 'a')
301        self.checkequal(0, 'a' + '\U00100304' * 100, 'rindex', 'a')
302        self.checkequal(0, '\u0102' + '\U00100304' * 100, 'rindex', '\u0102')
303        self.assertRaises(ValueError, ('a' * 100).rindex, '\u0102')
304        self.assertRaises(ValueError, ('a' * 100).rindex, '\U00100304')
305        self.assertRaises(ValueError, ('\u0102' * 100).rindex, '\U00100304')
306        self.checkequal(0, '_a' + '\u0102' * 100, 'rindex', '_a')
307        self.checkequal(0, '_a' + '\U00100304' * 100, 'rindex', '_a')
308        self.checkequal(0, '_\u0102' + '\U00100304' * 100, 'rindex', '_\u0102')
309        self.assertRaises(ValueError, ('a' * 100).rindex, '\u0102a')
310        self.assertRaises(ValueError, ('a' * 100).rindex, '\U00100304a')
311        self.assertRaises(ValueError, ('\u0102' * 100).rindex, '\U00100304\u0102')
312
313    def test_maketrans_translate(self):
314        # these work with plain translate()
315        self.checkequalnofix('bbbc', 'abababc', 'translate',
316                             {ord('a'): None})
317        self.checkequalnofix('iiic', 'abababc', 'translate',
318                             {ord('a'): None, ord('b'): ord('i')})
319        self.checkequalnofix('iiix', 'abababc', 'translate',
320                             {ord('a'): None, ord('b'): ord('i'), ord('c'): 'x'})
321        self.checkequalnofix('c', 'abababc', 'translate',
322                             {ord('a'): None, ord('b'): ''})
323        self.checkequalnofix('xyyx', 'xzx', 'translate',
324                             {ord('z'): 'yy'})
325
326        # this needs maketrans()
327        self.checkequalnofix('abababc', 'abababc', 'translate',
328                             {'b': '<i>'})
329        tbl = self.type2test.maketrans({'a': None, 'b': '<i>'})
330        self.checkequalnofix('<i><i><i>c', 'abababc', 'translate', tbl)
331        # test alternative way of calling maketrans()
332        tbl = self.type2test.maketrans('abc', 'xyz', 'd')
333        self.checkequalnofix('xyzzy', 'abdcdcbdddd', 'translate', tbl)
334
335        # various tests switching from ASCII to latin1 or the opposite;
336        # same length, remove a letter, or replace with a longer string.
337        self.assertEqual("[a]".translate(str.maketrans('a', 'X')),
338                         "[X]")
339        self.assertEqual("[a]".translate(str.maketrans({'a': 'X'})),
340                         "[X]")
341        self.assertEqual("[a]".translate(str.maketrans({'a': None})),
342                         "[]")
343        self.assertEqual("[a]".translate(str.maketrans({'a': 'XXX'})),
344                         "[XXX]")
345        self.assertEqual("[a]".translate(str.maketrans({'a': '\xe9'})),
346                         "[\xe9]")
347        self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '123'})),
348                         "x123")
349        self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '\xe9'})),
350                         "x\xe9")
351
352        # test non-ASCII (don't take the fast-path)
353        self.assertEqual("[a]".translate(str.maketrans({'a': '<\xe9>'})),
354                         "[<\xe9>]")
355        self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': 'a'})),
356                         "[a]")
357        self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': None})),
358                         "[]")
359        self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': '123'})),
360                         "[123]")
361        self.assertEqual("[a\xe9]".translate(str.maketrans({'a': '<\u20ac>'})),
362                         "[<\u20ac>\xe9]")
363
364        # invalid Unicode characters
365        invalid_char = 0x10ffff+1
366        for before in "a\xe9\u20ac\U0010ffff":
367            mapping = str.maketrans({before: invalid_char})
368            text = "[%s]" % before
369            self.assertRaises(ValueError, text.translate, mapping)
370
371        # errors
372        self.assertRaises(TypeError, self.type2test.maketrans)
373        self.assertRaises(ValueError, self.type2test.maketrans, 'abc', 'defg')
374        self.assertRaises(TypeError, self.type2test.maketrans, 2, 'def')
375        self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 2)
376        self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 'def', 2)
377        self.assertRaises(ValueError, self.type2test.maketrans, {'xy': 2})
378        self.assertRaises(TypeError, self.type2test.maketrans, {(1,): 2})
379
380        self.assertRaises(TypeError, 'hello'.translate)
381        self.assertRaises(TypeError, 'abababc'.translate, 'abc', 'xyz')
382
383    def test_split(self):
384        string_tests.CommonTest.test_split(self)
385
386        # test mixed kinds
387        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
388            left *= 9
389            right *= 9
390            for delim in ('c', '\u0102', '\U00010302'):
391                self.checkequal([left + right],
392                                left + right, 'split', delim)
393                self.checkequal([left, right],
394                                left + delim + right, 'split', delim)
395                self.checkequal([left + right],
396                                left + right, 'split', delim * 2)
397                self.checkequal([left, right],
398                                left + delim * 2 + right, 'split', delim *2)
399
400    def test_rsplit(self):
401        string_tests.CommonTest.test_rsplit(self)
402        # test mixed kinds
403        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
404            left *= 9
405            right *= 9
406            for delim in ('c', '\u0102', '\U00010302'):
407                self.checkequal([left + right],
408                                left + right, 'rsplit', delim)
409                self.checkequal([left, right],
410                                left + delim + right, 'rsplit', delim)
411                self.checkequal([left + right],
412                                left + right, 'rsplit', delim * 2)
413                self.checkequal([left, right],
414                                left + delim * 2 + right, 'rsplit', delim *2)
415
416    def test_partition(self):
417        string_tests.MixinStrUnicodeUserStringTest.test_partition(self)
418        # test mixed kinds
419        self.checkequal(('ABCDEFGH', '', ''), 'ABCDEFGH', 'partition', '\u4200')
420        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
421            left *= 9
422            right *= 9
423            for delim in ('c', '\u0102', '\U00010302'):
424                self.checkequal((left + right, '', ''),
425                                left + right, 'partition', delim)
426                self.checkequal((left, delim, right),
427                                left + delim + right, 'partition', delim)
428                self.checkequal((left + right, '', ''),
429                                left + right, 'partition', delim * 2)
430                self.checkequal((left, delim * 2, right),
431                                left + delim * 2 + right, 'partition', delim * 2)
432
433    def test_rpartition(self):
434        string_tests.MixinStrUnicodeUserStringTest.test_rpartition(self)
435        # test mixed kinds
436        self.checkequal(('', '', 'ABCDEFGH'), 'ABCDEFGH', 'rpartition', '\u4200')
437        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
438            left *= 9
439            right *= 9
440            for delim in ('c', '\u0102', '\U00010302'):
441                self.checkequal(('', '', left + right),
442                                left + right, 'rpartition', delim)
443                self.checkequal((left, delim, right),
444                                left + delim + right, 'rpartition', delim)
445                self.checkequal(('', '', left + right),
446                                left + right, 'rpartition', delim * 2)
447                self.checkequal((left, delim * 2, right),
448                                left + delim * 2 + right, 'rpartition', delim * 2)
449
450    def test_join(self):
451        string_tests.MixinStrUnicodeUserStringTest.test_join(self)
452
453        class MyWrapper:
454            def __init__(self, sval): self.sval = sval
455            def __str__(self): return self.sval
456
457        # mixed arguments
458        self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
459        self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
460        self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
461        self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
462        self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
463        self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
464        self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
465        self.checkraises(TypeError, ' ', 'join', ['1', '2', MyWrapper('foo')])
466        self.checkraises(TypeError, ' ', 'join', ['1', '2', '3', bytes()])
467        self.checkraises(TypeError, ' ', 'join', [1, 2, 3])
468        self.checkraises(TypeError, ' ', 'join', ['1', '2', 3])
469
470    @unittest.skipIf(sys.maxsize > 2**32,
471        'needs too much memory on a 64-bit platform')
472    def test_join_overflow(self):
473        size = int(sys.maxsize**0.5) + 1
474        seq = ('A' * size,) * size
475        self.assertRaises(OverflowError, ''.join, seq)
476
477    def test_replace(self):
478        string_tests.CommonTest.test_replace(self)
479
480        # method call forwarded from str implementation because of unicode argument
481        self.checkequalnofix('one@two!three!', 'one!two!three!', 'replace', '!', '@', 1)
482        self.assertRaises(TypeError, 'replace'.replace, "r", 42)
483        # test mixed kinds
484        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
485            left *= 9
486            right *= 9
487            for delim in ('c', '\u0102', '\U00010302'):
488                for repl in ('d', '\u0103', '\U00010303'):
489                    self.checkequal(left + right,
490                                    left + right, 'replace', delim, repl)
491                    self.checkequal(left + repl + right,
492                                    left + delim + right,
493                                    'replace', delim, repl)
494                    self.checkequal(left + right,
495                                    left + right, 'replace', delim * 2, repl)
496                    self.checkequal(left + repl + right,
497                                    left + delim * 2 + right,
498                                    'replace', delim * 2, repl)
499
500    @support.cpython_only
501    def test_replace_id(self):
502        pattern = 'abc'
503        text = 'abc def'
504        self.assertIs(text.replace(pattern, pattern), text)
505
506    def test_bytes_comparison(self):
507        with support.check_warnings():
508            warnings.simplefilter('ignore', BytesWarning)
509            self.assertEqual('abc' == b'abc', False)
510            self.assertEqual('abc' != b'abc', True)
511            self.assertEqual('abc' == bytearray(b'abc'), False)
512            self.assertEqual('abc' != bytearray(b'abc'), True)
513
514    def test_comparison(self):
515        # Comparisons:
516        self.assertEqual('abc', 'abc')
517        self.assertTrue('abcd' > 'abc')
518        self.assertTrue('abc' < 'abcd')
519
520        if 0:
521            # Move these tests to a Unicode collation module test...
522            # Testing UTF-16 code point order comparisons...
523
524            # No surrogates, no fixup required.
525            self.assertTrue('\u0061' < '\u20ac')
526            # Non surrogate below surrogate value, no fixup required
527            self.assertTrue('\u0061' < '\ud800\udc02')
528
529            # Non surrogate above surrogate value, fixup required
530            def test_lecmp(s, s2):
531                self.assertTrue(s < s2)
532
533            def test_fixup(s):
534                s2 = '\ud800\udc01'
535                test_lecmp(s, s2)
536                s2 = '\ud900\udc01'
537                test_lecmp(s, s2)
538                s2 = '\uda00\udc01'
539                test_lecmp(s, s2)
540                s2 = '\udb00\udc01'
541                test_lecmp(s, s2)
542                s2 = '\ud800\udd01'
543                test_lecmp(s, s2)
544                s2 = '\ud900\udd01'
545                test_lecmp(s, s2)
546                s2 = '\uda00\udd01'
547                test_lecmp(s, s2)
548                s2 = '\udb00\udd01'
549                test_lecmp(s, s2)
550                s2 = '\ud800\ude01'
551                test_lecmp(s, s2)
552                s2 = '\ud900\ude01'
553                test_lecmp(s, s2)
554                s2 = '\uda00\ude01'
555                test_lecmp(s, s2)
556                s2 = '\udb00\ude01'
557                test_lecmp(s, s2)
558                s2 = '\ud800\udfff'
559                test_lecmp(s, s2)
560                s2 = '\ud900\udfff'
561                test_lecmp(s, s2)
562                s2 = '\uda00\udfff'
563                test_lecmp(s, s2)
564                s2 = '\udb00\udfff'
565                test_lecmp(s, s2)
566
567                test_fixup('\ue000')
568                test_fixup('\uff61')
569
570        # Surrogates on both sides, no fixup required
571        self.assertTrue('\ud800\udc02' < '\ud84d\udc56')
572
573    def test_islower(self):
574        super().test_islower()
575        self.checkequalnofix(False, '\u1FFc', 'islower')
576        self.assertFalse('\u2167'.islower())
577        self.assertTrue('\u2177'.islower())
578        # non-BMP, uppercase
579        self.assertFalse('\U00010401'.islower())
580        self.assertFalse('\U00010427'.islower())
581        # non-BMP, lowercase
582        self.assertTrue('\U00010429'.islower())
583        self.assertTrue('\U0001044E'.islower())
584        # non-BMP, non-cased
585        self.assertFalse('\U0001F40D'.islower())
586        self.assertFalse('\U0001F46F'.islower())
587
588    def test_isupper(self):
589        super().test_isupper()
590        if not sys.platform.startswith('java'):
591            self.checkequalnofix(False, '\u1FFc', 'isupper')
592        self.assertTrue('\u2167'.isupper())
593        self.assertFalse('\u2177'.isupper())
594        # non-BMP, uppercase
595        self.assertTrue('\U00010401'.isupper())
596        self.assertTrue('\U00010427'.isupper())
597        # non-BMP, lowercase
598        self.assertFalse('\U00010429'.isupper())
599        self.assertFalse('\U0001044E'.isupper())
600        # non-BMP, non-cased
601        self.assertFalse('\U0001F40D'.isupper())
602        self.assertFalse('\U0001F46F'.isupper())
603
604    def test_istitle(self):
605        super().test_istitle()
606        self.checkequalnofix(True, '\u1FFc', 'istitle')
607        self.checkequalnofix(True, 'Greek \u1FFcitlecases ...', 'istitle')
608
609        # non-BMP, uppercase + lowercase
610        self.assertTrue('\U00010401\U00010429'.istitle())
611        self.assertTrue('\U00010427\U0001044E'.istitle())
612        # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6
613        for ch in ['\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F']:
614            self.assertFalse(ch.istitle(), '{!a} is not title'.format(ch))
615
616    def test_isspace(self):
617        super().test_isspace()
618        self.checkequalnofix(True, '\u2000', 'isspace')
619        self.checkequalnofix(True, '\u200a', 'isspace')
620        self.checkequalnofix(False, '\u2014', 'isspace')
621        # There are no non-BMP whitespace chars as of Unicode 12.
622        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
623                   '\U0001F40D', '\U0001F46F']:
624            self.assertFalse(ch.isspace(), '{!a} is not space.'.format(ch))
625
626    @support.requires_resource('cpu')
627    def test_isspace_invariant(self):
628        for codepoint in range(sys.maxunicode + 1):
629            char = chr(codepoint)
630            bidirectional = unicodedata.bidirectional(char)
631            category = unicodedata.category(char)
632            self.assertEqual(char.isspace(),
633                             (bidirectional in ('WS', 'B', 'S')
634                              or category == 'Zs'))
635
636    def test_isalnum(self):
637        super().test_isalnum()
638        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
639                   '\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']:
640            self.assertTrue(ch.isalnum(), '{!a} is alnum.'.format(ch))
641
642    def test_isalpha(self):
643        super().test_isalpha()
644        self.checkequalnofix(True, '\u1FFc', 'isalpha')
645        # non-BMP, cased
646        self.assertTrue('\U00010401'.isalpha())
647        self.assertTrue('\U00010427'.isalpha())
648        self.assertTrue('\U00010429'.isalpha())
649        self.assertTrue('\U0001044E'.isalpha())
650        # non-BMP, non-cased
651        self.assertFalse('\U0001F40D'.isalpha())
652        self.assertFalse('\U0001F46F'.isalpha())
653
654    def test_isascii(self):
655        super().test_isascii()
656        self.assertFalse("\u20ac".isascii())
657        self.assertFalse("\U0010ffff".isascii())
658
659    def test_isdecimal(self):
660        self.checkequalnofix(False, '', 'isdecimal')
661        self.checkequalnofix(False, 'a', 'isdecimal')
662        self.checkequalnofix(True, '0', 'isdecimal')
663        self.checkequalnofix(False, '\u2460', 'isdecimal') # CIRCLED DIGIT ONE
664        self.checkequalnofix(False, '\xbc', 'isdecimal') # VULGAR FRACTION ONE QUARTER
665        self.checkequalnofix(True, '\u0660', 'isdecimal') # ARABIC-INDIC DIGIT ZERO
666        self.checkequalnofix(True, '0123456789', 'isdecimal')
667        self.checkequalnofix(False, '0123456789a', 'isdecimal')
668
669        self.checkraises(TypeError, 'abc', 'isdecimal', 42)
670
671        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
672                   '\U0001F40D', '\U0001F46F', '\U00011065', '\U0001F107']:
673            self.assertFalse(ch.isdecimal(), '{!a} is not decimal.'.format(ch))
674        for ch in ['\U0001D7F6', '\U00011066', '\U000104A0']:
675            self.assertTrue(ch.isdecimal(), '{!a} is decimal.'.format(ch))
676
677    def test_isdigit(self):
678        super().test_isdigit()
679        self.checkequalnofix(True, '\u2460', 'isdigit')
680        self.checkequalnofix(False, '\xbc', 'isdigit')
681        self.checkequalnofix(True, '\u0660', 'isdigit')
682
683        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
684                   '\U0001F40D', '\U0001F46F', '\U00011065']:
685            self.assertFalse(ch.isdigit(), '{!a} is not a digit.'.format(ch))
686        for ch in ['\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']:
687            self.assertTrue(ch.isdigit(), '{!a} is a digit.'.format(ch))
688
689    def test_isnumeric(self):
690        self.checkequalnofix(False, '', 'isnumeric')
691        self.checkequalnofix(False, 'a', 'isnumeric')
692        self.checkequalnofix(True, '0', 'isnumeric')
693        self.checkequalnofix(True, '\u2460', 'isnumeric')
694        self.checkequalnofix(True, '\xbc', 'isnumeric')
695        self.checkequalnofix(True, '\u0660', 'isnumeric')
696        self.checkequalnofix(True, '0123456789', 'isnumeric')
697        self.checkequalnofix(False, '0123456789a', 'isnumeric')
698
699        self.assertRaises(TypeError, "abc".isnumeric, 42)
700
701        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
702                   '\U0001F40D', '\U0001F46F']:
703            self.assertFalse(ch.isnumeric(), '{!a} is not numeric.'.format(ch))
704        for ch in ['\U00011065', '\U0001D7F6', '\U00011066',
705                   '\U000104A0', '\U0001F107']:
706            self.assertTrue(ch.isnumeric(), '{!a} is numeric.'.format(ch))
707
708    def test_isidentifier(self):
709        self.assertTrue("a".isidentifier())
710        self.assertTrue("Z".isidentifier())
711        self.assertTrue("_".isidentifier())
712        self.assertTrue("b0".isidentifier())
713        self.assertTrue("bc".isidentifier())
714        self.assertTrue("b_".isidentifier())
715        self.assertTrue("µ".isidentifier())
716        self.assertTrue("��������������".isidentifier())
717
718        self.assertFalse(" ".isidentifier())
719        self.assertFalse("[".isidentifier())
720        self.assertFalse("©".isidentifier())
721        self.assertFalse("0".isidentifier())
722
723    @support.cpython_only
724    def test_isidentifier_legacy(self):
725        import _testcapi
726        u = '��������������'
727        self.assertTrue(u.isidentifier())
728        self.assertTrue(_testcapi.unicode_legacy_string(u).isidentifier())
729
730    def test_isprintable(self):
731        self.assertTrue("".isprintable())
732        self.assertTrue(" ".isprintable())
733        self.assertTrue("abcdefg".isprintable())
734        self.assertFalse("abcdefg\n".isprintable())
735        # some defined Unicode character
736        self.assertTrue("\u0374".isprintable())
737        # undefined character
738        self.assertFalse("\u0378".isprintable())
739        # single surrogate character
740        self.assertFalse("\ud800".isprintable())
741
742        self.assertTrue('\U0001F46F'.isprintable())
743        self.assertFalse('\U000E0020'.isprintable())
744
745    def test_surrogates(self):
746        for s in ('a\uD800b\uDFFF', 'a\uDFFFb\uD800',
747                  'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
748            self.assertTrue(s.islower())
749            self.assertFalse(s.isupper())
750            self.assertFalse(s.istitle())
751        for s in ('A\uD800B\uDFFF', 'A\uDFFFB\uD800',
752                  'A\uD800B\uDFFFA', 'A\uDFFFB\uD800A'):
753            self.assertFalse(s.islower())
754            self.assertTrue(s.isupper())
755            self.assertTrue(s.istitle())
756
757        for meth_name in ('islower', 'isupper', 'istitle'):
758            meth = getattr(str, meth_name)
759            for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF'):
760                self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
761
762        for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace',
763                          'isdecimal', 'isnumeric',
764                          'isidentifier', 'isprintable'):
765            meth = getattr(str, meth_name)
766            for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF',
767                      'a\uD800b\uDFFF', 'a\uDFFFb\uD800',
768                      'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
769                self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
770
771
772    def test_lower(self):
773        string_tests.CommonTest.test_lower(self)
774        self.assertEqual('\U00010427'.lower(), '\U0001044F')
775        self.assertEqual('\U00010427\U00010427'.lower(),
776                         '\U0001044F\U0001044F')
777        self.assertEqual('\U00010427\U0001044F'.lower(),
778                         '\U0001044F\U0001044F')
779        self.assertEqual('X\U00010427x\U0001044F'.lower(),
780                         'x\U0001044Fx\U0001044F')
781        self.assertEqual('fi'.lower(), 'fi')
782        self.assertEqual('\u0130'.lower(), '\u0069\u0307')
783        # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
784        self.assertEqual('\u03a3'.lower(), '\u03c3')
785        self.assertEqual('\u0345\u03a3'.lower(), '\u0345\u03c3')
786        self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
787        self.assertEqual('A\u0345\u03a3a'.lower(), 'a\u0345\u03c3a')
788        self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
789        self.assertEqual('A\u03a3\u0345'.lower(), 'a\u03c2\u0345')
790        self.assertEqual('\u03a3\u0345 '.lower(), '\u03c3\u0345 ')
791        self.assertEqual('\U0008fffe'.lower(), '\U0008fffe')
792        self.assertEqual('\u2177'.lower(), '\u2177')
793
794    def test_casefold(self):
795        self.assertEqual('hello'.casefold(), 'hello')
796        self.assertEqual('hELlo'.casefold(), 'hello')
797        self.assertEqual('ß'.casefold(), 'ss')
798        self.assertEqual('fi'.casefold(), 'fi')
799        self.assertEqual('\u03a3'.casefold(), '\u03c3')
800        self.assertEqual('A\u0345\u03a3'.casefold(), 'a\u03b9\u03c3')
801        self.assertEqual('\u00b5'.casefold(), '\u03bc')
802
803    def test_upper(self):
804        string_tests.CommonTest.test_upper(self)
805        self.assertEqual('\U0001044F'.upper(), '\U00010427')
806        self.assertEqual('\U0001044F\U0001044F'.upper(),
807                         '\U00010427\U00010427')
808        self.assertEqual('\U00010427\U0001044F'.upper(),
809                         '\U00010427\U00010427')
810        self.assertEqual('X\U00010427x\U0001044F'.upper(),
811                         'X\U00010427X\U00010427')
812        self.assertEqual('fi'.upper(), 'FI')
813        self.assertEqual('\u0130'.upper(), '\u0130')
814        self.assertEqual('\u03a3'.upper(), '\u03a3')
815        self.assertEqual('ß'.upper(), 'SS')
816        self.assertEqual('\u1fd2'.upper(), '\u0399\u0308\u0300')
817        self.assertEqual('\U0008fffe'.upper(), '\U0008fffe')
818        self.assertEqual('\u2177'.upper(), '\u2167')
819
820    def test_capitalize(self):
821        string_tests.CommonTest.test_capitalize(self)
822        self.assertEqual('\U0001044F'.capitalize(), '\U00010427')
823        self.assertEqual('\U0001044F\U0001044F'.capitalize(),
824                         '\U00010427\U0001044F')
825        self.assertEqual('\U00010427\U0001044F'.capitalize(),
826                         '\U00010427\U0001044F')
827        self.assertEqual('\U0001044F\U00010427'.capitalize(),
828                         '\U00010427\U0001044F')
829        self.assertEqual('X\U00010427x\U0001044F'.capitalize(),
830                         'X\U0001044Fx\U0001044F')
831        self.assertEqual('h\u0130'.capitalize(), 'H\u0069\u0307')
832        exp = '\u0399\u0308\u0300\u0069\u0307'
833        self.assertEqual('\u1fd2\u0130'.capitalize(), exp)
834        self.assertEqual('finnish'.capitalize(), 'Finnish')
835        self.assertEqual('A\u0345\u03a3'.capitalize(), 'A\u0345\u03c2')
836
837    def test_title(self):
838        super().test_title()
839        self.assertEqual('\U0001044F'.title(), '\U00010427')
840        self.assertEqual('\U0001044F\U0001044F'.title(),
841                         '\U00010427\U0001044F')
842        self.assertEqual('\U0001044F\U0001044F \U0001044F\U0001044F'.title(),
843                         '\U00010427\U0001044F \U00010427\U0001044F')
844        self.assertEqual('\U00010427\U0001044F \U00010427\U0001044F'.title(),
845                         '\U00010427\U0001044F \U00010427\U0001044F')
846        self.assertEqual('\U0001044F\U00010427 \U0001044F\U00010427'.title(),
847                         '\U00010427\U0001044F \U00010427\U0001044F')
848        self.assertEqual('X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(),
849                         'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F')
850        self.assertEqual('fiNNISH'.title(), 'Finnish')
851        self.assertEqual('A\u03a3 \u1fa1xy'.title(), 'A\u03c2 \u1fa9xy')
852        self.assertEqual('A\u03a3A'.title(), 'A\u03c3a')
853
854    def test_swapcase(self):
855        string_tests.CommonTest.test_swapcase(self)
856        self.assertEqual('\U0001044F'.swapcase(), '\U00010427')
857        self.assertEqual('\U00010427'.swapcase(), '\U0001044F')
858        self.assertEqual('\U0001044F\U0001044F'.swapcase(),
859                         '\U00010427\U00010427')
860        self.assertEqual('\U00010427\U0001044F'.swapcase(),
861                         '\U0001044F\U00010427')
862        self.assertEqual('\U0001044F\U00010427'.swapcase(),
863                         '\U00010427\U0001044F')
864        self.assertEqual('X\U00010427x\U0001044F'.swapcase(),
865                         'x\U0001044FX\U00010427')
866        self.assertEqual('fi'.swapcase(), 'FI')
867        self.assertEqual('\u0130'.swapcase(), '\u0069\u0307')
868        # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
869        self.assertEqual('\u03a3'.swapcase(), '\u03c3')
870        self.assertEqual('\u0345\u03a3'.swapcase(), '\u0399\u03c3')
871        self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
872        self.assertEqual('A\u0345\u03a3a'.swapcase(), 'a\u0399\u03c3A')
873        self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
874        self.assertEqual('A\u03a3\u0345'.swapcase(), 'a\u03c2\u0399')
875        self.assertEqual('\u03a3\u0345 '.swapcase(), '\u03c3\u0399 ')
876        self.assertEqual('\u03a3'.swapcase(), '\u03c3')
877        self.assertEqual('ß'.swapcase(), 'SS')
878        self.assertEqual('\u1fd2'.swapcase(), '\u0399\u0308\u0300')
879
880    def test_center(self):
881        string_tests.CommonTest.test_center(self)
882        self.assertEqual('x'.center(2, '\U0010FFFF'),
883                         'x\U0010FFFF')
884        self.assertEqual('x'.center(3, '\U0010FFFF'),
885                         '\U0010FFFFx\U0010FFFF')
886        self.assertEqual('x'.center(4, '\U0010FFFF'),
887                         '\U0010FFFFx\U0010FFFF\U0010FFFF')
888
889    @unittest.skipUnless(sys.maxsize == 2**31 - 1, "requires 32-bit system")
890    @support.cpython_only
891    def test_case_operation_overflow(self):
892        # Issue #22643
893        size = 2**32//12 + 1
894        try:
895            s = "ü" * size
896        except MemoryError:
897            self.skipTest('no enough memory (%.0f MiB required)' % (size / 2**20))
898        try:
899            self.assertRaises(OverflowError, s.upper)
900        finally:
901            del s
902
903    def test_contains(self):
904        # Testing Unicode contains method
905        self.assertIn('a', 'abdb')
906        self.assertIn('a', 'bdab')
907        self.assertIn('a', 'bdaba')
908        self.assertIn('a', 'bdba')
909        self.assertNotIn('a', 'bdb')
910        self.assertIn('a', 'bdba')
911        self.assertIn('a', ('a',1,None))
912        self.assertIn('a', (1,None,'a'))
913        self.assertIn('a', ('a',1,None))
914        self.assertIn('a', (1,None,'a'))
915        self.assertNotIn('a', ('x',1,'y'))
916        self.assertNotIn('a', ('x',1,None))
917        self.assertNotIn('abcd', 'abcxxxx')
918        self.assertIn('ab', 'abcd')
919        self.assertIn('ab', 'abc')
920        self.assertIn('ab', (1,None,'ab'))
921        self.assertIn('', 'abc')
922        self.assertIn('', '')
923        self.assertIn('', 'abc')
924        self.assertNotIn('\0', 'abc')
925        self.assertIn('\0', '\0abc')
926        self.assertIn('\0', 'abc\0')
927        self.assertIn('a', '\0abc')
928        self.assertIn('asdf', 'asdf')
929        self.assertNotIn('asdf', 'asd')
930        self.assertNotIn('asdf', '')
931
932        self.assertRaises(TypeError, "abc".__contains__)
933        # test mixed kinds
934        for fill in ('a', '\u0100', '\U00010300'):
935            fill *= 9
936            for delim in ('c', '\u0102', '\U00010302'):
937                self.assertNotIn(delim, fill)
938                self.assertIn(delim, fill + delim)
939                self.assertNotIn(delim * 2, fill)
940                self.assertIn(delim * 2, fill + delim * 2)
941
942    def test_issue18183(self):
943        '\U00010000\U00100000'.lower()
944        '\U00010000\U00100000'.casefold()
945        '\U00010000\U00100000'.upper()
946        '\U00010000\U00100000'.capitalize()
947        '\U00010000\U00100000'.title()
948        '\U00010000\U00100000'.swapcase()
949        '\U00100000'.center(3, '\U00010000')
950        '\U00100000'.ljust(3, '\U00010000')
951        '\U00100000'.rjust(3, '\U00010000')
952
    def test_format(self):
        # Broad coverage of str.format(): literal text and brace escaping,
        # positional/keyword/attribute/index field names, string format
        # specs (width, precision, alignment, fill), !r/!s/!a conversions,
        # user-defined __format__ hooks, nested ("computed") specs, and the
        # exceptions raised for malformed templates or missing arguments.

        # Literal text and doubled-brace escapes, no arguments.
        self.assertEqual(''.format(), '')
        self.assertEqual('a'.format(), 'a')
        self.assertEqual('ab'.format(), 'ab')
        self.assertEqual('a{{'.format(), 'a{')
        self.assertEqual('a}}'.format(), 'a}')
        self.assertEqual('{{b'.format(), '{b')
        self.assertEqual('}}b'.format(), '}b')
        self.assertEqual('a{{b'.format(), 'a{b')

        # examples from the PEP:
        import datetime
        self.assertEqual("My name is {0}".format('Fred'), "My name is Fred")
        self.assertEqual("My name is {0[name]}".format(dict(name='Fred')),
                         "My name is Fred")
        self.assertEqual("My name is {0} :-{{}}".format('Fred'),
                         "My name is Fred :-{}")

        d = datetime.date(2007, 8, 18)
        self.assertEqual("The year is {0.year}".format(d),
                         "The year is 2007")

        # classes we'll use for testing
        # C: __format__ returns the format spec itself.
        class C:
            def __init__(self, x=100):
                self._x = x
            def __format__(self, spec):
                return spec

        # D: __format__ ignores the spec and returns str(self.x).
        class D:
            def __init__(self, x):
                self.x = x
            def __format__(self, spec):
                return str(self.x)

        # class with __str__, but no __format__
        class E:
            def __init__(self, x):
                self.x = x
            def __str__(self):
                return 'E(' + self.x + ')'

        # class with __repr__, but no __format__ or __str__
        class F:
            def __init__(self, x):
                self.x = x
            def __repr__(self):
                return 'F(' + self.x + ')'

        # class with __format__ that forwards to string, for some format_spec's
        class G:
            def __init__(self, x):
                self.x = x
            def __str__(self):
                return "string is " + self.x
            def __format__(self, format_spec):
                if format_spec == 'd':
                    return 'G(' + self.x + ')'
                return object.__format__(self, format_spec)

        # I: a date whose format spec is passed to strftime().
        class I(datetime.date):
            def __format__(self, format_spec):
                return self.strftime(format_spec)

        # J: an int that formats twice its own value.
        class J(int):
            def __format__(self, format_spec):
                return int.__format__(self * 2, format_spec)

        # M: __str__ is set to None (used under "Blocking fallback" below).
        class M:
            def __init__(self, x):
                self.x = x
            def __repr__(self):
                return 'M(' + self.x + ')'
            __str__ = None

        # N: __format__ is set to None (used under "Blocking fallback" below).
        class N:
            def __init__(self, x):
                self.x = x
            def __repr__(self):
                return 'N(' + self.x + ')'
            __format__ = None

        # Positional fields mixed with literal text and escaped braces.
        self.assertEqual(''.format(), '')
        self.assertEqual('abc'.format(), 'abc')
        self.assertEqual('{0}'.format('abc'), 'abc')
        self.assertEqual('{0:}'.format('abc'), 'abc')
#        self.assertEqual('{ 0 }'.format('abc'), 'abc')
        self.assertEqual('X{0}'.format('abc'), 'Xabc')
        self.assertEqual('{0}X'.format('abc'), 'abcX')
        self.assertEqual('X{0}Y'.format('abc'), 'XabcY')
        self.assertEqual('{1}'.format(1, 'abc'), 'abc')
        self.assertEqual('X{1}'.format(1, 'abc'), 'Xabc')
        self.assertEqual('{1}X'.format(1, 'abc'), 'abcX')
        self.assertEqual('X{1}Y'.format(1, 'abc'), 'XabcY')
        self.assertEqual('{0}'.format(-15), '-15')
        self.assertEqual('{0}{1}'.format(-15, 'abc'), '-15abc')
        self.assertEqual('{0}X{1}'.format(-15, 'abc'), '-15Xabc')
        self.assertEqual('{{'.format(), '{')
        self.assertEqual('}}'.format(), '}')
        self.assertEqual('{{}}'.format(), '{}')
        self.assertEqual('{{x}}'.format(), '{x}')
        self.assertEqual('{{{0}}}'.format(123), '{123}')
        self.assertEqual('{{{{0}}}}'.format(), '{{0}}')
        self.assertEqual('}}{{'.format(), '}{')
        self.assertEqual('}}x{{'.format(), '}x{')

        # weird field names
        self.assertEqual("{0[foo-bar]}".format({'foo-bar':'baz'}), 'baz')
        self.assertEqual("{0[foo bar]}".format({'foo bar':'baz'}), 'baz')
        self.assertEqual("{0[ ]}".format({' ':3}), '3')

        # Attribute and index lookups, including chained ones.
        self.assertEqual('{foo._x}'.format(foo=C(20)), '20')
        self.assertEqual('{1}{0}'.format(D(10), D(20)), '2010')
        self.assertEqual('{0._x.x}'.format(C(D('abc'))), 'abc')
        self.assertEqual('{0[0]}'.format(['abc', 'def']), 'abc')
        self.assertEqual('{0[1]}'.format(['abc', 'def']), 'def')
        self.assertEqual('{0[1][0]}'.format(['abc', ['def']]), 'def')
        self.assertEqual('{0[1][0].x}'.format(['abc', [D('def')]]), 'def')

        # strings
        self.assertEqual('{0:.3s}'.format('abc'), 'abc')
        self.assertEqual('{0:.3s}'.format('ab'), 'ab')
        self.assertEqual('{0:.3s}'.format('abcdef'), 'abc')
        self.assertEqual('{0:.0s}'.format('abcdef'), '')
        self.assertEqual('{0:3.3s}'.format('abc'), 'abc')
        self.assertEqual('{0:2.3s}'.format('abc'), 'abc')
        self.assertEqual('{0:2.2s}'.format('abc'), 'ab')
        self.assertEqual('{0:3.2s}'.format('abc'), 'ab ')
        self.assertEqual('{0:x<0s}'.format('result'), 'result')
        self.assertEqual('{0:x<5s}'.format('result'), 'result')
        self.assertEqual('{0:x<6s}'.format('result'), 'result')
        self.assertEqual('{0:x<7s}'.format('result'), 'resultx')
        self.assertEqual('{0:x<8s}'.format('result'), 'resultxx')
        self.assertEqual('{0: <7s}'.format('result'), 'result ')
        self.assertEqual('{0:<7s}'.format('result'), 'result ')
        self.assertEqual('{0:>7s}'.format('result'), ' result')
        self.assertEqual('{0:>8s}'.format('result'), '  result')
        self.assertEqual('{0:^8s}'.format('result'), ' result ')
        self.assertEqual('{0:^9s}'.format('result'), ' result  ')
        self.assertEqual('{0:^10s}'.format('result'), '  result  ')
        self.assertEqual('{0:10000}'.format('a'), 'a' + ' ' * 9999)
        self.assertEqual('{0:10000}'.format(''), ' ' * 10000)
        self.assertEqual('{0:10000000}'.format(''), ' ' * 10000000)

        # issue 12546: use \x00 as a fill character
        self.assertEqual('{0:\x00<6s}'.format('foo'), 'foo\x00\x00\x00')
        self.assertEqual('{0:\x01<6s}'.format('foo'), 'foo\x01\x01\x01')
        self.assertEqual('{0:\x00^6s}'.format('foo'), '\x00foo\x00\x00')
        self.assertEqual('{0:^6s}'.format('foo'), ' foo  ')

        self.assertEqual('{0:\x00<6}'.format(3), '3\x00\x00\x00\x00\x00')
        self.assertEqual('{0:\x01<6}'.format(3), '3\x01\x01\x01\x01\x01')
        self.assertEqual('{0:\x00^6}'.format(3), '\x00\x003\x00\x00\x00')
        self.assertEqual('{0:<6}'.format(3), '3     ')

        self.assertEqual('{0:\x00<6}'.format(3.14), '3.14\x00\x00')
        self.assertEqual('{0:\x01<6}'.format(3.14), '3.14\x01\x01')
        self.assertEqual('{0:\x00^6}'.format(3.14), '\x003.14\x00')
        self.assertEqual('{0:^6}'.format(3.14), ' 3.14 ')

        self.assertEqual('{0:\x00<12}'.format(3+2.0j), '(3+2j)\x00\x00\x00\x00\x00\x00')
        self.assertEqual('{0:\x01<12}'.format(3+2.0j), '(3+2j)\x01\x01\x01\x01\x01\x01')
        self.assertEqual('{0:\x00^12}'.format(3+2.0j), '\x00\x00\x00(3+2j)\x00\x00\x00')
        self.assertEqual('{0:^12}'.format(3+2.0j), '   (3+2j)   ')

        # format specifiers for user defined type
        self.assertEqual('{0:abc}'.format(C()), 'abc')

        # !r, !s and !a coercions
        self.assertEqual('{0!s}'.format('Hello'), 'Hello')
        self.assertEqual('{0!s:}'.format('Hello'), 'Hello')
        self.assertEqual('{0!s:15}'.format('Hello'), 'Hello          ')
        self.assertEqual('{0!s:15s}'.format('Hello'), 'Hello          ')
        self.assertEqual('{0!r}'.format('Hello'), "'Hello'")
        self.assertEqual('{0!r:}'.format('Hello'), "'Hello'")
        self.assertEqual('{0!r}'.format(F('Hello')), 'F(Hello)')
        self.assertEqual('{0!r}'.format('\u0378'), "'\\u0378'") # nonprintable
        self.assertEqual('{0!r}'.format('\u0374'), "'\u0374'")  # printable
        self.assertEqual('{0!r}'.format(F('\u0374')), 'F(\u0374)')
        self.assertEqual('{0!a}'.format('Hello'), "'Hello'")
        self.assertEqual('{0!a}'.format('\u0378'), "'\\u0378'") # nonprintable
        self.assertEqual('{0!a}'.format('\u0374'), "'\\u0374'") # printable
        self.assertEqual('{0!a:}'.format('Hello'), "'Hello'")
        self.assertEqual('{0!a}'.format(F('Hello')), 'F(Hello)')
        self.assertEqual('{0!a}'.format(F('\u0374')), 'F(\\u0374)')

        # test fallback to object.__format__
        self.assertEqual('{0}'.format({}), '{}')
        self.assertEqual('{0}'.format([]), '[]')
        self.assertEqual('{0}'.format([1]), '[1]')

        self.assertEqual('{0:d}'.format(G('data')), 'G(data)')
        self.assertEqual('{0!s}'.format(G('data')), 'string is data')

        # A non-empty format spec on objects without their own formatting
        # support is rejected with TypeError.
        self.assertRaises(TypeError, '{0:^10}'.format, E('data'))
        self.assertRaises(TypeError, '{0:^10s}'.format, E('data'))
        self.assertRaises(TypeError, '{0:>15s}'.format, G('data'))

        self.assertEqual("{0:date: %Y-%m-%d}".format(I(year=2007,
                                                       month=8,
                                                       day=27)),
                         "date: 2007-08-27")

        # test deriving from a builtin type and overriding __format__
        self.assertEqual("{0}".format(J(10)), "20")


        # string format specifiers
        self.assertEqual('{0:}'.format('a'), 'a')

        # computed format specifiers
        self.assertEqual("{0:.{1}}".format('hello world', 5), 'hello')
        self.assertEqual("{0:.{1}s}".format('hello world', 5), 'hello')
        self.assertEqual("{0:.{precision}s}".format('hello world', precision=5), 'hello')
        self.assertEqual("{0:{width}.{precision}s}".format('hello world', width=10, precision=5), 'hello     ')
        self.assertEqual("{0:{width}.{precision}s}".format('hello world', width='10', precision='5'), 'hello     ')

        # test various errors
        self.assertRaises(ValueError, '{'.format)
        self.assertRaises(ValueError, '}'.format)
        self.assertRaises(ValueError, 'a{'.format)
        self.assertRaises(ValueError, 'a}'.format)
        self.assertRaises(ValueError, '{a'.format)
        self.assertRaises(ValueError, '}a'.format)
        self.assertRaises(IndexError, '{0}'.format)
        self.assertRaises(IndexError, '{1}'.format, 'abc')
        self.assertRaises(KeyError,   '{x}'.format)
        self.assertRaises(ValueError, "}{".format)
        self.assertRaises(ValueError, "abc{0:{}".format)
        self.assertRaises(ValueError, "{0".format)
        self.assertRaises(IndexError, "{0.}".format)
        self.assertRaises(ValueError, "{0.}".format, 0)
        self.assertRaises(ValueError, "{0[}".format)
        self.assertRaises(ValueError, "{0[}".format, [])
        self.assertRaises(KeyError,   "{0]}".format)
        self.assertRaises(ValueError, "{0.[]}".format, 0)
        self.assertRaises(ValueError, "{0..foo}".format, 0)
        self.assertRaises(ValueError, "{0[0}".format, 0)
        self.assertRaises(ValueError, "{0[0:foo}".format, 0)
        self.assertRaises(KeyError,   "{c]}".format)
        self.assertRaises(ValueError, "{{ {{{0}}".format, 0)
        self.assertRaises(ValueError, "{0}}".format, 0)
        self.assertRaises(KeyError,   "{foo}".format, bar=3)
        self.assertRaises(ValueError, "{0!x}".format, 3)
        self.assertRaises(ValueError, "{0!}".format, 0)
        self.assertRaises(ValueError, "{0!rs}".format, 0)
        self.assertRaises(ValueError, "{!}".format)
        self.assertRaises(IndexError, "{:}".format)
        self.assertRaises(IndexError, "{:s}".format)
        self.assertRaises(IndexError, "{}".format)
        # A very large numeric field index or subscript raises ValueError.
        big = "23098475029384702983476098230754973209482573"
        self.assertRaises(ValueError, ("{" + big + "}").format)
        self.assertRaises(ValueError, ("{[" + big + "]}").format, [0])

        # issue 6089
        self.assertRaises(ValueError, "{0[0]x}".format, [None])
        self.assertRaises(ValueError, "{0[0](10)}".format, [None])

        # can't have a replacement on the field name portion
        self.assertRaises(TypeError, '{0[{1}]}'.format, 'abcdefg', 4)

        # exceed maximum recursion depth
        self.assertRaises(ValueError, "{0:{1:{2}}}".format, 'abc', 's', '')
        self.assertRaises(ValueError, "{0:{1:{2:{3:{4:{5:{6}}}}}}}".format,
                          0, 1, 2, 3, 4, 5, 6, 7)

        # string format spec errors
        self.assertRaises(ValueError, "{0:-s}".format, '')
        self.assertRaises(ValueError, format, "", "-")
        self.assertRaises(ValueError, "{0:=s}".format, '')

        # Alternate formatting is not supported
        self.assertRaises(ValueError, format, '', '#')
        self.assertRaises(ValueError, format, '', '#20')

        # Non-ASCII
        self.assertEqual("{0:s}{1:s}".format("ABC", "\u0410\u0411\u0412"),
                         'ABC\u0410\u0411\u0412')
        self.assertEqual("{0:.3s}".format("ABC\u0410\u0411\u0412"),
                         'ABC')
        self.assertEqual("{0:.0s}".format("ABC\u0410\u0411\u0412"),
                         '')

        # Braces, brackets and '!' inside an index are treated as key text.
        self.assertEqual("{[{}]}".format({"{}": 5}), "5")
        self.assertEqual("{[{}]}".format({"{}" : "a"}), "a")
        self.assertEqual("{[{]}".format({"{" : "a"}), "a")
        self.assertEqual("{[}]}".format({"}" : "a"}), "a")
        self.assertEqual("{[[]}".format({"[" : "a"}), "a")
        self.assertEqual("{[!]}".format({"!" : "a"}), "a")
        self.assertRaises(ValueError, "{a{}b}".format, 42)
        self.assertRaises(ValueError, "{a{b}".format, 42)
        self.assertRaises(ValueError, "{[}".format, 42)

        # Auto-numbered fields, one of them nested inside the format spec.
        self.assertEqual("0x{:0{:d}X}".format(0x0,16), "0x0000000000000000")

        # Blocking fallback
        m = M('data')
        self.assertEqual("{!r}".format(m), 'M(data)')
        self.assertRaises(TypeError, "{!s}".format, m)
        self.assertRaises(TypeError, "{}".format, m)
        n = N('data')
        self.assertEqual("{!r}".format(n), 'N(data)')
        self.assertEqual("{!s}".format(n), 'N(data)')
        self.assertRaises(TypeError, "{}".format, n)
1257
1258    def test_format_map(self):
1259        self.assertEqual(''.format_map({}), '')
1260        self.assertEqual('a'.format_map({}), 'a')
1261        self.assertEqual('ab'.format_map({}), 'ab')
1262        self.assertEqual('a{{'.format_map({}), 'a{')
1263        self.assertEqual('a}}'.format_map({}), 'a}')
1264        self.assertEqual('{{b'.format_map({}), '{b')
1265        self.assertEqual('}}b'.format_map({}), '}b')
1266        self.assertEqual('a{{b'.format_map({}), 'a{b')
1267
1268        # using mappings
1269        class Mapping(dict):
1270            def __missing__(self, key):
1271                return key
1272        self.assertEqual('{hello}'.format_map(Mapping()), 'hello')
1273        self.assertEqual('{a} {world}'.format_map(Mapping(a='hello')), 'hello world')
1274
1275        class InternalMapping:
1276            def __init__(self):
1277                self.mapping = {'a': 'hello'}
1278            def __getitem__(self, key):
1279                return self.mapping[key]
1280        self.assertEqual('{a}'.format_map(InternalMapping()), 'hello')
1281
1282
1283        class C:
1284            def __init__(self, x=100):
1285                self._x = x
1286            def __format__(self, spec):
1287                return spec
1288        self.assertEqual('{foo._x}'.format_map({'foo': C(20)}), '20')
1289
1290        # test various errors
1291        self.assertRaises(TypeError, ''.format_map)
1292        self.assertRaises(TypeError, 'a'.format_map)
1293
1294        self.assertRaises(ValueError, '{'.format_map, {})
1295        self.assertRaises(ValueError, '}'.format_map, {})
1296        self.assertRaises(ValueError, 'a{'.format_map, {})
1297        self.assertRaises(ValueError, 'a}'.format_map, {})
1298        self.assertRaises(ValueError, '{a'.format_map, {})
1299        self.assertRaises(ValueError, '}a'.format_map, {})
1300
1301        # issue #12579: can't supply positional params to format_map
1302        self.assertRaises(ValueError, '{}'.format_map, {'a' : 2})
1303        self.assertRaises(ValueError, '{}'.format_map, 'a')
1304        self.assertRaises(ValueError, '{a} {}'.format_map, {"a" : 2, "b" : 1})
1305
1306        class BadMapping:
1307            def __getitem__(self, key):
1308                return 1/0
1309        self.assertRaises(KeyError, '{a}'.format_map, {})
1310        self.assertRaises(TypeError, '{a}'.format_map, [])
1311        self.assertRaises(ZeroDivisionError, '{a}'.format_map, BadMapping())
1312
1313    def test_format_huge_precision(self):
1314        format_string = ".{}f".format(sys.maxsize + 1)
1315        with self.assertRaises(ValueError):
1316            result = format(2.34, format_string)
1317
1318    def test_format_huge_width(self):
1319        format_string = "{}f".format(sys.maxsize + 1)
1320        with self.assertRaises(ValueError):
1321            result = format(2.34, format_string)
1322
1323    def test_format_huge_item_number(self):
1324        format_string = "{{{}:.6f}}".format(sys.maxsize + 1)
1325        with self.assertRaises(ValueError):
1326            result = format_string.format(2.34)
1327
1328    def test_format_auto_numbering(self):
1329        class C:
1330            def __init__(self, x=100):
1331                self._x = x
1332            def __format__(self, spec):
1333                return spec
1334
1335        self.assertEqual('{}'.format(10), '10')
1336        self.assertEqual('{:5}'.format('s'), 's    ')
1337        self.assertEqual('{!r}'.format('s'), "'s'")
1338        self.assertEqual('{._x}'.format(C(10)), '10')
1339        self.assertEqual('{[1]}'.format([1, 2]), '2')
1340        self.assertEqual('{[a]}'.format({'a':4, 'b':2}), '4')
1341        self.assertEqual('a{}b{}c'.format(0, 1), 'a0b1c')
1342
1343        self.assertEqual('a{:{}}b'.format('x', '^10'), 'a    x     b')
1344        self.assertEqual('a{:{}x}b'.format(20, '#'), 'a0x14b')
1345
1346        # can't mix and match numbering and auto-numbering
1347        self.assertRaises(ValueError, '{}{1}'.format, 1, 2)
1348        self.assertRaises(ValueError, '{1}{}'.format, 1, 2)
1349        self.assertRaises(ValueError, '{:{1}}'.format, 1, 2)
1350        self.assertRaises(ValueError, '{0:{}}'.format, 1, 2)
1351
1352        # can mix and match auto-numbering and named
1353        self.assertEqual('{f}{}'.format(4, f='test'), 'test4')
1354        self.assertEqual('{}{f}'.format(4, f='test'), '4test')
1355        self.assertEqual('{:{f}}{g}{}'.format(1, 3, g='g', f=2), ' 1g3')
1356        self.assertEqual('{f:{}}{}{g}'.format(2, 4, f=1, g='g'), ' 14g')
1357
1358    def test_formatting(self):
1359        string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
1360        # Testing Unicode formatting strings...
1361        self.assertEqual("%s, %s" % ("abc", "abc"), 'abc, abc')
1362        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, 2, 3), 'abc, abc, 1, 2.000000,  3.00')
1363        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, -2, 3), 'abc, abc, 1, -2.000000,  3.00')
1364        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.5), 'abc, abc, -1, -2.000000,  3.50')
1365        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.57), 'abc, abc, -1, -2.000000,  3.57')
1366        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 1003.57), 'abc, abc, -1, -2.000000, 1003.57')
1367        if not sys.platform.startswith('java'):
1368            self.assertEqual("%r, %r" % (b"abc", "abc"), "b'abc', 'abc'")
1369            self.assertEqual("%r" % ("\u1234",), "'\u1234'")
1370            self.assertEqual("%a" % ("\u1234",), "'\\u1234'")
1371        self.assertEqual("%(x)s, %(y)s" % {'x':"abc", 'y':"def"}, 'abc, def')
1372        self.assertEqual("%(x)s, %(\xfc)s" % {'x':"abc", '\xfc':"def"}, 'abc, def')
1373
1374        self.assertEqual('%c' % 0x1234, '\u1234')
1375        self.assertEqual('%c' % 0x21483, '\U00021483')
1376        self.assertRaises(OverflowError, "%c".__mod__, (0x110000,))
1377        self.assertEqual('%c' % '\U00021483', '\U00021483')
1378        self.assertRaises(TypeError, "%c".__mod__, "aa")
1379        self.assertRaises(ValueError, "%.1\u1032f".__mod__, (1.0/3))
1380        self.assertRaises(TypeError, "%i".__mod__, "aa")
1381
1382        # formatting jobs delegated from the string implementation:
1383        self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1384        self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1385        self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1386        self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1387        self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123},  '...abc...')
1388        self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...')
1389        self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,"abc"), '...1...2...3...abc...')
1390        self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,"abc"), '...%...%s...1...2...3...abc...')
1391        self.assertEqual('...%s...' % "abc", '...abc...')
1392        self.assertEqual('%*s' % (5,'abc',), '  abc')
1393        self.assertEqual('%*s' % (-5,'abc',), 'abc  ')
1394        self.assertEqual('%*.*s' % (5,2,'abc',), '   ab')
1395        self.assertEqual('%*.*s' % (5,3,'abc',), '  abc')
1396        self.assertEqual('%i %*.*s' % (10, 5,3,'abc',), '10   abc')
1397        self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, 'abc',), '103   abc')
1398        self.assertEqual('%c' % 'a', 'a')
1399        class Wrapper:
1400            def __str__(self):
1401                return '\u1234'
1402        self.assertEqual('%s' % Wrapper(), '\u1234')
1403
1404        # issue 3382
1405        NAN = float('nan')
1406        INF = float('inf')
1407        self.assertEqual('%f' % NAN, 'nan')
1408        self.assertEqual('%F' % NAN, 'NAN')
1409        self.assertEqual('%f' % INF, 'inf')
1410        self.assertEqual('%F' % INF, 'INF')
1411
1412        # PEP 393
1413        self.assertEqual('%.1s' % "a\xe9\u20ac", 'a')
1414        self.assertEqual('%.2s' % "a\xe9\u20ac", 'a\xe9')
1415
1416        #issue 19995
1417        class PseudoInt:
1418            def __init__(self, value):
1419                self.value = int(value)
1420            def __int__(self):
1421                return self.value
1422            def __index__(self):
1423                return self.value
1424        class PseudoFloat:
1425            def __init__(self, value):
1426                self.value = float(value)
1427            def __int__(self):
1428                return int(self.value)
1429        pi = PseudoFloat(3.1415)
1430        letter_m = PseudoInt(109)
1431        self.assertEqual('%x' % 42, '2a')
1432        self.assertEqual('%X' % 15, 'F')
1433        self.assertEqual('%o' % 9, '11')
1434        self.assertEqual('%c' % 109, 'm')
1435        self.assertEqual('%x' % letter_m, '6d')
1436        self.assertEqual('%X' % letter_m, '6D')
1437        self.assertEqual('%o' % letter_m, '155')
1438        self.assertEqual('%c' % letter_m, 'm')
1439        self.assertRaisesRegex(TypeError, '%x format: an integer is required, not float', operator.mod, '%x', 3.14),
1440        self.assertRaisesRegex(TypeError, '%X format: an integer is required, not float', operator.mod, '%X', 2.11),
1441        self.assertRaisesRegex(TypeError, '%o format: an integer is required, not float', operator.mod, '%o', 1.79),
1442        self.assertRaisesRegex(TypeError, '%x format: an integer is required, not PseudoFloat', operator.mod, '%x', pi),
1443        self.assertRaises(TypeError, operator.mod, '%c', pi),
1444
1445    def test_formatting_with_enum(self):
1446        # issue18780
1447        import enum
1448        class Float(float, enum.Enum):
1449            PI = 3.1415926
1450        class Int(enum.IntEnum):
1451            IDES = 15
1452        class Str(str, enum.Enum):
1453            ABC = 'abc'
1454        # Testing Unicode formatting strings...
1455        self.assertEqual("%s, %s" % (Str.ABC, Str.ABC),
1456                         'Str.ABC, Str.ABC')
1457        self.assertEqual("%s, %s, %d, %i, %u, %f, %5.2f" %
1458                        (Str.ABC, Str.ABC,
1459                         Int.IDES, Int.IDES, Int.IDES,
1460                         Float.PI, Float.PI),
1461                         'Str.ABC, Str.ABC, 15, 15, 15, 3.141593,  3.14')
1462
1463        # formatting jobs delegated from the string implementation:
1464        self.assertEqual('...%(foo)s...' % {'foo':Str.ABC},
1465                         '...Str.ABC...')
1466        self.assertEqual('...%(foo)s...' % {'foo':Int.IDES},
1467                         '...Int.IDES...')
1468        self.assertEqual('...%(foo)i...' % {'foo':Int.IDES},
1469                         '...15...')
1470        self.assertEqual('...%(foo)d...' % {'foo':Int.IDES},
1471                         '...15...')
1472        self.assertEqual('...%(foo)u...' % {'foo':Int.IDES, 'def':Float.PI},
1473                         '...15...')
1474        self.assertEqual('...%(foo)f...' % {'foo':Float.PI,'def':123},
1475                         '...3.141593...')
1476
1477    def test_formatting_huge_precision(self):
1478        format_string = "%.{}f".format(sys.maxsize + 1)
1479        with self.assertRaises(ValueError):
1480            result = format_string % 2.34
1481
1482    def test_issue28598_strsubclass_rhs(self):
1483        # A subclass of str with an __rmod__ method should be able to hook
1484        # into the % operator
1485        class SubclassedStr(str):
1486            def __rmod__(self, other):
1487                return 'Success, self.__rmod__({!r}) was called'.format(other)
1488        self.assertEqual('lhs %% %r' % SubclassedStr('rhs'),
1489                         "Success, self.__rmod__('lhs %% %r') was called")
1490
1491    @support.cpython_only
1492    def test_formatting_huge_precision_c_limits(self):
1493        from _testcapi import INT_MAX
1494        format_string = "%.{}f".format(INT_MAX + 1)
1495        with self.assertRaises(ValueError):
1496            result = format_string % 2.34
1497
1498    def test_formatting_huge_width(self):
1499        format_string = "%{}f".format(sys.maxsize + 1)
1500        with self.assertRaises(ValueError):
1501            result = format_string % 2.34
1502
1503    def test_startswith_endswith_errors(self):
1504        for meth in ('foo'.startswith, 'foo'.endswith):
1505            with self.assertRaises(TypeError) as cm:
1506                meth(['f'])
1507            exc = str(cm.exception)
1508            self.assertIn('str', exc)
1509            self.assertIn('tuple', exc)
1510
1511    @support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
1512    def test_format_float(self):
1513        # should not format with a comma, but always with C locale
1514        self.assertEqual('1.0', '%.1f' % 1.0)
1515
1516    def test_constructor(self):
1517        # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
1518
1519        self.assertEqual(
1520            str('unicode remains unicode'),
1521            'unicode remains unicode'
1522        )
1523
1524        for text in ('ascii', '\xe9', '\u20ac', '\U0010FFFF'):
1525            subclass = StrSubclass(text)
1526            self.assertEqual(str(subclass), text)
1527            self.assertEqual(len(subclass), len(text))
1528            if text == 'ascii':
1529                self.assertEqual(subclass.encode('ascii'), b'ascii')
1530                self.assertEqual(subclass.encode('utf-8'), b'ascii')
1531
1532        self.assertEqual(
1533            str('strings are converted to unicode'),
1534            'strings are converted to unicode'
1535        )
1536
1537        class StringCompat:
1538            def __init__(self, x):
1539                self.x = x
1540            def __str__(self):
1541                return self.x
1542
1543        self.assertEqual(
1544            str(StringCompat('__str__ compatible objects are recognized')),
1545            '__str__ compatible objects are recognized'
1546        )
1547
1548        # unicode(obj) is compatible to str():
1549
1550        o = StringCompat('unicode(obj) is compatible to str()')
1551        self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
1552        self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
1553
1554        for obj in (123, 123.45, 123):
1555            self.assertEqual(str(obj), str(str(obj)))
1556
1557        # unicode(obj, encoding, error) tests (this maps to
1558        # PyUnicode_FromEncodedObject() at C level)
1559
1560        if not sys.platform.startswith('java'):
1561            self.assertRaises(
1562                TypeError,
1563                str,
1564                'decoding unicode is not supported',
1565                'utf-8',
1566                'strict'
1567            )
1568
1569        self.assertEqual(
1570            str(b'strings are decoded to unicode', 'utf-8', 'strict'),
1571            'strings are decoded to unicode'
1572        )
1573
1574        if not sys.platform.startswith('java'):
1575            self.assertEqual(
1576                str(
1577                    memoryview(b'character buffers are decoded to unicode'),
1578                    'utf-8',
1579                    'strict'
1580                ),
1581                'character buffers are decoded to unicode'
1582            )
1583
1584        self.assertRaises(TypeError, str, 42, 42, 42)
1585
1586    def test_constructor_keyword_args(self):
1587        """Pass various keyword argument combinations to the constructor."""
1588        # The object argument can be passed as a keyword.
1589        self.assertEqual(str(object='foo'), 'foo')
1590        self.assertEqual(str(object=b'foo', encoding='utf-8'), 'foo')
1591        # The errors argument without encoding triggers "decode" mode.
1592        self.assertEqual(str(b'foo', errors='strict'), 'foo')  # not "b'foo'"
1593        self.assertEqual(str(object=b'foo', errors='strict'), 'foo')
1594
1595    def test_constructor_defaults(self):
1596        """Check the constructor argument defaults."""
1597        # The object argument defaults to '' or b''.
1598        self.assertEqual(str(), '')
1599        self.assertEqual(str(errors='strict'), '')
1600        utf8_cent = '¢'.encode('utf-8')
1601        # The encoding argument defaults to utf-8.
1602        self.assertEqual(str(utf8_cent, errors='strict'), '¢')
1603        # The errors argument defaults to strict.
1604        self.assertRaises(UnicodeDecodeError, str, utf8_cent, encoding='ascii')
1605
1606    def test_codecs_utf7(self):
1607        utfTests = [
1608            ('A\u2262\u0391.', b'A+ImIDkQ.'),             # RFC2152 example
1609            ('Hi Mom -\u263a-!', b'Hi Mom -+Jjo--!'),     # RFC2152 example
1610            ('\u65E5\u672C\u8A9E', b'+ZeVnLIqe-'),        # RFC2152 example
1611            ('Item 3 is \u00a31.', b'Item 3 is +AKM-1.'), # RFC2152 example
1612            ('+', b'+-'),
1613            ('+-', b'+--'),
1614            ('+?', b'+-?'),
1615            (r'\?', b'+AFw?'),
1616            ('+?', b'+-?'),
1617            (r'\\?', b'+AFwAXA?'),
1618            (r'\\\?', b'+AFwAXABc?'),
1619            (r'++--', b'+-+---'),
1620            ('\U000abcde', b'+2m/c3g-'),                  # surrogate pairs
1621            ('/', b'/'),
1622        ]
1623
1624        for (x, y) in utfTests:
1625            self.assertEqual(x.encode('utf-7'), y)
1626
1627        # Unpaired surrogates are passed through
1628        self.assertEqual('\uD801'.encode('utf-7'), b'+2AE-')
1629        self.assertEqual('\uD801x'.encode('utf-7'), b'+2AE-x')
1630        self.assertEqual('\uDC01'.encode('utf-7'), b'+3AE-')
1631        self.assertEqual('\uDC01x'.encode('utf-7'), b'+3AE-x')
1632        self.assertEqual(b'+2AE-'.decode('utf-7'), '\uD801')
1633        self.assertEqual(b'+2AE-x'.decode('utf-7'), '\uD801x')
1634        self.assertEqual(b'+3AE-'.decode('utf-7'), '\uDC01')
1635        self.assertEqual(b'+3AE-x'.decode('utf-7'), '\uDC01x')
1636
1637        self.assertEqual('\uD801\U000abcde'.encode('utf-7'), b'+2AHab9ze-')
1638        self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde')
1639
1640        # Issue #2242: crash on some Windows/MSVC versions
1641        self.assertEqual(b'+\xc1'.decode('utf-7', 'ignore'), '')
1642
1643        # Direct encoded characters
1644        set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
1645        # Optional direct characters
1646        set_o = '!"#$%&*;<=>@[]^_`{|}'
1647        for c in set_d:
1648            self.assertEqual(c.encode('utf7'), c.encode('ascii'))
1649            self.assertEqual(c.encode('ascii').decode('utf7'), c)
1650        for c in set_o:
1651            self.assertEqual(c.encode('ascii').decode('utf7'), c)
1652
1653        with self.assertRaisesRegex(UnicodeDecodeError,
1654                                    'ill-formed sequence'):
1655            b'+@'.decode('utf-7')
1656
1657    def test_codecs_utf8(self):
1658        self.assertEqual(''.encode('utf-8'), b'')
1659        self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac')
1660        self.assertEqual('\U00010002'.encode('utf-8'), b'\xf0\x90\x80\x82')
1661        self.assertEqual('\U00023456'.encode('utf-8'), b'\xf0\xa3\x91\x96')
1662        self.assertEqual('\ud800'.encode('utf-8', 'surrogatepass'), b'\xed\xa0\x80')
1663        self.assertEqual('\udc00'.encode('utf-8', 'surrogatepass'), b'\xed\xb0\x80')
1664        self.assertEqual(('\U00010002'*10).encode('utf-8'),
1665                         b'\xf0\x90\x80\x82'*10)
1666        self.assertEqual(
1667            '\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
1668            '\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
1669            '\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
1670            '\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
1671            '\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
1672            ' Nunstuck git und'.encode('utf-8'),
1673            b'\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
1674            b'\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
1675            b'\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
1676            b'\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
1677            b'\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
1678            b'\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
1679            b'\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
1680            b'\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
1681            b'\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
1682            b'\xe3\x80\x8cWenn ist das Nunstuck git und'
1683        )
1684
1685        # UTF-8 specific decoding tests
1686        self.assertEqual(str(b'\xf0\xa3\x91\x96', 'utf-8'), '\U00023456' )
1687        self.assertEqual(str(b'\xf0\x90\x80\x82', 'utf-8'), '\U00010002' )
1688        self.assertEqual(str(b'\xe2\x82\xac', 'utf-8'), '\u20ac' )
1689
1690        # Other possible utf-8 test cases:
1691        # * strict decoding testing for all of the
1692        #   UTF8_ERROR cases in PyUnicode_DecodeUTF8
1693
1694    def test_utf8_decode_valid_sequences(self):
1695        sequences = [
1696            # single byte
1697            (b'\x00', '\x00'), (b'a', 'a'), (b'\x7f', '\x7f'),
1698            # 2 bytes
1699            (b'\xc2\x80', '\x80'), (b'\xdf\xbf', '\u07ff'),
1700            # 3 bytes
1701            (b'\xe0\xa0\x80', '\u0800'), (b'\xed\x9f\xbf', '\ud7ff'),
1702            (b'\xee\x80\x80', '\uE000'), (b'\xef\xbf\xbf', '\uffff'),
1703            # 4 bytes
1704            (b'\xF0\x90\x80\x80', '\U00010000'),
1705            (b'\xf4\x8f\xbf\xbf', '\U0010FFFF')
1706        ]
1707        for seq, res in sequences:
1708            self.assertEqual(seq.decode('utf-8'), res)
1709
1710
1711    def test_utf8_decode_invalid_sequences(self):
1712        # continuation bytes in a sequence of 2, 3, or 4 bytes
1713        continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
1714        # start bytes of a 2-byte sequence equivalent to code points < 0x7F
1715        invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)]
1716        # start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF
1717        invalid_4B_seq_start_bytes = [bytes([x]) for x in range(0xF5, 0xF8)]
1718        invalid_start_bytes = (
1719            continuation_bytes + invalid_2B_seq_start_bytes +
1720            invalid_4B_seq_start_bytes + [bytes([x]) for x in range(0xF7, 0x100)]
1721        )
1722
1723        for byte in invalid_start_bytes:
1724            self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8')
1725
1726        for sb in invalid_2B_seq_start_bytes:
1727            for cb in continuation_bytes:
1728                self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8')
1729
1730        for sb in invalid_4B_seq_start_bytes:
1731            for cb1 in continuation_bytes[:3]:
1732                for cb3 in continuation_bytes[:3]:
1733                    self.assertRaises(UnicodeDecodeError,
1734                                      (sb+cb1+b'\x80'+cb3).decode, 'utf-8')
1735
1736        for cb in [bytes([x]) for x in range(0x80, 0xA0)]:
1737            self.assertRaises(UnicodeDecodeError,
1738                              (b'\xE0'+cb+b'\x80').decode, 'utf-8')
1739            self.assertRaises(UnicodeDecodeError,
1740                              (b'\xE0'+cb+b'\xBF').decode, 'utf-8')
1741        # surrogates
1742        for cb in [bytes([x]) for x in range(0xA0, 0xC0)]:
1743            self.assertRaises(UnicodeDecodeError,
1744                              (b'\xED'+cb+b'\x80').decode, 'utf-8')
1745            self.assertRaises(UnicodeDecodeError,
1746                              (b'\xED'+cb+b'\xBF').decode, 'utf-8')
1747        for cb in [bytes([x]) for x in range(0x80, 0x90)]:
1748            self.assertRaises(UnicodeDecodeError,
1749                              (b'\xF0'+cb+b'\x80\x80').decode, 'utf-8')
1750            self.assertRaises(UnicodeDecodeError,
1751                              (b'\xF0'+cb+b'\xBF\xBF').decode, 'utf-8')
1752        for cb in [bytes([x]) for x in range(0x90, 0xC0)]:
1753            self.assertRaises(UnicodeDecodeError,
1754                              (b'\xF4'+cb+b'\x80\x80').decode, 'utf-8')
1755            self.assertRaises(UnicodeDecodeError,
1756                              (b'\xF4'+cb+b'\xBF\xBF').decode, 'utf-8')
1757
1758    def test_issue8271(self):
1759        # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
1760        # only the start byte and the continuation byte(s) are now considered
1761        # invalid, instead of the number of bytes specified by the start byte.
1762        # See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
1763        # table 3-8, Row 2) for more information about the algorithm used.
1764        FFFD = '\ufffd'
1765        sequences = [
1766            # invalid start bytes
1767            (b'\x80', FFFD), # continuation byte
1768            (b'\x80\x80', FFFD*2), # 2 continuation bytes
1769            (b'\xc0', FFFD),
1770            (b'\xc0\xc0', FFFD*2),
1771            (b'\xc1', FFFD),
1772            (b'\xc1\xc0', FFFD*2),
1773            (b'\xc0\xc1', FFFD*2),
1774            # with start byte of a 2-byte sequence
1775            (b'\xc2', FFFD), # only the start byte
1776            (b'\xc2\xc2', FFFD*2), # 2 start bytes
1777            (b'\xc2\xc2\xc2', FFFD*3), # 3 start bytes
1778            (b'\xc2\x41', FFFD+'A'), # invalid continuation byte
1779            # with start byte of a 3-byte sequence
1780            (b'\xe1', FFFD), # only the start byte
1781            (b'\xe1\xe1', FFFD*2), # 2 start bytes
1782            (b'\xe1\xe1\xe1', FFFD*3), # 3 start bytes
1783            (b'\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
1784            (b'\xe1\x80', FFFD), # only 1 continuation byte
1785            (b'\xe1\x41', FFFD+'A'), # invalid continuation byte
1786            (b'\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
1787            (b'\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
1788            (b'\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
1789            (b'\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
1790            (b'\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
1791            # with start byte of a 4-byte sequence
1792            (b'\xf1', FFFD), # only the start byte
1793            (b'\xf1\xf1', FFFD*2), # 2 start bytes
1794            (b'\xf1\xf1\xf1', FFFD*3), # 3 start bytes
1795            (b'\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
1796            (b'\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
1797            (b'\xf1\x80', FFFD), # only 1 continuation bytes
1798            (b'\xf1\x80\x80', FFFD), # only 2 continuation bytes
1799            (b'\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
1800            (b'\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 1 invalid
1801            (b'\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
1802            (b'\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cv and 1 valid
1803            (b'\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 invalid
1804            (b'\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 invalid
1805            (b'\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 1 valid cb and 1 invalid
1806            (b'\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
1807            (b'\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
1808            (b'\xf1\xf1\x80\x41', FFFD*2+'A'),
1809            (b'\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
1810            # with invalid start byte of a 4-byte sequence (rfc2279)
1811            (b'\xf5', FFFD), # only the start byte
1812            (b'\xf5\xf5', FFFD*2), # 2 start bytes
1813            (b'\xf5\x80', FFFD*2), # only 1 continuation byte
1814            (b'\xf5\x80\x80', FFFD*3), # only 2 continuation byte
1815            (b'\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
1816            (b'\xf5\x80\x41', FFFD*2+'A'), #  1 valid cb and 1 invalid
1817            (b'\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
1818            (b'\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
1819            # with invalid start byte of a 5-byte sequence (rfc2279)
1820            (b'\xf8', FFFD), # only the start byte
1821            (b'\xf8\xf8', FFFD*2), # 2 start bytes
1822            (b'\xf8\x80', FFFD*2), # only one continuation byte
1823            (b'\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
1824            (b'\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
1825            # with invalid start byte of a 6-byte sequence (rfc2279)
1826            (b'\xfc', FFFD), # only the start byte
1827            (b'\xfc\xfc', FFFD*2), # 2 start bytes
1828            (b'\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
1829            (b'\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes
1830            # invalid start byte
1831            (b'\xfe', FFFD),
1832            (b'\xfe\x80\x80', FFFD*3),
1833            # other sequences
1834            (b'\xf1\x80\x41\x42\x43', '\ufffd\x41\x42\x43'),
1835            (b'\xf1\x80\xff\x42\x43', '\ufffd\ufffd\x42\x43'),
1836            (b'\xf1\x80\xc2\x81\x43', '\ufffd\x81\x43'),
1837            (b'\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
1838             '\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
1839        ]
1840        for n, (seq, res) in enumerate(sequences):
1841            self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
1842            self.assertEqual(seq.decode('utf-8', 'replace'), res)
1843            self.assertEqual((seq+b'b').decode('utf-8', 'replace'), res+'b')
1844            self.assertEqual(seq.decode('utf-8', 'ignore'),
1845                             res.replace('\uFFFD', ''))
1846
1847    def assertCorrectUTF8Decoding(self, seq, res, err):
1848        """
1849        Check that an invalid UTF-8 sequence raises a UnicodeDecodeError when
1850        'strict' is used, returns res when 'replace' is used, and that doesn't
1851        return anything when 'ignore' is used.
1852        """
1853        with self.assertRaises(UnicodeDecodeError) as cm:
1854            seq.decode('utf-8')
1855        exc = cm.exception
1856
1857        self.assertIn(err, str(exc))
1858        self.assertEqual(seq.decode('utf-8', 'replace'), res)
1859        self.assertEqual((b'aaaa' + seq + b'bbbb').decode('utf-8', 'replace'),
1860                         'aaaa' + res + 'bbbb')
1861        res = res.replace('\ufffd', '')
1862        self.assertEqual(seq.decode('utf-8', 'ignore'), res)
1863        self.assertEqual((b'aaaa' + seq + b'bbbb').decode('utf-8', 'ignore'),
1864                          'aaaa' + res + 'bbbb')
1865
1866    def test_invalid_start_byte(self):
1867        """
1868        Test that an 'invalid start byte' error is raised when the first byte
1869        is not in the ASCII range or is not a valid start byte of a 2-, 3-, or
1870        4-bytes sequence. The invalid start byte is replaced with a single
1871        U+FFFD when errors='replace'.
1872        E.g. <80> is a continuation byte and can appear only after a start byte.
1873        """
1874        FFFD = '\ufffd'
1875        for byte in b'\x80\xA0\x9F\xBF\xC0\xC1\xF5\xFF':
1876            self.assertCorrectUTF8Decoding(bytes([byte]), '\ufffd',
1877                                           'invalid start byte')
1878
1879    def test_unexpected_end_of_data(self):
1880        """
1881        Test that an 'unexpected end of data' error is raised when the string
1882        ends after a start byte of a 2-, 3-, or 4-bytes sequence without having
1883        enough continuation bytes.  The incomplete sequence is replaced with a
1884        single U+FFFD when errors='replace'.
1885        E.g. in the sequence <F3 80 80>, F3 is the start byte of a 4-bytes
1886        sequence, but it's followed by only 2 valid continuation bytes and the
1887        last continuation bytes is missing.
1888        Note: the continuation bytes must be all valid, if one of them is
1889        invalid another error will be raised.
1890        """
1891        sequences = [
1892            'C2', 'DF',
1893            'E0 A0', 'E0 BF', 'E1 80', 'E1 BF', 'EC 80', 'EC BF',
1894            'ED 80', 'ED 9F', 'EE 80', 'EE BF', 'EF 80', 'EF BF',
1895            'F0 90', 'F0 BF', 'F0 90 80', 'F0 90 BF', 'F0 BF 80', 'F0 BF BF',
1896            'F1 80', 'F1 BF', 'F1 80 80', 'F1 80 BF', 'F1 BF 80', 'F1 BF BF',
1897            'F3 80', 'F3 BF', 'F3 80 80', 'F3 80 BF', 'F3 BF 80', 'F3 BF BF',
1898            'F4 80', 'F4 8F', 'F4 80 80', 'F4 80 BF', 'F4 8F 80', 'F4 8F BF'
1899        ]
1900        FFFD = '\ufffd'
1901        for seq in sequences:
1902            self.assertCorrectUTF8Decoding(bytes.fromhex(seq), '\ufffd',
1903                                           'unexpected end of data')
1904
1905    def test_invalid_cb_for_2bytes_seq(self):
1906        """
1907        Test that an 'invalid continuation byte' error is raised when the
1908        continuation byte of a 2-bytes sequence is invalid.  The start byte
1909        is replaced by a single U+FFFD and the second byte is handled
1910        separately when errors='replace'.
1911        E.g. in the sequence <C2 41>, C2 is the start byte of a 2-bytes
1912        sequence, but 41 is not a valid continuation byte because it's the
1913        ASCII letter 'A'.
1914        """
1915        FFFD = '\ufffd'
1916        FFFDx2 = FFFD * 2
1917        sequences = [
1918            ('C2 00', FFFD+'\x00'), ('C2 7F', FFFD+'\x7f'),
1919            ('C2 C0', FFFDx2), ('C2 FF', FFFDx2),
1920            ('DF 00', FFFD+'\x00'), ('DF 7F', FFFD+'\x7f'),
1921            ('DF C0', FFFDx2), ('DF FF', FFFDx2),
1922        ]
1923        for seq, res in sequences:
1924            self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res,
1925                                           'invalid continuation byte')
1926
    def test_invalid_cb_for_3bytes_seq(self):
        """
        Test that an 'invalid continuation byte' error is raised when the
        continuation byte(s) of a 3-bytes sequence are invalid.  When
        errors='replace', if the first continuation byte is valid, the first
        two bytes (start byte + 1st cb) are replaced by a single U+FFFD and the
        third byte is handled separately, otherwise only the start byte is
        replaced with a U+FFFD and the other continuation bytes are handled
        separately.
        E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
        sequence, 80 is a valid continuation byte, but 41 is not a valid cb
        because it's the ASCII letter 'A'.
        Note: when the start byte is E0 or ED, the valid ranges for the first
        continuation byte are limited to A0..BF and 80..9F respectively.
        Python 2 used to consider all the bytes in range 80..BF valid when the
        start byte was ED.  This is fixed in Python 3.
        """
        FFFD = '\ufffd'
        FFFDx2 = FFFD * 2
        # Each entry is (space-separated hex bytes, expected text when decoding
        # with errors='replace').
        sequences = [
            ('E0 00', FFFD+'\x00'), ('E0 7F', FFFD+'\x7f'), ('E0 80', FFFDx2),
            ('E0 9F', FFFDx2), ('E0 C0', FFFDx2), ('E0 FF', FFFDx2),
            ('E0 A0 00', FFFD+'\x00'), ('E0 A0 7F', FFFD+'\x7f'),
            ('E0 A0 C0', FFFDx2), ('E0 A0 FF', FFFDx2),
            ('E0 BF 00', FFFD+'\x00'), ('E0 BF 7F', FFFD+'\x7f'),
            ('E0 BF C0', FFFDx2), ('E0 BF FF', FFFDx2), ('E1 00', FFFD+'\x00'),
            ('E1 7F', FFFD+'\x7f'), ('E1 C0', FFFDx2), ('E1 FF', FFFDx2),
            ('E1 80 00', FFFD+'\x00'), ('E1 80 7F', FFFD+'\x7f'),
            ('E1 80 C0', FFFDx2), ('E1 80 FF', FFFDx2),
            ('E1 BF 00', FFFD+'\x00'), ('E1 BF 7F', FFFD+'\x7f'),
            ('E1 BF C0', FFFDx2), ('E1 BF FF', FFFDx2), ('EC 00', FFFD+'\x00'),
            ('EC 7F', FFFD+'\x7f'), ('EC C0', FFFDx2), ('EC FF', FFFDx2),
            ('EC 80 00', FFFD+'\x00'), ('EC 80 7F', FFFD+'\x7f'),
            ('EC 80 C0', FFFDx2), ('EC 80 FF', FFFDx2),
            ('EC BF 00', FFFD+'\x00'), ('EC BF 7F', FFFD+'\x7f'),
            ('EC BF C0', FFFDx2), ('EC BF FF', FFFDx2), ('ED 00', FFFD+'\x00'),
            ('ED 7F', FFFD+'\x7f'),
            ('ED A0', FFFDx2), ('ED BF', FFFDx2), # see the ED note in the docstring
            ('ED C0', FFFDx2), ('ED FF', FFFDx2), ('ED 80 00', FFFD+'\x00'),
            ('ED 80 7F', FFFD+'\x7f'), ('ED 80 C0', FFFDx2),
            ('ED 80 FF', FFFDx2), ('ED 9F 00', FFFD+'\x00'),
            ('ED 9F 7F', FFFD+'\x7f'), ('ED 9F C0', FFFDx2),
            ('ED 9F FF', FFFDx2), ('EE 00', FFFD+'\x00'),
            ('EE 7F', FFFD+'\x7f'), ('EE C0', FFFDx2), ('EE FF', FFFDx2),
            ('EE 80 00', FFFD+'\x00'), ('EE 80 7F', FFFD+'\x7f'),
            ('EE 80 C0', FFFDx2), ('EE 80 FF', FFFDx2),
            ('EE BF 00', FFFD+'\x00'), ('EE BF 7F', FFFD+'\x7f'),
            ('EE BF C0', FFFDx2), ('EE BF FF', FFFDx2), ('EF 00', FFFD+'\x00'),
            ('EF 7F', FFFD+'\x7f'), ('EF C0', FFFDx2), ('EF FF', FFFDx2),
            ('EF 80 00', FFFD+'\x00'), ('EF 80 7F', FFFD+'\x7f'),
            ('EF 80 C0', FFFDx2), ('EF 80 FF', FFFDx2),
            ('EF BF 00', FFFD+'\x00'), ('EF BF 7F', FFFD+'\x7f'),
            ('EF BF C0', FFFDx2), ('EF BF FF', FFFDx2),
        ]
        # Every sequence must be reported as an invalid continuation byte.
        for seq, res in sequences:
            self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res,
                                           'invalid continuation byte')
1984
    def test_invalid_cb_for_4bytes_seq(self):
        """
        Test that an 'invalid continuation byte' error is raised when the
        continuation byte(s) of a 4-bytes sequence are invalid.  When
        errors='replace', the start byte and all the following valid
        continuation bytes are replaced with a single U+FFFD, and all the bytes
        starting from the first invalid continuation byte (included) are
        handled separately.
        E.g. in the sequence <F1 80 41>, F1 is the start byte of a 4-bytes
        sequence, 80 is a valid continuation byte, but 41 is not a valid cb
        because it's the ASCII letter 'A'.
        Note: when the start byte is F0 or F4, the valid ranges for the first
        continuation byte are limited to 90..BF and 80..8F respectively (see
        the <F0 80>, <F0 8F>, <F4 90> and <F4 BF> entries below).
        """
        FFFD = '\ufffd'
        FFFDx2 = FFFD * 2
        # Each entry is (space-separated hex bytes, expected text when decoding
        # with errors='replace').
        sequences = [
            ('F0 00', FFFD+'\x00'), ('F0 7F', FFFD+'\x7f'), ('F0 80', FFFDx2),
            ('F0 8F', FFFDx2), ('F0 C0', FFFDx2), ('F0 FF', FFFDx2),
            ('F0 90 00', FFFD+'\x00'), ('F0 90 7F', FFFD+'\x7f'),
            ('F0 90 C0', FFFDx2), ('F0 90 FF', FFFDx2),
            ('F0 BF 00', FFFD+'\x00'), ('F0 BF 7F', FFFD+'\x7f'),
            ('F0 BF C0', FFFDx2), ('F0 BF FF', FFFDx2),
            ('F0 90 80 00', FFFD+'\x00'), ('F0 90 80 7F', FFFD+'\x7f'),
            ('F0 90 80 C0', FFFDx2), ('F0 90 80 FF', FFFDx2),
            ('F0 90 BF 00', FFFD+'\x00'), ('F0 90 BF 7F', FFFD+'\x7f'),
            ('F0 90 BF C0', FFFDx2), ('F0 90 BF FF', FFFDx2),
            ('F0 BF 80 00', FFFD+'\x00'), ('F0 BF 80 7F', FFFD+'\x7f'),
            ('F0 BF 80 C0', FFFDx2), ('F0 BF 80 FF', FFFDx2),
            ('F0 BF BF 00', FFFD+'\x00'), ('F0 BF BF 7F', FFFD+'\x7f'),
            ('F0 BF BF C0', FFFDx2), ('F0 BF BF FF', FFFDx2),
            ('F1 00', FFFD+'\x00'), ('F1 7F', FFFD+'\x7f'), ('F1 C0', FFFDx2),
            ('F1 FF', FFFDx2), ('F1 80 00', FFFD+'\x00'),
            ('F1 80 7F', FFFD+'\x7f'), ('F1 80 C0', FFFDx2),
            ('F1 80 FF', FFFDx2), ('F1 BF 00', FFFD+'\x00'),
            ('F1 BF 7F', FFFD+'\x7f'), ('F1 BF C0', FFFDx2),
            ('F1 BF FF', FFFDx2), ('F1 80 80 00', FFFD+'\x00'),
            ('F1 80 80 7F', FFFD+'\x7f'), ('F1 80 80 C0', FFFDx2),
            ('F1 80 80 FF', FFFDx2), ('F1 80 BF 00', FFFD+'\x00'),
            ('F1 80 BF 7F', FFFD+'\x7f'), ('F1 80 BF C0', FFFDx2),
            ('F1 80 BF FF', FFFDx2), ('F1 BF 80 00', FFFD+'\x00'),
            ('F1 BF 80 7F', FFFD+'\x7f'), ('F1 BF 80 C0', FFFDx2),
            ('F1 BF 80 FF', FFFDx2), ('F1 BF BF 00', FFFD+'\x00'),
            ('F1 BF BF 7F', FFFD+'\x7f'), ('F1 BF BF C0', FFFDx2),
            ('F1 BF BF FF', FFFDx2), ('F3 00', FFFD+'\x00'),
            ('F3 7F', FFFD+'\x7f'), ('F3 C0', FFFDx2), ('F3 FF', FFFDx2),
            ('F3 80 00', FFFD+'\x00'), ('F3 80 7F', FFFD+'\x7f'),
            ('F3 80 C0', FFFDx2), ('F3 80 FF', FFFDx2),
            ('F3 BF 00', FFFD+'\x00'), ('F3 BF 7F', FFFD+'\x7f'),
            ('F3 BF C0', FFFDx2), ('F3 BF FF', FFFDx2),
            ('F3 80 80 00', FFFD+'\x00'), ('F3 80 80 7F', FFFD+'\x7f'),
            ('F3 80 80 C0', FFFDx2), ('F3 80 80 FF', FFFDx2),
            ('F3 80 BF 00', FFFD+'\x00'), ('F3 80 BF 7F', FFFD+'\x7f'),
            ('F3 80 BF C0', FFFDx2), ('F3 80 BF FF', FFFDx2),
            ('F3 BF 80 00', FFFD+'\x00'), ('F3 BF 80 7F', FFFD+'\x7f'),
            ('F3 BF 80 C0', FFFDx2), ('F3 BF 80 FF', FFFDx2),
            ('F3 BF BF 00', FFFD+'\x00'), ('F3 BF BF 7F', FFFD+'\x7f'),
            ('F3 BF BF C0', FFFDx2), ('F3 BF BF FF', FFFDx2),
            ('F4 00', FFFD+'\x00'), ('F4 7F', FFFD+'\x7f'), ('F4 90', FFFDx2),
            ('F4 BF', FFFDx2), ('F4 C0', FFFDx2), ('F4 FF', FFFDx2),
            ('F4 80 00', FFFD+'\x00'), ('F4 80 7F', FFFD+'\x7f'),
            ('F4 80 C0', FFFDx2), ('F4 80 FF', FFFDx2),
            ('F4 8F 00', FFFD+'\x00'), ('F4 8F 7F', FFFD+'\x7f'),
            ('F4 8F C0', FFFDx2), ('F4 8F FF', FFFDx2),
            ('F4 80 80 00', FFFD+'\x00'), ('F4 80 80 7F', FFFD+'\x7f'),
            ('F4 80 80 C0', FFFDx2), ('F4 80 80 FF', FFFDx2),
            ('F4 80 BF 00', FFFD+'\x00'), ('F4 80 BF 7F', FFFD+'\x7f'),
            ('F4 80 BF C0', FFFDx2), ('F4 80 BF FF', FFFDx2),
            ('F4 8F 80 00', FFFD+'\x00'), ('F4 8F 80 7F', FFFD+'\x7f'),
            ('F4 8F 80 C0', FFFDx2), ('F4 8F 80 FF', FFFDx2),
            ('F4 8F BF 00', FFFD+'\x00'), ('F4 8F BF 7F', FFFD+'\x7f'),
            ('F4 8F BF C0', FFFDx2), ('F4 8F BF FF', FFFDx2)
        ]
        # Every sequence must be reported as an invalid continuation byte.
        for seq, res in sequences:
            self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res,
                                           'invalid continuation byte')
2063
2064    def test_codecs_idna(self):
2065        # Test whether trailing dot is preserved
2066        self.assertEqual("www.python.org.".encode("idna"), b"www.python.org.")
2067
2068    def test_codecs_errors(self):
2069        # Error handling (encoding)
2070        self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii')
2071        self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii','strict')
2072        self.assertEqual('Andr\202 x'.encode('ascii','ignore'), b"Andr x")
2073        self.assertEqual('Andr\202 x'.encode('ascii','replace'), b"Andr? x")
2074        self.assertEqual('Andr\202 x'.encode('ascii', 'replace'),
2075                         'Andr\202 x'.encode('ascii', errors='replace'))
2076        self.assertEqual('Andr\202 x'.encode('ascii', 'ignore'),
2077                         'Andr\202 x'.encode(encoding='ascii', errors='ignore'))
2078
2079        # Error handling (decoding)
2080        self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii')
2081        self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii', 'strict')
2082        self.assertEqual(str(b'Andr\202 x', 'ascii', 'ignore'), "Andr x")
2083        self.assertEqual(str(b'Andr\202 x', 'ascii', 'replace'), 'Andr\uFFFD x')
2084        self.assertEqual(str(b'\202 x', 'ascii', 'replace'), '\uFFFD x')
2085
2086        # Error handling (unknown character names)
2087        self.assertEqual(b"\\N{foo}xx".decode("unicode-escape", "ignore"), "xx")
2088
2089        # Error handling (truncated escape sequence)
2090        self.assertRaises(UnicodeError, b"\\".decode, "unicode-escape")
2091
2092        self.assertRaises(TypeError, b"hello".decode, "test.unicode1")
2093        self.assertRaises(TypeError, str, b"hello", "test.unicode2")
2094        self.assertRaises(TypeError, "hello".encode, "test.unicode1")
2095        self.assertRaises(TypeError, "hello".encode, "test.unicode2")
2096
2097        # Error handling (wrong arguments)
2098        self.assertRaises(TypeError, "hello".encode, 42, 42, 42)
2099
2100        # Error handling (lone surrogate in
2101        # _PyUnicode_TransformDecimalAndSpaceToASCII())
2102        self.assertRaises(ValueError, int, "\ud800")
2103        self.assertRaises(ValueError, int, "\udf00")
2104        self.assertRaises(ValueError, float, "\ud800")
2105        self.assertRaises(ValueError, float, "\udf00")
2106        self.assertRaises(ValueError, complex, "\ud800")
2107        self.assertRaises(ValueError, complex, "\udf00")
2108
2109    def test_codecs(self):
2110        # Encoding
2111        self.assertEqual('hello'.encode('ascii'), b'hello')
2112        self.assertEqual('hello'.encode('utf-7'), b'hello')
2113        self.assertEqual('hello'.encode('utf-8'), b'hello')
2114        self.assertEqual('hello'.encode('utf-8'), b'hello')
2115        self.assertEqual('hello'.encode('utf-16-le'), b'h\000e\000l\000l\000o\000')
2116        self.assertEqual('hello'.encode('utf-16-be'), b'\000h\000e\000l\000l\000o')
2117        self.assertEqual('hello'.encode('latin-1'), b'hello')
2118
2119        # Default encoding is utf-8
2120        self.assertEqual('\u2603'.encode(), b'\xe2\x98\x83')
2121
2122        # Roundtrip safety for BMP (just the first 1024 chars)
2123        for c in range(1024):
2124            u = chr(c)
2125            for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
2126                             'utf-16-be', 'raw_unicode_escape',
2127                             'unicode_escape'):
2128                self.assertEqual(str(u.encode(encoding),encoding), u)
2129
2130        # Roundtrip safety for BMP (just the first 256 chars)
2131        for c in range(256):
2132            u = chr(c)
2133            for encoding in ('latin-1',):
2134                self.assertEqual(str(u.encode(encoding),encoding), u)
2135
2136        # Roundtrip safety for BMP (just the first 128 chars)
2137        for c in range(128):
2138            u = chr(c)
2139            for encoding in ('ascii',):
2140                self.assertEqual(str(u.encode(encoding),encoding), u)
2141
2142        # Roundtrip safety for non-BMP (just a few chars)
2143        with warnings.catch_warnings():
2144            u = '\U00010001\U00020002\U00030003\U00040004\U00050005'
2145            for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
2146                             'raw_unicode_escape', 'unicode_escape'):
2147                self.assertEqual(str(u.encode(encoding),encoding), u)
2148
2149        # UTF-8 must be roundtrip safe for all code points
2150        # (except surrogates, which are forbidden).
2151        u = ''.join(map(chr, list(range(0, 0xd800)) +
2152                             list(range(0xe000, 0x110000))))
2153        for encoding in ('utf-8',):
2154            self.assertEqual(str(u.encode(encoding),encoding), u)
2155
2156    def test_codecs_charmap(self):
2157        # 0-127
2158        s = bytes(range(128))
2159        for encoding in (
2160            'cp037', 'cp1026', 'cp273',
2161            'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
2162            'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
2163            'cp863', 'cp865', 'cp866', 'cp1125',
2164            'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
2165            'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
2166            'iso8859_7', 'iso8859_9',
2167            'koi8_r', 'koi8_t', 'koi8_u', 'kz1048', 'latin_1',
2168            'mac_cyrillic', 'mac_latin2',
2169
2170            'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
2171            'cp1256', 'cp1257', 'cp1258',
2172            'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
2173
2174            'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
2175            'cp1006', 'iso8859_8',
2176
2177            ### These have undefined mappings:
2178            #'cp424',
2179
2180            ### These fail the round-trip:
2181            #'cp875'
2182
2183            ):
2184            self.assertEqual(str(s, encoding).encode(encoding), s)
2185
2186        # 128-255
2187        s = bytes(range(128, 256))
2188        for encoding in (
2189            'cp037', 'cp1026', 'cp273',
2190            'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
2191            'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
2192            'cp863', 'cp865', 'cp866', 'cp1125',
2193            'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
2194            'iso8859_2', 'iso8859_4', 'iso8859_5',
2195            'iso8859_9', 'koi8_r', 'koi8_u', 'latin_1',
2196            'mac_cyrillic', 'mac_latin2',
2197
2198            ### These have undefined mappings:
2199            #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
2200            #'cp1256', 'cp1257', 'cp1258',
2201            #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
2202            #'iso8859_3', 'iso8859_6', 'iso8859_7', 'koi8_t', 'kz1048',
2203            #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
2204
2205            ### These fail the round-trip:
2206            #'cp1006', 'cp875', 'iso8859_8',
2207
2208            ):
2209            self.assertEqual(str(s, encoding).encode(encoding), s)
2210
2211    def test_concatenation(self):
2212        self.assertEqual(("abc" "def"), "abcdef")
2213        self.assertEqual(("abc" "def"), "abcdef")
2214        self.assertEqual(("abc" "def"), "abcdef")
2215        self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
2216        self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
2217
2218    def test_printing(self):
2219        class BitBucket:
2220            def write(self, text):
2221                pass
2222
2223        out = BitBucket()
2224        print('abc', file=out)
2225        print('abc', 'def', file=out)
2226        print('abc', 'def', file=out)
2227        print('abc', 'def', file=out)
2228        print('abc\n', file=out)
2229        print('abc\n', end=' ', file=out)
2230        print('abc\n', end=' ', file=out)
2231        print('def\n', file=out)
2232        print('def\n', file=out)
2233
2234    def test_ucs4(self):
2235        x = '\U00100000'
2236        y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
2237        self.assertEqual(x, y)
2238
2239        y = br'\U00100000'
2240        x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2241        self.assertEqual(x, y)
2242        y = br'\U00010000'
2243        x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2244        self.assertEqual(x, y)
2245
2246        try:
2247            br'\U11111111'.decode("raw-unicode-escape")
2248        except UnicodeDecodeError as e:
2249            self.assertEqual(e.start, 0)
2250            self.assertEqual(e.end, 10)
2251        else:
2252            self.fail("Should have raised UnicodeDecodeError")
2253
2254    def test_conversion(self):
2255        # Make sure __str__() works properly
2256        class ObjectToStr:
2257            def __str__(self):
2258                return "foo"
2259
2260        class StrSubclassToStr(str):
2261            def __str__(self):
2262                return "foo"
2263
2264        class StrSubclassToStrSubclass(str):
2265            def __new__(cls, content=""):
2266                return str.__new__(cls, 2*content)
2267            def __str__(self):
2268                return self
2269
2270        self.assertEqual(str(ObjectToStr()), "foo")
2271        self.assertEqual(str(StrSubclassToStr("bar")), "foo")
2272        s = str(StrSubclassToStrSubclass("foo"))
2273        self.assertEqual(s, "foofoo")
2274        self.assertIs(type(s), StrSubclassToStrSubclass)
2275        s = StrSubclass(StrSubclassToStrSubclass("foo"))
2276        self.assertEqual(s, "foofoo")
2277        self.assertIs(type(s), StrSubclass)
2278
2279    def test_unicode_repr(self):
2280        class s1:
2281            def __repr__(self):
2282                return '\\n'
2283
2284        class s2:
2285            def __repr__(self):
2286                return '\\n'
2287
2288        self.assertEqual(repr(s1()), '\\n')
2289        self.assertEqual(repr(s2()), '\\n')
2290
2291    def test_printable_repr(self):
2292        self.assertEqual(repr('\U00010000'), "'%c'" % (0x10000,)) # printable
2293        self.assertEqual(repr('\U00014000'), "'\\U00014000'")     # nonprintable
2294
2295    # This test only affects 32-bit platforms because expandtabs can only take
2296    # an int as the max value, not a 64-bit C long.  If expandtabs is changed
2297    # to take a 64-bit long, this test should apply to all platforms.
2298    @unittest.skipIf(sys.maxsize > (1 << 32) or struct.calcsize('P') != 4,
2299                     'only applies to 32-bit platforms')
2300    def test_expandtabs_overflows_gracefully(self):
2301        self.assertRaises(OverflowError, 't\tt\t'.expandtabs, sys.maxsize)
2302
2303    @support.cpython_only
2304    def test_expandtabs_optimization(self):
2305        s = 'abc'
2306        self.assertIs(s.expandtabs(), s)
2307
2308    def test_raiseMemError(self):
2309        if struct.calcsize('P') == 8:
2310            # 64 bits pointers
2311            ascii_struct_size = 48
2312            compact_struct_size = 72
2313        else:
2314            # 32 bits pointers
2315            ascii_struct_size = 24
2316            compact_struct_size = 36
2317
2318        for char in ('a', '\xe9', '\u20ac', '\U0010ffff'):
2319            code = ord(char)
2320            if code < 0x100:
2321                char_size = 1  # sizeof(Py_UCS1)
2322                struct_size = ascii_struct_size
2323            elif code < 0x10000:
2324                char_size = 2  # sizeof(Py_UCS2)
2325                struct_size = compact_struct_size
2326            else:
2327                char_size = 4  # sizeof(Py_UCS4)
2328                struct_size = compact_struct_size
2329            # Note: sys.maxsize is half of the actual max allocation because of
2330            # the signedness of Py_ssize_t. Strings of maxlen-1 should in principle
2331            # be allocatable, given enough memory.
2332            maxlen = ((sys.maxsize - struct_size) // char_size)
2333            alloc = lambda: char * maxlen
2334            self.assertRaises(MemoryError, alloc)
2335            self.assertRaises(MemoryError, alloc)
2336
2337    def test_format_subclass(self):
2338        class S(str):
2339            def __str__(self):
2340                return '__str__ overridden'
2341        s = S('xxx')
2342        self.assertEqual("%s" % s, '__str__ overridden')
2343        self.assertEqual("{}".format(s), '__str__ overridden')
2344
2345    def test_subclass_add(self):
2346        class S(str):
2347            def __add__(self, o):
2348                return "3"
2349        self.assertEqual(S("4") + S("5"), "3")
2350        class S(str):
2351            def __iadd__(self, o):
2352                return "3"
2353        s = S("1")
2354        s += "4"
2355        self.assertEqual(s, "3")
2356
2357    def test_getnewargs(self):
2358        text = 'abc'
2359        args = text.__getnewargs__()
2360        self.assertIsNot(args[0], text)
2361        self.assertEqual(args[0], text)
2362        self.assertEqual(len(args), 1)
2363
    @support.cpython_only
    def test_resize(self):
        """Check _testcapi.getargs_u() before and after growing a string.

        For several lengths, the string is passed to getargs_u(), extended
        with += and passed to getargs_u() again; each result must equal the
        string as it was at the time of the call.
        """
        from _testcapi import getargs_u
        for length in range(1, 100, 7):
            # generate a fresh string (refcount=1)
            text = 'a' * length + 'b'

            # fill wstr internal field
            abc = getargs_u(text)
            self.assertEqual(abc, text)

            # resize text: wstr field must be cleared and then recomputed
            text += 'c'
            abcdef = getargs_u(text)
            # the second result must reflect the appended character
            self.assertNotEqual(abc, abcdef)
            self.assertEqual(abcdef, text)
2380
2381    def test_compare(self):
2382        # Issue #17615
2383        N = 10
2384        ascii = 'a' * N
2385        ascii2 = 'z' * N
2386        latin = '\x80' * N
2387        latin2 = '\xff' * N
2388        bmp = '\u0100' * N
2389        bmp2 = '\uffff' * N
2390        astral = '\U00100000' * N
2391        astral2 = '\U0010ffff' * N
2392        strings = (
2393            ascii, ascii2,
2394            latin, latin2,
2395            bmp, bmp2,
2396            astral, astral2)
2397        for text1, text2 in itertools.combinations(strings, 2):
2398            equal = (text1 is text2)
2399            self.assertEqual(text1 == text2, equal)
2400            self.assertEqual(text1 != text2, not equal)
2401
2402            if equal:
2403                self.assertTrue(text1 <= text2)
2404                self.assertTrue(text1 >= text2)
2405
2406                # text1 is text2: duplicate strings to skip the "str1 == str2"
2407                # optimization in unicode_compare_eq() and really compare
2408                # character per character
2409                copy1 = duplicate_string(text1)
2410                copy2 = duplicate_string(text2)
2411                self.assertIsNot(copy1, copy2)
2412
2413                self.assertTrue(copy1 == copy2)
2414                self.assertFalse(copy1 != copy2)
2415
2416                self.assertTrue(copy1 <= copy2)
2417                self.assertTrue(copy2 >= copy2)
2418
2419        self.assertTrue(ascii < ascii2)
2420        self.assertTrue(ascii < latin)
2421        self.assertTrue(ascii < bmp)
2422        self.assertTrue(ascii < astral)
2423        self.assertFalse(ascii >= ascii2)
2424        self.assertFalse(ascii >= latin)
2425        self.assertFalse(ascii >= bmp)
2426        self.assertFalse(ascii >= astral)
2427
2428        self.assertFalse(latin < ascii)
2429        self.assertTrue(latin < latin2)
2430        self.assertTrue(latin < bmp)
2431        self.assertTrue(latin < astral)
2432        self.assertTrue(latin >= ascii)
2433        self.assertFalse(latin >= latin2)
2434        self.assertFalse(latin >= bmp)
2435        self.assertFalse(latin >= astral)
2436
2437        self.assertFalse(bmp < ascii)
2438        self.assertFalse(bmp < latin)
2439        self.assertTrue(bmp < bmp2)
2440        self.assertTrue(bmp < astral)
2441        self.assertTrue(bmp >= ascii)
2442        self.assertTrue(bmp >= latin)
2443        self.assertFalse(bmp >= bmp2)
2444        self.assertFalse(bmp >= astral)
2445
2446        self.assertFalse(astral < ascii)
2447        self.assertFalse(astral < latin)
2448        self.assertFalse(astral < bmp2)
2449        self.assertTrue(astral < astral2)
2450        self.assertTrue(astral >= ascii)
2451        self.assertTrue(astral >= latin)
2452        self.assertTrue(astral >= bmp2)
2453        self.assertFalse(astral >= astral2)
2454
2455    def test_free_after_iterating(self):
2456        support.check_free_after_iterating(self, iter, str)
2457        support.check_free_after_iterating(self, reversed, str)
2458
2459    def test_check_encoding_errors(self):
2460        # bpo-37388: str(bytes) and str.decode() must check encoding and errors
2461        # arguments in dev mode
2462        encodings = ('ascii', 'utf8', 'latin1')
2463        invalid = 'Boom, Shaka Laka, Boom!'
2464        code = textwrap.dedent(f'''
2465            import sys
2466            encodings = {encodings!r}
2467
2468            for data in (b'', b'short string'):
2469                try:
2470                    str(data, encoding={invalid!r})
2471                except LookupError:
2472                    pass
2473                else:
2474                    sys.exit(21)
2475
2476                try:
2477                    str(data, errors={invalid!r})
2478                except LookupError:
2479                    pass
2480                else:
2481                    sys.exit(22)
2482
2483                for encoding in encodings:
2484                    try:
2485                        str(data, encoding, errors={invalid!r})
2486                    except LookupError:
2487                        pass
2488                    else:
2489                        sys.exit(22)
2490
2491            for data in ('', 'short string'):
2492                try:
2493                    data.encode(encoding={invalid!r})
2494                except LookupError:
2495                    pass
2496                else:
2497                    sys.exit(23)
2498
2499                try:
2500                    data.encode(errors={invalid!r})
2501                except LookupError:
2502                    pass
2503                else:
2504                    sys.exit(24)
2505
2506                for encoding in encodings:
2507                    try:
2508                        data.encode(encoding, errors={invalid!r})
2509                    except LookupError:
2510                        pass
2511                    else:
2512                        sys.exit(24)
2513
2514            sys.exit(10)
2515        ''')
2516        proc = assert_python_failure('-X', 'dev', '-c', code)
2517        self.assertEqual(proc.rc, 10, proc)
2518
2519
2520class CAPITest(unittest.TestCase):
2521
2522    # Test PyUnicode_FromFormat()
2523    def test_from_format(self):
2524        support.import_module('ctypes')
2525        from ctypes import (
2526            c_char_p,
2527            pythonapi, py_object, sizeof,
2528            c_int, c_long, c_longlong, c_ssize_t,
2529            c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p)
2530        name = "PyUnicode_FromFormat"
2531        _PyUnicode_FromFormat = getattr(pythonapi, name)
2532        _PyUnicode_FromFormat.argtypes = (c_char_p,)
2533        _PyUnicode_FromFormat.restype = py_object
2534
2535        def PyUnicode_FromFormat(format, *args):
2536            cargs = tuple(
2537                py_object(arg) if isinstance(arg, str) else arg
2538                for arg in args)
2539            return _PyUnicode_FromFormat(format, *cargs)
2540
2541        def check_format(expected, format, *args):
2542            text = PyUnicode_FromFormat(format, *args)
2543            self.assertEqual(expected, text)
2544
2545        # ascii format, non-ascii argument
2546        check_format('ascii\x7f=unicode\xe9',
2547                     b'ascii\x7f=%U', 'unicode\xe9')
2548
2549        # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
2550        # raises an error
2551        self.assertRaisesRegex(ValueError,
2552            r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
2553            'string, got a non-ASCII byte: 0xe9$',
2554            PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')
2555
2556        # test "%c"
2557        check_format('\uabcd',
2558                     b'%c', c_int(0xabcd))
2559        check_format('\U0010ffff',
2560                     b'%c', c_int(0x10ffff))
2561        with self.assertRaises(OverflowError):
2562            PyUnicode_FromFormat(b'%c', c_int(0x110000))
2563        # Issue #18183
2564        check_format('\U00010000\U00100000',
2565                     b'%c%c', c_int(0x10000), c_int(0x100000))
2566
2567        # test "%"
2568        check_format('%',
2569                     b'%')
2570        check_format('%',
2571                     b'%%')
2572        check_format('%s',
2573                     b'%%s')
2574        check_format('[%]',
2575                     b'[%%]')
2576        check_format('%abc',
2577                     b'%%%s', b'abc')
2578
2579        # truncated string
2580        check_format('abc',
2581                     b'%.3s', b'abcdef')
2582        check_format('abc[\ufffd',
2583                     b'%.5s', 'abc[\u20ac]'.encode('utf8'))
2584        check_format("'\\u20acABC'",
2585                     b'%A', '\u20acABC')
2586        check_format("'\\u20",
2587                     b'%.5A', '\u20acABCDEF')
2588        check_format("'\u20acABC'",
2589                     b'%R', '\u20acABC')
2590        check_format("'\u20acA",
2591                     b'%.3R', '\u20acABCDEF')
2592        check_format('\u20acAB',
2593                     b'%.3S', '\u20acABCDEF')
2594        check_format('\u20acAB',
2595                     b'%.3U', '\u20acABCDEF')
2596        check_format('\u20acAB',
2597                     b'%.3V', '\u20acABCDEF', None)
2598        check_format('abc[\ufffd',
2599                     b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))
2600
2601        # following tests comes from #7330
2602        # test width modifier and precision modifier with %S
2603        check_format("repr=  abc",
2604                     b'repr=%5S', 'abc')
2605        check_format("repr=ab",
2606                     b'repr=%.2S', 'abc')
2607        check_format("repr=   ab",
2608                     b'repr=%5.2S', 'abc')
2609
2610        # test width modifier and precision modifier with %R
2611        check_format("repr=   'abc'",
2612                     b'repr=%8R', 'abc')
2613        check_format("repr='ab",
2614                     b'repr=%.3R', 'abc')
2615        check_format("repr=  'ab",
2616                     b'repr=%5.3R', 'abc')
2617
2618        # test width modifier and precision modifier with %A
2619        check_format("repr=   'abc'",
2620                     b'repr=%8A', 'abc')
2621        check_format("repr='ab",
2622                     b'repr=%.3A', 'abc')
2623        check_format("repr=  'ab",
2624                     b'repr=%5.3A', 'abc')
2625
2626        # test width modifier and precision modifier with %s
2627        check_format("repr=  abc",
2628                     b'repr=%5s', b'abc')
2629        check_format("repr=ab",
2630                     b'repr=%.2s', b'abc')
2631        check_format("repr=   ab",
2632                     b'repr=%5.2s', b'abc')
2633
2634        # test width modifier and precision modifier with %U
2635        check_format("repr=  abc",
2636                     b'repr=%5U', 'abc')
2637        check_format("repr=ab",
2638                     b'repr=%.2U', 'abc')
2639        check_format("repr=   ab",
2640                     b'repr=%5.2U', 'abc')
2641
2642        # test width modifier and precision modifier with %V
2643        check_format("repr=  abc",
2644                     b'repr=%5V', 'abc', b'123')
2645        check_format("repr=ab",
2646                     b'repr=%.2V', 'abc', b'123')
2647        check_format("repr=   ab",
2648                     b'repr=%5.2V', 'abc', b'123')
2649        check_format("repr=  123",
2650                     b'repr=%5V', None, b'123')
2651        check_format("repr=12",
2652                     b'repr=%.2V', None, b'123')
2653        check_format("repr=   12",
2654                     b'repr=%5.2V', None, b'123')
2655
2656        # test integer formats (%i, %d, %u)
2657        check_format('010',
2658                     b'%03i', c_int(10))
2659        check_format('0010',
2660                     b'%0.4i', c_int(10))
2661        check_format('-123',
2662                     b'%i', c_int(-123))
2663        check_format('-123',
2664                     b'%li', c_long(-123))
2665        check_format('-123',
2666                     b'%lli', c_longlong(-123))
2667        check_format('-123',
2668                     b'%zi', c_ssize_t(-123))
2669
2670        check_format('-123',
2671                     b'%d', c_int(-123))
2672        check_format('-123',
2673                     b'%ld', c_long(-123))
2674        check_format('-123',
2675                     b'%lld', c_longlong(-123))
2676        check_format('-123',
2677                     b'%zd', c_ssize_t(-123))
2678
2679        check_format('123',
2680                     b'%u', c_uint(123))
2681        check_format('123',
2682                     b'%lu', c_ulong(123))
2683        check_format('123',
2684                     b'%llu', c_ulonglong(123))
2685        check_format('123',
2686                     b'%zu', c_size_t(123))
2687
2688        # test long output
2689        min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1))
2690        max_longlong = -min_longlong - 1
2691        check_format(str(min_longlong),
2692                     b'%lld', c_longlong(min_longlong))
2693        check_format(str(max_longlong),
2694                     b'%lld', c_longlong(max_longlong))
2695        max_ulonglong = 2 ** (8 * sizeof(c_ulonglong)) - 1
2696        check_format(str(max_ulonglong),
2697                     b'%llu', c_ulonglong(max_ulonglong))
2698        PyUnicode_FromFormat(b'%p', c_void_p(-1))
2699
2700        # test padding (width and/or precision)
2701        check_format('123'.rjust(10, '0'),
2702                     b'%010i', c_int(123))
2703        check_format('123'.rjust(100),
2704                     b'%100i', c_int(123))
2705        check_format('123'.rjust(100, '0'),
2706                     b'%.100i', c_int(123))
2707        check_format('123'.rjust(80, '0').rjust(100),
2708                     b'%100.80i', c_int(123))
2709
2710        check_format('123'.rjust(10, '0'),
2711                     b'%010u', c_uint(123))
2712        check_format('123'.rjust(100),
2713                     b'%100u', c_uint(123))
2714        check_format('123'.rjust(100, '0'),
2715                     b'%.100u', c_uint(123))
2716        check_format('123'.rjust(80, '0').rjust(100),
2717                     b'%100.80u', c_uint(123))
2718
2719        check_format('123'.rjust(10, '0'),
2720                     b'%010x', c_int(0x123))
2721        check_format('123'.rjust(100),
2722                     b'%100x', c_int(0x123))
2723        check_format('123'.rjust(100, '0'),
2724                     b'%.100x', c_int(0x123))
2725        check_format('123'.rjust(80, '0').rjust(100),
2726                     b'%100.80x', c_int(0x123))
2727
2728        # test %A
2729        check_format(r"%A:'abc\xe9\uabcd\U0010ffff'",
2730                     b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')
2731
2732        # test %V
2733        check_format('repr=abc',
2734                     b'repr=%V', 'abc', b'xyz')
2735
2736        # Test string decode from parameter of %s using utf-8.
2737        # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of
2738        # '\u4eba\u6c11'
2739        check_format('repr=\u4eba\u6c11',
2740                     b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')
2741
2742        #Test replace error handler.
2743        check_format('repr=abc\ufffd',
2744                     b'repr=%V', None, b'abc\xff')
2745
2746        # not supported: copy the raw format string. these tests are just here
2747        # to check for crashes and should not be considered as specifications
2748        check_format('%s',
2749                     b'%1%s', b'abc')
2750        check_format('%1abc',
2751                     b'%1abc')
2752        check_format('%+i',
2753                     b'%+i', c_int(10))
2754        check_format('%.%s',
2755                     b'%.%s', b'abc')
2756
2757        # Issue #33817: empty strings
2758        check_format('',
2759                     b'')
2760        check_format('',
2761                     b'%s', b'')
2762
2763    # Test PyUnicode_AsWideChar()
2764    @support.cpython_only
2765    def test_aswidechar(self):
2766        from _testcapi import unicode_aswidechar
2767        support.import_module('ctypes')
2768        from ctypes import c_wchar, sizeof
2769
2770        wchar, size = unicode_aswidechar('abcdef', 2)
2771        self.assertEqual(size, 2)
2772        self.assertEqual(wchar, 'ab')
2773
2774        wchar, size = unicode_aswidechar('abc', 3)
2775        self.assertEqual(size, 3)
2776        self.assertEqual(wchar, 'abc')
2777
2778        wchar, size = unicode_aswidechar('abc', 4)
2779        self.assertEqual(size, 3)
2780        self.assertEqual(wchar, 'abc\0')
2781
2782        wchar, size = unicode_aswidechar('abc', 10)
2783        self.assertEqual(size, 3)
2784        self.assertEqual(wchar, 'abc\0')
2785
2786        wchar, size = unicode_aswidechar('abc\0def', 20)
2787        self.assertEqual(size, 7)
2788        self.assertEqual(wchar, 'abc\0def\0')
2789
2790        nonbmp = chr(0x10ffff)
2791        if sizeof(c_wchar) == 2:
2792            buflen = 3
2793            nchar = 2
2794        else: # sizeof(c_wchar) == 4
2795            buflen = 2
2796            nchar = 1
2797        wchar, size = unicode_aswidechar(nonbmp, buflen)
2798        self.assertEqual(size, nchar)
2799        self.assertEqual(wchar, nonbmp + '\0')
2800
2801    # Test PyUnicode_AsWideCharString()
2802    @support.cpython_only
2803    def test_aswidecharstring(self):
2804        from _testcapi import unicode_aswidecharstring
2805        support.import_module('ctypes')
2806        from ctypes import c_wchar, sizeof
2807
2808        wchar, size = unicode_aswidecharstring('abc')
2809        self.assertEqual(size, 3)
2810        self.assertEqual(wchar, 'abc\0')
2811
2812        wchar, size = unicode_aswidecharstring('abc\0def')
2813        self.assertEqual(size, 7)
2814        self.assertEqual(wchar, 'abc\0def\0')
2815
2816        nonbmp = chr(0x10ffff)
2817        if sizeof(c_wchar) == 2:
2818            nchar = 2
2819        else: # sizeof(c_wchar) == 4
2820            nchar = 1
2821        wchar, size = unicode_aswidecharstring(nonbmp)
2822        self.assertEqual(size, nchar)
2823        self.assertEqual(wchar, nonbmp + '\0')
2824
2825    # Test PyUnicode_AsUCS4()
2826    @support.cpython_only
2827    def test_asucs4(self):
2828        from _testcapi import unicode_asucs4
2829        for s in ['abc', '\xa1\xa2', '\u4f60\u597d', 'a\U0001f600',
2830                  'a\ud800b\udfffc', '\ud834\udd1e']:
2831            l = len(s)
2832            self.assertEqual(unicode_asucs4(s, l, True), s+'\0')
2833            self.assertEqual(unicode_asucs4(s, l, False), s+'\uffff')
2834            self.assertEqual(unicode_asucs4(s, l+1, True), s+'\0\uffff')
2835            self.assertEqual(unicode_asucs4(s, l+1, False), s+'\0\uffff')
2836            self.assertRaises(SystemError, unicode_asucs4, s, l-1, True)
2837            self.assertRaises(SystemError, unicode_asucs4, s, l-2, False)
2838            s = '\0'.join([s, s])
2839            self.assertEqual(unicode_asucs4(s, len(s), True), s+'\0')
2840            self.assertEqual(unicode_asucs4(s, len(s), False), s+'\uffff')
2841
2842    # Test PyUnicode_AsUTF8()
2843    @support.cpython_only
2844    def test_asutf8(self):
2845        from _testcapi import unicode_asutf8
2846
2847        bmp = '\u0100'
2848        bmp2 = '\uffff'
2849        nonbmp = chr(0x10ffff)
2850
2851        self.assertEqual(unicode_asutf8(bmp), b'\xc4\x80')
2852        self.assertEqual(unicode_asutf8(bmp2), b'\xef\xbf\xbf')
2853        self.assertEqual(unicode_asutf8(nonbmp), b'\xf4\x8f\xbf\xbf')
2854        self.assertRaises(UnicodeEncodeError, unicode_asutf8, 'a\ud800b\udfffc')
2855
2856    # Test PyUnicode_AsUTF8AndSize()
2857    @support.cpython_only
2858    def test_asutf8andsize(self):
2859        from _testcapi import unicode_asutf8andsize
2860
2861        bmp = '\u0100'
2862        bmp2 = '\uffff'
2863        nonbmp = chr(0x10ffff)
2864
2865        self.assertEqual(unicode_asutf8andsize(bmp), (b'\xc4\x80', 2))
2866        self.assertEqual(unicode_asutf8andsize(bmp2), (b'\xef\xbf\xbf', 3))
2867        self.assertEqual(unicode_asutf8andsize(nonbmp), (b'\xf4\x8f\xbf\xbf', 4))
2868        self.assertRaises(UnicodeEncodeError, unicode_asutf8andsize, 'a\ud800b\udfffc')
2869
2870    # Test PyUnicode_FindChar()
2871    @support.cpython_only
2872    def test_findchar(self):
2873        from _testcapi import unicode_findchar
2874
2875        for str in "\xa1", "\u8000\u8080", "\ud800\udc02", "\U0001f100\U0001f1f1":
2876            for i, ch in enumerate(str):
2877                self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), 1), i)
2878                self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), -1), i)
2879
2880        str = "!>_<!"
2881        self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), 1), -1)
2882        self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), -1), -1)
2883        # start < end
2884        self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, 1), 4)
2885        self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, -1), 4)
2886        # start >= end
2887        self.assertEqual(unicode_findchar(str, ord('!'), 0, 0, 1), -1)
2888        self.assertEqual(unicode_findchar(str, ord('!'), len(str), 0, 1), -1)
2889        # negative
2890        self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, 1), 0)
2891        self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, -1), 0)
2892
2893    # Test PyUnicode_CopyCharacters()
2894    @support.cpython_only
2895    def test_copycharacters(self):
2896        from _testcapi import unicode_copycharacters
2897
2898        strings = [
2899            'abcde', '\xa1\xa2\xa3\xa4\xa5',
2900            '\u4f60\u597d\u4e16\u754c\uff01',
2901            '\U0001f600\U0001f601\U0001f602\U0001f603\U0001f604'
2902        ]
2903
2904        for idx, from_ in enumerate(strings):
2905            # wide -> narrow: exceed maxchar limitation
2906            for to in strings[:idx]:
2907                self.assertRaises(
2908                    SystemError,
2909                    unicode_copycharacters, to, 0, from_, 0, 5
2910                )
2911            # same kind
2912            for from_start in range(5):
2913                self.assertEqual(
2914                    unicode_copycharacters(from_, 0, from_, from_start, 5),
2915                    (from_[from_start:from_start+5].ljust(5, '\0'),
2916                     5-from_start)
2917                )
2918            for to_start in range(5):
2919                self.assertEqual(
2920                    unicode_copycharacters(from_, to_start, from_, to_start, 5),
2921                    (from_[to_start:to_start+5].rjust(5, '\0'),
2922                     5-to_start)
2923                )
2924            # narrow -> wide
2925            # Tests omitted since this creates invalid strings.
2926
2927        s = strings[0]
2928        self.assertRaises(IndexError, unicode_copycharacters, s, 6, s, 0, 5)
2929        self.assertRaises(IndexError, unicode_copycharacters, s, -1, s, 0, 5)
2930        self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, 6, 5)
2931        self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, -1, 5)
2932        self.assertRaises(SystemError, unicode_copycharacters, s, 1, s, 0, 5)
2933        self.assertRaises(SystemError, unicode_copycharacters, s, 0, s, 0, -1)
2934        self.assertRaises(SystemError, unicode_copycharacters, s, 0, b'', 0, 0)
2935
2936    @support.cpython_only
2937    def test_encode_decimal(self):
2938        from _testcapi import unicode_encodedecimal
2939        self.assertEqual(unicode_encodedecimal('123'),
2940                         b'123')
2941        self.assertEqual(unicode_encodedecimal('\u0663.\u0661\u0664'),
2942                         b'3.14')
2943        self.assertEqual(unicode_encodedecimal("\N{EM SPACE}3.14\N{EN SPACE}"),
2944                         b' 3.14 ')
2945        self.assertRaises(UnicodeEncodeError,
2946                          unicode_encodedecimal, "123\u20ac", "strict")
2947        self.assertRaisesRegex(
2948            ValueError,
2949            "^'decimal' codec can't encode character",
2950            unicode_encodedecimal, "123\u20ac", "replace")
2951
2952    @support.cpython_only
2953    def test_transform_decimal(self):
2954        from _testcapi import unicode_transformdecimaltoascii as transform_decimal
2955        self.assertEqual(transform_decimal('123'),
2956                         '123')
2957        self.assertEqual(transform_decimal('\u0663.\u0661\u0664'),
2958                         '3.14')
2959        self.assertEqual(transform_decimal("\N{EM SPACE}3.14\N{EN SPACE}"),
2960                         "\N{EM SPACE}3.14\N{EN SPACE}")
2961        self.assertEqual(transform_decimal('123\u20ac'),
2962                         '123\u20ac')
2963
    @support.cpython_only
    def test_pep393_utf8_caching_bug(self):
        # Issue #25709: Problem with string concatenation and utf-8 cache
        #
        # Regression test: after every in-place growth of the string, the
        # bytes returned for it must match the string's current content.
        # The statement order below is significant; do not restructure.
        from _testcapi import getargs_s_hash
        # One code point for each UTF-8 encoded length (1, 2, 3 and 4 bytes).
        for k in 0x24, 0xa4, 0x20ac, 0x1f40d:
            s = ''
            for i in range(5):
                # Due to CPython specific optimization the 's' string can be
                # resized in-place.
                s += chr(k)
                # Parsing with the "s#" format code calls indirectly
                # PyUnicode_AsUTF8AndSize() which creates the UTF-8
                # encoded string cached in the Unicode object.
                # After i + 1 concatenations the expected bytes are the
                # encoded character repeated i + 1 times.
                self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
                # Check that the second call returns the same result
                self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
2980
class StringModuleTest(unittest.TestCase):
    """Tests for the private ``_string`` helper functions."""

    def test_formatter_parser(self):
        # _string.formatter_parser() yields one 4-tuple per parsed chunk;
        # string.Formatter.parse() documents the fields as
        # (literal_text, field_name, format_spec, conversion).
        expectations = (
            ("prefix {2!s}xxx{0:^+10.3f}{obj.attr!s} {z[0]!s:10}", [
                ('prefix ', '2', '', 's'),
                ('xxx', '0', '^+10.3f', None),
                ('', 'obj.attr', '', 's'),
                (' ', 'z[0]', '10', 's'),
            ]),
            # auto-numbered field followed by trailing literal text
            ("prefix {} suffix", [
                ('prefix ', '', '', None),
                (' suffix', None, None, None),
            ]),
            # literal text only
            ("str", [
                ('str', None, None, None),
            ]),
            # empty template: nothing is yielded
            ("", []),
            # a single field with no literal text
            ("{0}", [
                ('', '0', '', None),
            ]),
        )
        for template, expected in expectations:
            chunks = list(_string.formatter_parser(template))
            self.assertEqual(chunks, expected)

        # A non-string argument is rejected.
        with self.assertRaises(TypeError):
            _string.formatter_parser(1)

    def test_formatter_field_name_split(self):
        # _string.formatter_field_name_split() returns the leading name and
        # an iterable of (is_attribute, key) pairs; the iterable is turned
        # into a list so it can be compared directly.
        def split(name):
            first, rest = _string.formatter_field_name_split(name)
            return [first, list(rest)]

        expectations = (
            ("obj", ["obj", []]),
            ("obj.arg", ["obj", [(True, 'arg')]]),
            ("obj[key]", ["obj", [(False, 'key')]]),
            ("obj.arg[key1][key2]", [
                "obj",
                [(True, 'arg'),
                 (False, 'key1'),
                 (False, 'key2'),
                ]]),
        )
        for field_name, expected in expectations:
            self.assertEqual(split(field_name), expected)

        # A non-string argument is rejected.
        with self.assertRaises(TypeError):
            _string.formatter_field_name_split(1)
3030
3031
# Run the tests through unittest when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()
3034