• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1""" Test script for the Unicode implementation.
2
3Written by Marc-Andre Lemburg (mal@lemburg.com).
4
5(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6
7"""
8import _string
9import codecs
10import itertools
11import operator
12import struct
13import sys
14import unittest
15import warnings
16from test import support, string_tests
17
# Error handling (bad decoder return)
def search_function(encoding):
    """Codec search hook serving two deliberately broken test codecs.

    For ``test.unicode1`` the encoder and decoder return the bare integer
    42 instead of a tuple.  For ``test.unicode2`` they return the tuple
    ``(42, 42)``, whose first item is not text.  Every other encoding name
    yields ``None``.
    """
    def encode1(input, errors="strict"):
        return 42  # not a tuple

    def decode1(input, errors="strict"):
        return 42  # not a tuple

    def encode2(input, errors="strict"):
        return (42, 42)  # no unicode

    def decode2(input, errors="strict"):
        return (42, 42)  # no unicode

    if encoding == "test.unicode1":
        return (encode1, decode1, None, None)
    if encoding == "test.unicode2":
        return (encode2, decode2, None, None)
    return None


codecs.register(search_function)
35
def duplicate_string(text):
    """
    Try to get a fresh clone of the specified text:
    new object with a reference count of 1.

    This is a best-effort: latin1 single letters and the empty
    string ('') are singletons and cannot be cloned.

    The text is round-tripped through UTF-8 with the 'surrogatepass'
    error handler, so text containing lone surrogates is cloned as well
    instead of raising UnicodeEncodeError.
    """
    encoded = text.encode('utf-8', 'surrogatepass')
    return encoded.decode('utf-8', 'surrogatepass')
45
class StrSubclass(str):
    # Bare subclass of str: it adds no attributes or methods of its own.
    ...
48
49class UnicodeTest(string_tests.CommonTest,
50        string_tests.MixinStrUnicodeUserStringTest,
51        string_tests.MixinStrUnicodeTest,
52        unittest.TestCase):
53
54    type2test = str
55
56    def checkequalnofix(self, result, object, methodname, *args):
57        method = getattr(object, methodname)
58        realresult = method(*args)
59        self.assertEqual(realresult, result)
60        self.assertTrue(type(realresult) is type(result))
61
62        # if the original is returned make sure that
63        # this doesn't happen with subclasses
64        if realresult is object:
65            class usub(str):
66                def __repr__(self):
67                    return 'usub(%r)' % str.__repr__(self)
68            object = usub(object)
69            method = getattr(object, methodname)
70            realresult = method(*args)
71            self.assertEqual(realresult, result)
72            self.assertTrue(object is not realresult)
73
74    def test_literals(self):
75        self.assertEqual('\xff', '\u00ff')
76        self.assertEqual('\uffff', '\U0000ffff')
77        self.assertRaises(SyntaxError, eval, '\'\\Ufffffffe\'')
78        self.assertRaises(SyntaxError, eval, '\'\\Uffffffff\'')
79        self.assertRaises(SyntaxError, eval, '\'\\U%08x\'' % 0x110000)
80        # raw strings should not have unicode escapes
81        self.assertNotEqual(r"\u0020", " ")
82
83    def test_ascii(self):
84        if not sys.platform.startswith('java'):
85            # Test basic sanity of repr()
86            self.assertEqual(ascii('abc'), "'abc'")
87            self.assertEqual(ascii('ab\\c'), "'ab\\\\c'")
88            self.assertEqual(ascii('ab\\'), "'ab\\\\'")
89            self.assertEqual(ascii('\\c'), "'\\\\c'")
90            self.assertEqual(ascii('\\'), "'\\\\'")
91            self.assertEqual(ascii('\n'), "'\\n'")
92            self.assertEqual(ascii('\r'), "'\\r'")
93            self.assertEqual(ascii('\t'), "'\\t'")
94            self.assertEqual(ascii('\b'), "'\\x08'")
95            self.assertEqual(ascii("'\""), """'\\'"'""")
96            self.assertEqual(ascii("'\""), """'\\'"'""")
97            self.assertEqual(ascii("'"), '''"'"''')
98            self.assertEqual(ascii('"'), """'"'""")
99            latin1repr = (
100                "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
101                "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
102                "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
103                "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
104                "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
105                "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
106                "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
107                "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
108                "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
109                "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
110                "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
111                "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
112                "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
113                "\\xfe\\xff'")
114            testrepr = ascii(''.join(map(chr, range(256))))
115            self.assertEqual(testrepr, latin1repr)
116            # Test ascii works on wide unicode escapes without overflow.
117            self.assertEqual(ascii("\U00010000" * 39 + "\uffff" * 4096),
118                             ascii("\U00010000" * 39 + "\uffff" * 4096))
119
120            class WrongRepr:
121                def __repr__(self):
122                    return b'byte-repr'
123            self.assertRaises(TypeError, ascii, WrongRepr())
124
125    def test_repr(self):
126        if not sys.platform.startswith('java'):
127            # Test basic sanity of repr()
128            self.assertEqual(repr('abc'), "'abc'")
129            self.assertEqual(repr('ab\\c'), "'ab\\\\c'")
130            self.assertEqual(repr('ab\\'), "'ab\\\\'")
131            self.assertEqual(repr('\\c'), "'\\\\c'")
132            self.assertEqual(repr('\\'), "'\\\\'")
133            self.assertEqual(repr('\n'), "'\\n'")
134            self.assertEqual(repr('\r'), "'\\r'")
135            self.assertEqual(repr('\t'), "'\\t'")
136            self.assertEqual(repr('\b'), "'\\x08'")
137            self.assertEqual(repr("'\""), """'\\'"'""")
138            self.assertEqual(repr("'\""), """'\\'"'""")
139            self.assertEqual(repr("'"), '''"'"''')
140            self.assertEqual(repr('"'), """'"'""")
141            latin1repr = (
142                "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
143                "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
144                "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
145                "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
146                "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
147                "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
148                "\\x9c\\x9d\\x9e\\x9f\\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9"
149                "\xaa\xab\xac\\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7"
150                "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5"
151                "\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3"
152                "\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1"
153                "\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
154                "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd"
155                "\xfe\xff'")
156            testrepr = repr(''.join(map(chr, range(256))))
157            self.assertEqual(testrepr, latin1repr)
158            # Test repr works on wide unicode escapes without overflow.
159            self.assertEqual(repr("\U00010000" * 39 + "\uffff" * 4096),
160                             repr("\U00010000" * 39 + "\uffff" * 4096))
161
162            class WrongRepr:
163                def __repr__(self):
164                    return b'byte-repr'
165            self.assertRaises(TypeError, repr, WrongRepr())
166
167    def test_iterators(self):
168        # Make sure unicode objects have an __iter__ method
169        it = "\u1111\u2222\u3333".__iter__()
170        self.assertEqual(next(it), "\u1111")
171        self.assertEqual(next(it), "\u2222")
172        self.assertEqual(next(it), "\u3333")
173        self.assertRaises(StopIteration, next, it)
174
175    def test_count(self):
176        string_tests.CommonTest.test_count(self)
177        # check mixed argument types
178        self.checkequalnofix(3,  'aaa', 'count', 'a')
179        self.checkequalnofix(0,  'aaa', 'count', 'b')
180        self.checkequalnofix(3, 'aaa', 'count',  'a')
181        self.checkequalnofix(0, 'aaa', 'count',  'b')
182        self.checkequalnofix(0, 'aaa', 'count',  'b')
183        self.checkequalnofix(1, 'aaa', 'count',  'a', -1)
184        self.checkequalnofix(3, 'aaa', 'count',  'a', -10)
185        self.checkequalnofix(2, 'aaa', 'count',  'a', 0, -1)
186        self.checkequalnofix(0, 'aaa', 'count',  'a', 0, -10)
187        # test mixed kinds
188        self.checkequal(10, '\u0102' + 'a' * 10, 'count', 'a')
189        self.checkequal(10, '\U00100304' + 'a' * 10, 'count', 'a')
190        self.checkequal(10, '\U00100304' + '\u0102' * 10, 'count', '\u0102')
191        self.checkequal(0, 'a' * 10, 'count', '\u0102')
192        self.checkequal(0, 'a' * 10, 'count', '\U00100304')
193        self.checkequal(0, '\u0102' * 10, 'count', '\U00100304')
194        self.checkequal(10, '\u0102' + 'a_' * 10, 'count', 'a_')
195        self.checkequal(10, '\U00100304' + 'a_' * 10, 'count', 'a_')
196        self.checkequal(10, '\U00100304' + '\u0102_' * 10, 'count', '\u0102_')
197        self.checkequal(0, 'a' * 10, 'count', 'a\u0102')
198        self.checkequal(0, 'a' * 10, 'count', 'a\U00100304')
199        self.checkequal(0, '\u0102' * 10, 'count', '\u0102\U00100304')
200
201    def test_find(self):
202        string_tests.CommonTest.test_find(self)
203        # test implementation details of the memchr fast path
204        self.checkequal(100, 'a' * 100 + '\u0102', 'find', '\u0102')
205        self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0201')
206        self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0120')
207        self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0220')
208        self.checkequal(100, 'a' * 100 + '\U00100304', 'find', '\U00100304')
209        self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00100204')
210        self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00102004')
211        # check mixed argument types
212        self.checkequalnofix(0,  'abcdefghiabc', 'find', 'abc')
213        self.checkequalnofix(9,  'abcdefghiabc', 'find', 'abc', 1)
214        self.checkequalnofix(-1, 'abcdefghiabc', 'find', 'def', 4)
215
216        self.assertRaises(TypeError, 'hello'.find)
217        self.assertRaises(TypeError, 'hello'.find, 42)
218        # test mixed kinds
219        self.checkequal(100, '\u0102' * 100 + 'a', 'find', 'a')
220        self.checkequal(100, '\U00100304' * 100 + 'a', 'find', 'a')
221        self.checkequal(100, '\U00100304' * 100 + '\u0102', 'find', '\u0102')
222        self.checkequal(-1, 'a' * 100, 'find', '\u0102')
223        self.checkequal(-1, 'a' * 100, 'find', '\U00100304')
224        self.checkequal(-1, '\u0102' * 100, 'find', '\U00100304')
225        self.checkequal(100, '\u0102' * 100 + 'a_', 'find', 'a_')
226        self.checkequal(100, '\U00100304' * 100 + 'a_', 'find', 'a_')
227        self.checkequal(100, '\U00100304' * 100 + '\u0102_', 'find', '\u0102_')
228        self.checkequal(-1, 'a' * 100, 'find', 'a\u0102')
229        self.checkequal(-1, 'a' * 100, 'find', 'a\U00100304')
230        self.checkequal(-1, '\u0102' * 100, 'find', '\u0102\U00100304')
231
232    def test_rfind(self):
233        string_tests.CommonTest.test_rfind(self)
234        # test implementation details of the memrchr fast path
235        self.checkequal(0, '\u0102' + 'a' * 100 , 'rfind', '\u0102')
236        self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0201')
237        self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0120')
238        self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0220')
239        self.checkequal(0, '\U00100304' + 'a' * 100, 'rfind', '\U00100304')
240        self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00100204')
241        self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00102004')
242        # check mixed argument types
243        self.checkequalnofix(9,   'abcdefghiabc', 'rfind', 'abc')
244        self.checkequalnofix(12,  'abcdefghiabc', 'rfind', '')
245        self.checkequalnofix(12, 'abcdefghiabc', 'rfind',  '')
246        # test mixed kinds
247        self.checkequal(0, 'a' + '\u0102' * 100, 'rfind', 'a')
248        self.checkequal(0, 'a' + '\U00100304' * 100, 'rfind', 'a')
249        self.checkequal(0, '\u0102' + '\U00100304' * 100, 'rfind', '\u0102')
250        self.checkequal(-1, 'a' * 100, 'rfind', '\u0102')
251        self.checkequal(-1, 'a' * 100, 'rfind', '\U00100304')
252        self.checkequal(-1, '\u0102' * 100, 'rfind', '\U00100304')
253        self.checkequal(0, '_a' + '\u0102' * 100, 'rfind', '_a')
254        self.checkequal(0, '_a' + '\U00100304' * 100, 'rfind', '_a')
255        self.checkequal(0, '_\u0102' + '\U00100304' * 100, 'rfind', '_\u0102')
256        self.checkequal(-1, 'a' * 100, 'rfind', '\u0102a')
257        self.checkequal(-1, 'a' * 100, 'rfind', '\U00100304a')
258        self.checkequal(-1, '\u0102' * 100, 'rfind', '\U00100304\u0102')
259
260    def test_index(self):
261        string_tests.CommonTest.test_index(self)
262        self.checkequalnofix(0, 'abcdefghiabc', 'index',  '')
263        self.checkequalnofix(3, 'abcdefghiabc', 'index',  'def')
264        self.checkequalnofix(0, 'abcdefghiabc', 'index',  'abc')
265        self.checkequalnofix(9, 'abcdefghiabc', 'index',  'abc', 1)
266        self.assertRaises(ValueError, 'abcdefghiabc'.index, 'hib')
267        self.assertRaises(ValueError, 'abcdefghiab'.index,  'abc', 1)
268        self.assertRaises(ValueError, 'abcdefghi'.index,  'ghi', 8)
269        self.assertRaises(ValueError, 'abcdefghi'.index,  'ghi', -1)
270        # test mixed kinds
271        self.checkequal(100, '\u0102' * 100 + 'a', 'index', 'a')
272        self.checkequal(100, '\U00100304' * 100 + 'a', 'index', 'a')
273        self.checkequal(100, '\U00100304' * 100 + '\u0102', 'index', '\u0102')
274        self.assertRaises(ValueError, ('a' * 100).index, '\u0102')
275        self.assertRaises(ValueError, ('a' * 100).index, '\U00100304')
276        self.assertRaises(ValueError, ('\u0102' * 100).index, '\U00100304')
277        self.checkequal(100, '\u0102' * 100 + 'a_', 'index', 'a_')
278        self.checkequal(100, '\U00100304' * 100 + 'a_', 'index', 'a_')
279        self.checkequal(100, '\U00100304' * 100 + '\u0102_', 'index', '\u0102_')
280        self.assertRaises(ValueError, ('a' * 100).index, 'a\u0102')
281        self.assertRaises(ValueError, ('a' * 100).index, 'a\U00100304')
282        self.assertRaises(ValueError, ('\u0102' * 100).index, '\u0102\U00100304')
283
284    def test_rindex(self):
285        string_tests.CommonTest.test_rindex(self)
286        self.checkequalnofix(12, 'abcdefghiabc', 'rindex',  '')
287        self.checkequalnofix(3,  'abcdefghiabc', 'rindex',  'def')
288        self.checkequalnofix(9,  'abcdefghiabc', 'rindex',  'abc')
289        self.checkequalnofix(0,  'abcdefghiabc', 'rindex',  'abc', 0, -1)
290
291        self.assertRaises(ValueError, 'abcdefghiabc'.rindex,  'hib')
292        self.assertRaises(ValueError, 'defghiabc'.rindex,  'def', 1)
293        self.assertRaises(ValueError, 'defghiabc'.rindex,  'abc', 0, -1)
294        self.assertRaises(ValueError, 'abcdefghi'.rindex,  'ghi', 0, 8)
295        self.assertRaises(ValueError, 'abcdefghi'.rindex,  'ghi', 0, -1)
296        # test mixed kinds
297        self.checkequal(0, 'a' + '\u0102' * 100, 'rindex', 'a')
298        self.checkequal(0, 'a' + '\U00100304' * 100, 'rindex', 'a')
299        self.checkequal(0, '\u0102' + '\U00100304' * 100, 'rindex', '\u0102')
300        self.assertRaises(ValueError, ('a' * 100).rindex, '\u0102')
301        self.assertRaises(ValueError, ('a' * 100).rindex, '\U00100304')
302        self.assertRaises(ValueError, ('\u0102' * 100).rindex, '\U00100304')
303        self.checkequal(0, '_a' + '\u0102' * 100, 'rindex', '_a')
304        self.checkequal(0, '_a' + '\U00100304' * 100, 'rindex', '_a')
305        self.checkequal(0, '_\u0102' + '\U00100304' * 100, 'rindex', '_\u0102')
306        self.assertRaises(ValueError, ('a' * 100).rindex, '\u0102a')
307        self.assertRaises(ValueError, ('a' * 100).rindex, '\U00100304a')
308        self.assertRaises(ValueError, ('\u0102' * 100).rindex, '\U00100304\u0102')
309
310    def test_maketrans_translate(self):
311        # these work with plain translate()
312        self.checkequalnofix('bbbc', 'abababc', 'translate',
313                             {ord('a'): None})
314        self.checkequalnofix('iiic', 'abababc', 'translate',
315                             {ord('a'): None, ord('b'): ord('i')})
316        self.checkequalnofix('iiix', 'abababc', 'translate',
317                             {ord('a'): None, ord('b'): ord('i'), ord('c'): 'x'})
318        self.checkequalnofix('c', 'abababc', 'translate',
319                             {ord('a'): None, ord('b'): ''})
320        self.checkequalnofix('xyyx', 'xzx', 'translate',
321                             {ord('z'): 'yy'})
322
323        # this needs maketrans()
324        self.checkequalnofix('abababc', 'abababc', 'translate',
325                             {'b': '<i>'})
326        tbl = self.type2test.maketrans({'a': None, 'b': '<i>'})
327        self.checkequalnofix('<i><i><i>c', 'abababc', 'translate', tbl)
328        # test alternative way of calling maketrans()
329        tbl = self.type2test.maketrans('abc', 'xyz', 'd')
330        self.checkequalnofix('xyzzy', 'abdcdcbdddd', 'translate', tbl)
331
332        # various tests switching from ASCII to latin1 or the opposite;
333        # same length, remove a letter, or replace with a longer string.
334        self.assertEqual("[a]".translate(str.maketrans('a', 'X')),
335                         "[X]")
336        self.assertEqual("[a]".translate(str.maketrans({'a': 'X'})),
337                         "[X]")
338        self.assertEqual("[a]".translate(str.maketrans({'a': None})),
339                         "[]")
340        self.assertEqual("[a]".translate(str.maketrans({'a': 'XXX'})),
341                         "[XXX]")
342        self.assertEqual("[a]".translate(str.maketrans({'a': '\xe9'})),
343                         "[\xe9]")
344        self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '123'})),
345                         "x123")
346        self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '\xe9'})),
347                         "x\xe9")
348
349        # test non-ASCII (don't take the fast-path)
350        self.assertEqual("[a]".translate(str.maketrans({'a': '<\xe9>'})),
351                         "[<\xe9>]")
352        self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': 'a'})),
353                         "[a]")
354        self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': None})),
355                         "[]")
356        self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': '123'})),
357                         "[123]")
358        self.assertEqual("[a\xe9]".translate(str.maketrans({'a': '<\u20ac>'})),
359                         "[<\u20ac>\xe9]")
360
361        # invalid Unicode characters
362        invalid_char = 0x10ffff+1
363        for before in "a\xe9\u20ac\U0010ffff":
364            mapping = str.maketrans({before: invalid_char})
365            text = "[%s]" % before
366            self.assertRaises(ValueError, text.translate, mapping)
367
368        # errors
369        self.assertRaises(TypeError, self.type2test.maketrans)
370        self.assertRaises(ValueError, self.type2test.maketrans, 'abc', 'defg')
371        self.assertRaises(TypeError, self.type2test.maketrans, 2, 'def')
372        self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 2)
373        self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 'def', 2)
374        self.assertRaises(ValueError, self.type2test.maketrans, {'xy': 2})
375        self.assertRaises(TypeError, self.type2test.maketrans, {(1,): 2})
376
377        self.assertRaises(TypeError, 'hello'.translate)
378        self.assertRaises(TypeError, 'abababc'.translate, 'abc', 'xyz')
379
380    def test_split(self):
381        string_tests.CommonTest.test_split(self)
382
383        # test mixed kinds
384        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
385            left *= 9
386            right *= 9
387            for delim in ('c', '\u0102', '\U00010302'):
388                self.checkequal([left + right],
389                                left + right, 'split', delim)
390                self.checkequal([left, right],
391                                left + delim + right, 'split', delim)
392                self.checkequal([left + right],
393                                left + right, 'split', delim * 2)
394                self.checkequal([left, right],
395                                left + delim * 2 + right, 'split', delim *2)
396
397    def test_rsplit(self):
398        string_tests.CommonTest.test_rsplit(self)
399        # test mixed kinds
400        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
401            left *= 9
402            right *= 9
403            for delim in ('c', '\u0102', '\U00010302'):
404                self.checkequal([left + right],
405                                left + right, 'rsplit', delim)
406                self.checkequal([left, right],
407                                left + delim + right, 'rsplit', delim)
408                self.checkequal([left + right],
409                                left + right, 'rsplit', delim * 2)
410                self.checkequal([left, right],
411                                left + delim * 2 + right, 'rsplit', delim *2)
412
413    def test_partition(self):
414        string_tests.MixinStrUnicodeUserStringTest.test_partition(self)
415        # test mixed kinds
416        self.checkequal(('ABCDEFGH', '', ''), 'ABCDEFGH', 'partition', '\u4200')
417        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
418            left *= 9
419            right *= 9
420            for delim in ('c', '\u0102', '\U00010302'):
421                self.checkequal((left + right, '', ''),
422                                left + right, 'partition', delim)
423                self.checkequal((left, delim, right),
424                                left + delim + right, 'partition', delim)
425                self.checkequal((left + right, '', ''),
426                                left + right, 'partition', delim * 2)
427                self.checkequal((left, delim * 2, right),
428                                left + delim * 2 + right, 'partition', delim * 2)
429
430    def test_rpartition(self):
431        string_tests.MixinStrUnicodeUserStringTest.test_rpartition(self)
432        # test mixed kinds
433        self.checkequal(('', '', 'ABCDEFGH'), 'ABCDEFGH', 'rpartition', '\u4200')
434        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
435            left *= 9
436            right *= 9
437            for delim in ('c', '\u0102', '\U00010302'):
438                self.checkequal(('', '', left + right),
439                                left + right, 'rpartition', delim)
440                self.checkequal((left, delim, right),
441                                left + delim + right, 'rpartition', delim)
442                self.checkequal(('', '', left + right),
443                                left + right, 'rpartition', delim * 2)
444                self.checkequal((left, delim * 2, right),
445                                left + delim * 2 + right, 'rpartition', delim * 2)
446
447    def test_join(self):
448        string_tests.MixinStrUnicodeUserStringTest.test_join(self)
449
450        class MyWrapper:
451            def __init__(self, sval): self.sval = sval
452            def __str__(self): return self.sval
453
454        # mixed arguments
455        self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
456        self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
457        self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
458        self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
459        self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
460        self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
461        self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
462        self.checkraises(TypeError, ' ', 'join', ['1', '2', MyWrapper('foo')])
463        self.checkraises(TypeError, ' ', 'join', ['1', '2', '3', bytes()])
464        self.checkraises(TypeError, ' ', 'join', [1, 2, 3])
465        self.checkraises(TypeError, ' ', 'join', ['1', '2', 3])
466
467    @unittest.skipIf(sys.maxsize > 2**32,
468        'needs too much memory on a 64-bit platform')
469    def test_join_overflow(self):
470        size = int(sys.maxsize**0.5) + 1
471        seq = ('A' * size,) * size
472        self.assertRaises(OverflowError, ''.join, seq)
473
474    def test_replace(self):
475        string_tests.CommonTest.test_replace(self)
476
477        # method call forwarded from str implementation because of unicode argument
478        self.checkequalnofix('one@two!three!', 'one!two!three!', 'replace', '!', '@', 1)
479        self.assertRaises(TypeError, 'replace'.replace, "r", 42)
480        # test mixed kinds
481        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
482            left *= 9
483            right *= 9
484            for delim in ('c', '\u0102', '\U00010302'):
485                for repl in ('d', '\u0103', '\U00010303'):
486                    self.checkequal(left + right,
487                                    left + right, 'replace', delim, repl)
488                    self.checkequal(left + repl + right,
489                                    left + delim + right,
490                                    'replace', delim, repl)
491                    self.checkequal(left + right,
492                                    left + right, 'replace', delim * 2, repl)
493                    self.checkequal(left + repl + right,
494                                    left + delim * 2 + right,
495                                    'replace', delim * 2, repl)
496
497    @support.cpython_only
498    def test_replace_id(self):
499        pattern = 'abc'
500        text = 'abc def'
501        self.assertIs(text.replace(pattern, pattern), text)
502
503    def test_bytes_comparison(self):
504        with support.check_warnings():
505            warnings.simplefilter('ignore', BytesWarning)
506            self.assertEqual('abc' == b'abc', False)
507            self.assertEqual('abc' != b'abc', True)
508            self.assertEqual('abc' == bytearray(b'abc'), False)
509            self.assertEqual('abc' != bytearray(b'abc'), True)
510
511    def test_comparison(self):
512        # Comparisons:
513        self.assertEqual('abc', 'abc')
514        self.assertTrue('abcd' > 'abc')
515        self.assertTrue('abc' < 'abcd')
516
517        if 0:
518            # Move these tests to a Unicode collation module test...
519            # Testing UTF-16 code point order comparisons...
520
521            # No surrogates, no fixup required.
522            self.assertTrue('\u0061' < '\u20ac')
523            # Non surrogate below surrogate value, no fixup required
524            self.assertTrue('\u0061' < '\ud800\udc02')
525
526            # Non surrogate above surrogate value, fixup required
527            def test_lecmp(s, s2):
528                self.assertTrue(s < s2)
529
530            def test_fixup(s):
531                s2 = '\ud800\udc01'
532                test_lecmp(s, s2)
533                s2 = '\ud900\udc01'
534                test_lecmp(s, s2)
535                s2 = '\uda00\udc01'
536                test_lecmp(s, s2)
537                s2 = '\udb00\udc01'
538                test_lecmp(s, s2)
539                s2 = '\ud800\udd01'
540                test_lecmp(s, s2)
541                s2 = '\ud900\udd01'
542                test_lecmp(s, s2)
543                s2 = '\uda00\udd01'
544                test_lecmp(s, s2)
545                s2 = '\udb00\udd01'
546                test_lecmp(s, s2)
547                s2 = '\ud800\ude01'
548                test_lecmp(s, s2)
549                s2 = '\ud900\ude01'
550                test_lecmp(s, s2)
551                s2 = '\uda00\ude01'
552                test_lecmp(s, s2)
553                s2 = '\udb00\ude01'
554                test_lecmp(s, s2)
555                s2 = '\ud800\udfff'
556                test_lecmp(s, s2)
557                s2 = '\ud900\udfff'
558                test_lecmp(s, s2)
559                s2 = '\uda00\udfff'
560                test_lecmp(s, s2)
561                s2 = '\udb00\udfff'
562                test_lecmp(s, s2)
563
564                test_fixup('\ue000')
565                test_fixup('\uff61')
566
567        # Surrogates on both sides, no fixup required
568        self.assertTrue('\ud800\udc02' < '\ud84d\udc56')
569
570    def test_islower(self):
571        super().test_islower()
572        self.checkequalnofix(False, '\u1FFc', 'islower')
573        self.assertFalse('\u2167'.islower())
574        self.assertTrue('\u2177'.islower())
575        # non-BMP, uppercase
576        self.assertFalse('\U00010401'.islower())
577        self.assertFalse('\U00010427'.islower())
578        # non-BMP, lowercase
579        self.assertTrue('\U00010429'.islower())
580        self.assertTrue('\U0001044E'.islower())
581        # non-BMP, non-cased
582        self.assertFalse('\U0001F40D'.islower())
583        self.assertFalse('\U0001F46F'.islower())
584
585    def test_isupper(self):
586        super().test_isupper()
587        if not sys.platform.startswith('java'):
588            self.checkequalnofix(False, '\u1FFc', 'isupper')
589        self.assertTrue('\u2167'.isupper())
590        self.assertFalse('\u2177'.isupper())
591        # non-BMP, uppercase
592        self.assertTrue('\U00010401'.isupper())
593        self.assertTrue('\U00010427'.isupper())
594        # non-BMP, lowercase
595        self.assertFalse('\U00010429'.isupper())
596        self.assertFalse('\U0001044E'.isupper())
597        # non-BMP, non-cased
598        self.assertFalse('\U0001F40D'.isupper())
599        self.assertFalse('\U0001F46F'.isupper())
600
601    def test_istitle(self):
602        super().test_istitle()
603        self.checkequalnofix(True, '\u1FFc', 'istitle')
604        self.checkequalnofix(True, 'Greek \u1FFcitlecases ...', 'istitle')
605
606        # non-BMP, uppercase + lowercase
607        self.assertTrue('\U00010401\U00010429'.istitle())
608        self.assertTrue('\U00010427\U0001044E'.istitle())
609        # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6
610        for ch in ['\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F']:
611            self.assertFalse(ch.istitle(), '{!a} is not title'.format(ch))
612
613    def test_isspace(self):
614        super().test_isspace()
615        self.checkequalnofix(True, '\u2000', 'isspace')
616        self.checkequalnofix(True, '\u200a', 'isspace')
617        self.checkequalnofix(False, '\u2014', 'isspace')
618        # apparently there are no non-BMP spaces chars in Unicode 6
619        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
620                   '\U0001F40D', '\U0001F46F']:
621            self.assertFalse(ch.isspace(), '{!a} is not space.'.format(ch))
622
623    def test_isalnum(self):
624        super().test_isalnum()
625        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
626                   '\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']:
627            self.assertTrue(ch.isalnum(), '{!a} is alnum.'.format(ch))
628
629    def test_isalpha(self):
630        super().test_isalpha()
631        self.checkequalnofix(True, '\u1FFc', 'isalpha')
632        # non-BMP, cased
633        self.assertTrue('\U00010401'.isalpha())
634        self.assertTrue('\U00010427'.isalpha())
635        self.assertTrue('\U00010429'.isalpha())
636        self.assertTrue('\U0001044E'.isalpha())
637        # non-BMP, non-cased
638        self.assertFalse('\U0001F40D'.isalpha())
639        self.assertFalse('\U0001F46F'.isalpha())
640
641    def test_isascii(self):
642        super().test_isascii()
643        self.assertFalse("\u20ac".isascii())
644        self.assertFalse("\U0010ffff".isascii())
645
646    def test_isdecimal(self):
647        self.checkequalnofix(False, '', 'isdecimal')
648        self.checkequalnofix(False, 'a', 'isdecimal')
649        self.checkequalnofix(True, '0', 'isdecimal')
650        self.checkequalnofix(False, '\u2460', 'isdecimal') # CIRCLED DIGIT ONE
651        self.checkequalnofix(False, '\xbc', 'isdecimal') # VULGAR FRACTION ONE QUARTER
652        self.checkequalnofix(True, '\u0660', 'isdecimal') # ARABIC-INDIC DIGIT ZERO
653        self.checkequalnofix(True, '0123456789', 'isdecimal')
654        self.checkequalnofix(False, '0123456789a', 'isdecimal')
655
656        self.checkraises(TypeError, 'abc', 'isdecimal', 42)
657
658        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
659                   '\U0001F40D', '\U0001F46F', '\U00011065', '\U0001F107']:
660            self.assertFalse(ch.isdecimal(), '{!a} is not decimal.'.format(ch))
661        for ch in ['\U0001D7F6', '\U00011066', '\U000104A0']:
662            self.assertTrue(ch.isdecimal(), '{!a} is decimal.'.format(ch))
663
664    def test_isdigit(self):
665        super().test_isdigit()
666        self.checkequalnofix(True, '\u2460', 'isdigit')
667        self.checkequalnofix(False, '\xbc', 'isdigit')
668        self.checkequalnofix(True, '\u0660', 'isdigit')
669
670        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
671                   '\U0001F40D', '\U0001F46F', '\U00011065']:
672            self.assertFalse(ch.isdigit(), '{!a} is not a digit.'.format(ch))
673        for ch in ['\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']:
674            self.assertTrue(ch.isdigit(), '{!a} is a digit.'.format(ch))
675
676    def test_isnumeric(self):
677        self.checkequalnofix(False, '', 'isnumeric')
678        self.checkequalnofix(False, 'a', 'isnumeric')
679        self.checkequalnofix(True, '0', 'isnumeric')
680        self.checkequalnofix(True, '\u2460', 'isnumeric')
681        self.checkequalnofix(True, '\xbc', 'isnumeric')
682        self.checkequalnofix(True, '\u0660', 'isnumeric')
683        self.checkequalnofix(True, '0123456789', 'isnumeric')
684        self.checkequalnofix(False, '0123456789a', 'isnumeric')
685
686        self.assertRaises(TypeError, "abc".isnumeric, 42)
687
688        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
689                   '\U0001F40D', '\U0001F46F']:
690            self.assertFalse(ch.isnumeric(), '{!a} is not numeric.'.format(ch))
691        for ch in ['\U00011065', '\U0001D7F6', '\U00011066',
692                   '\U000104A0', '\U0001F107']:
693            self.assertTrue(ch.isnumeric(), '{!a} is numeric.'.format(ch))
694
695    def test_isidentifier(self):
696        self.assertTrue("a".isidentifier())
697        self.assertTrue("Z".isidentifier())
698        self.assertTrue("_".isidentifier())
699        self.assertTrue("b0".isidentifier())
700        self.assertTrue("bc".isidentifier())
701        self.assertTrue("b_".isidentifier())
702        self.assertTrue("µ".isidentifier())
703        self.assertTrue("��������������".isidentifier())
704
705        self.assertFalse(" ".isidentifier())
706        self.assertFalse("[".isidentifier())
707        self.assertFalse("©".isidentifier())
708        self.assertFalse("0".isidentifier())
709
710    def test_isprintable(self):
711        self.assertTrue("".isprintable())
712        self.assertTrue(" ".isprintable())
713        self.assertTrue("abcdefg".isprintable())
714        self.assertFalse("abcdefg\n".isprintable())
715        # some defined Unicode character
716        self.assertTrue("\u0374".isprintable())
717        # undefined character
718        self.assertFalse("\u0378".isprintable())
719        # single surrogate character
720        self.assertFalse("\ud800".isprintable())
721
722        self.assertTrue('\U0001F46F'.isprintable())
723        self.assertFalse('\U000E0020'.isprintable())
724
725    def test_surrogates(self):
726        for s in ('a\uD800b\uDFFF', 'a\uDFFFb\uD800',
727                  'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
728            self.assertTrue(s.islower())
729            self.assertFalse(s.isupper())
730            self.assertFalse(s.istitle())
731        for s in ('A\uD800B\uDFFF', 'A\uDFFFB\uD800',
732                  'A\uD800B\uDFFFA', 'A\uDFFFB\uD800A'):
733            self.assertFalse(s.islower())
734            self.assertTrue(s.isupper())
735            self.assertTrue(s.istitle())
736
737        for meth_name in ('islower', 'isupper', 'istitle'):
738            meth = getattr(str, meth_name)
739            for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF'):
740                self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
741
742        for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace',
743                          'isdecimal', 'isnumeric',
744                          'isidentifier', 'isprintable'):
745            meth = getattr(str, meth_name)
746            for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF',
747                      'a\uD800b\uDFFF', 'a\uDFFFb\uD800',
748                      'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
749                self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
750
751
752    def test_lower(self):
753        string_tests.CommonTest.test_lower(self)
754        self.assertEqual('\U00010427'.lower(), '\U0001044F')
755        self.assertEqual('\U00010427\U00010427'.lower(),
756                         '\U0001044F\U0001044F')
757        self.assertEqual('\U00010427\U0001044F'.lower(),
758                         '\U0001044F\U0001044F')
759        self.assertEqual('X\U00010427x\U0001044F'.lower(),
760                         'x\U0001044Fx\U0001044F')
761        self.assertEqual('fi'.lower(), 'fi')
762        self.assertEqual('\u0130'.lower(), '\u0069\u0307')
763        # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
764        self.assertEqual('\u03a3'.lower(), '\u03c3')
765        self.assertEqual('\u0345\u03a3'.lower(), '\u0345\u03c3')
766        self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
767        self.assertEqual('A\u0345\u03a3a'.lower(), 'a\u0345\u03c3a')
768        self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
769        self.assertEqual('A\u03a3\u0345'.lower(), 'a\u03c2\u0345')
770        self.assertEqual('\u03a3\u0345 '.lower(), '\u03c3\u0345 ')
771        self.assertEqual('\U0008fffe'.lower(), '\U0008fffe')
772        self.assertEqual('\u2177'.lower(), '\u2177')
773
774    def test_casefold(self):
775        self.assertEqual('hello'.casefold(), 'hello')
776        self.assertEqual('hELlo'.casefold(), 'hello')
777        self.assertEqual('ß'.casefold(), 'ss')
778        self.assertEqual('fi'.casefold(), 'fi')
779        self.assertEqual('\u03a3'.casefold(), '\u03c3')
780        self.assertEqual('A\u0345\u03a3'.casefold(), 'a\u03b9\u03c3')
781        self.assertEqual('\u00b5'.casefold(), '\u03bc')
782
783    def test_upper(self):
784        string_tests.CommonTest.test_upper(self)
785        self.assertEqual('\U0001044F'.upper(), '\U00010427')
786        self.assertEqual('\U0001044F\U0001044F'.upper(),
787                         '\U00010427\U00010427')
788        self.assertEqual('\U00010427\U0001044F'.upper(),
789                         '\U00010427\U00010427')
790        self.assertEqual('X\U00010427x\U0001044F'.upper(),
791                         'X\U00010427X\U00010427')
792        self.assertEqual('fi'.upper(), 'FI')
793        self.assertEqual('\u0130'.upper(), '\u0130')
794        self.assertEqual('\u03a3'.upper(), '\u03a3')
795        self.assertEqual('ß'.upper(), 'SS')
796        self.assertEqual('\u1fd2'.upper(), '\u0399\u0308\u0300')
797        self.assertEqual('\U0008fffe'.upper(), '\U0008fffe')
798        self.assertEqual('\u2177'.upper(), '\u2167')
799
800    def test_capitalize(self):
801        string_tests.CommonTest.test_capitalize(self)
802        self.assertEqual('\U0001044F'.capitalize(), '\U00010427')
803        self.assertEqual('\U0001044F\U0001044F'.capitalize(),
804                         '\U00010427\U0001044F')
805        self.assertEqual('\U00010427\U0001044F'.capitalize(),
806                         '\U00010427\U0001044F')
807        self.assertEqual('\U0001044F\U00010427'.capitalize(),
808                         '\U00010427\U0001044F')
809        self.assertEqual('X\U00010427x\U0001044F'.capitalize(),
810                         'X\U0001044Fx\U0001044F')
811        self.assertEqual('h\u0130'.capitalize(), 'H\u0069\u0307')
812        exp = '\u0399\u0308\u0300\u0069\u0307'
813        self.assertEqual('\u1fd2\u0130'.capitalize(), exp)
814        self.assertEqual('finnish'.capitalize(), 'FInnish')
815        self.assertEqual('A\u0345\u03a3'.capitalize(), 'A\u0345\u03c2')
816
817    def test_title(self):
818        super().test_title()
819        self.assertEqual('\U0001044F'.title(), '\U00010427')
820        self.assertEqual('\U0001044F\U0001044F'.title(),
821                         '\U00010427\U0001044F')
822        self.assertEqual('\U0001044F\U0001044F \U0001044F\U0001044F'.title(),
823                         '\U00010427\U0001044F \U00010427\U0001044F')
824        self.assertEqual('\U00010427\U0001044F \U00010427\U0001044F'.title(),
825                         '\U00010427\U0001044F \U00010427\U0001044F')
826        self.assertEqual('\U0001044F\U00010427 \U0001044F\U00010427'.title(),
827                         '\U00010427\U0001044F \U00010427\U0001044F')
828        self.assertEqual('X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(),
829                         'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F')
830        self.assertEqual('fiNNISH'.title(), 'Finnish')
831        self.assertEqual('A\u03a3 \u1fa1xy'.title(), 'A\u03c2 \u1fa9xy')
832        self.assertEqual('A\u03a3A'.title(), 'A\u03c3a')
833
834    def test_swapcase(self):
835        string_tests.CommonTest.test_swapcase(self)
836        self.assertEqual('\U0001044F'.swapcase(), '\U00010427')
837        self.assertEqual('\U00010427'.swapcase(), '\U0001044F')
838        self.assertEqual('\U0001044F\U0001044F'.swapcase(),
839                         '\U00010427\U00010427')
840        self.assertEqual('\U00010427\U0001044F'.swapcase(),
841                         '\U0001044F\U00010427')
842        self.assertEqual('\U0001044F\U00010427'.swapcase(),
843                         '\U00010427\U0001044F')
844        self.assertEqual('X\U00010427x\U0001044F'.swapcase(),
845                         'x\U0001044FX\U00010427')
846        self.assertEqual('fi'.swapcase(), 'FI')
847        self.assertEqual('\u0130'.swapcase(), '\u0069\u0307')
848        # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
849        self.assertEqual('\u03a3'.swapcase(), '\u03c3')
850        self.assertEqual('\u0345\u03a3'.swapcase(), '\u0399\u03c3')
851        self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
852        self.assertEqual('A\u0345\u03a3a'.swapcase(), 'a\u0399\u03c3A')
853        self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
854        self.assertEqual('A\u03a3\u0345'.swapcase(), 'a\u03c2\u0399')
855        self.assertEqual('\u03a3\u0345 '.swapcase(), '\u03c3\u0399 ')
856        self.assertEqual('\u03a3'.swapcase(), '\u03c3')
857        self.assertEqual('ß'.swapcase(), 'SS')
858        self.assertEqual('\u1fd2'.swapcase(), '\u0399\u0308\u0300')
859
860    def test_center(self):
861        string_tests.CommonTest.test_center(self)
862        self.assertEqual('x'.center(2, '\U0010FFFF'),
863                         'x\U0010FFFF')
864        self.assertEqual('x'.center(3, '\U0010FFFF'),
865                         '\U0010FFFFx\U0010FFFF')
866        self.assertEqual('x'.center(4, '\U0010FFFF'),
867                         '\U0010FFFFx\U0010FFFF\U0010FFFF')
868
869    @unittest.skipUnless(sys.maxsize == 2**31 - 1, "requires 32-bit system")
870    @support.cpython_only
871    def test_case_operation_overflow(self):
872        # Issue #22643
873        size = 2**32//12 + 1
874        try:
875            s = "ü" * size
876        except MemoryError:
877            self.skipTest('no enough memory (%.0f MiB required)' % (size / 2**20))
878        try:
879            self.assertRaises(OverflowError, s.upper)
880        finally:
881            del s
882
883    def test_contains(self):
884        # Testing Unicode contains method
885        self.assertIn('a', 'abdb')
886        self.assertIn('a', 'bdab')
887        self.assertIn('a', 'bdaba')
888        self.assertIn('a', 'bdba')
889        self.assertNotIn('a', 'bdb')
890        self.assertIn('a', 'bdba')
891        self.assertIn('a', ('a',1,None))
892        self.assertIn('a', (1,None,'a'))
893        self.assertIn('a', ('a',1,None))
894        self.assertIn('a', (1,None,'a'))
895        self.assertNotIn('a', ('x',1,'y'))
896        self.assertNotIn('a', ('x',1,None))
897        self.assertNotIn('abcd', 'abcxxxx')
898        self.assertIn('ab', 'abcd')
899        self.assertIn('ab', 'abc')
900        self.assertIn('ab', (1,None,'ab'))
901        self.assertIn('', 'abc')
902        self.assertIn('', '')
903        self.assertIn('', 'abc')
904        self.assertNotIn('\0', 'abc')
905        self.assertIn('\0', '\0abc')
906        self.assertIn('\0', 'abc\0')
907        self.assertIn('a', '\0abc')
908        self.assertIn('asdf', 'asdf')
909        self.assertNotIn('asdf', 'asd')
910        self.assertNotIn('asdf', '')
911
912        self.assertRaises(TypeError, "abc".__contains__)
913        # test mixed kinds
914        for fill in ('a', '\u0100', '\U00010300'):
915            fill *= 9
916            for delim in ('c', '\u0102', '\U00010302'):
917                self.assertNotIn(delim, fill)
918                self.assertIn(delim, fill + delim)
919                self.assertNotIn(delim * 2, fill)
920                self.assertIn(delim * 2, fill + delim * 2)
921
922    def test_issue18183(self):
923        '\U00010000\U00100000'.lower()
924        '\U00010000\U00100000'.casefold()
925        '\U00010000\U00100000'.upper()
926        '\U00010000\U00100000'.capitalize()
927        '\U00010000\U00100000'.title()
928        '\U00010000\U00100000'.swapcase()
929        '\U00100000'.center(3, '\U00010000')
930        '\U00100000'.ljust(3, '\U00010000')
931        '\U00100000'.rjust(3, '\U00010000')
932
933    def test_format(self):
934        self.assertEqual(''.format(), '')
935        self.assertEqual('a'.format(), 'a')
936        self.assertEqual('ab'.format(), 'ab')
937        self.assertEqual('a{{'.format(), 'a{')
938        self.assertEqual('a}}'.format(), 'a}')
939        self.assertEqual('{{b'.format(), '{b')
940        self.assertEqual('}}b'.format(), '}b')
941        self.assertEqual('a{{b'.format(), 'a{b')
942
943        # examples from the PEP:
944        import datetime
945        self.assertEqual("My name is {0}".format('Fred'), "My name is Fred")
946        self.assertEqual("My name is {0[name]}".format(dict(name='Fred')),
947                         "My name is Fred")
948        self.assertEqual("My name is {0} :-{{}}".format('Fred'),
949                         "My name is Fred :-{}")
950
951        d = datetime.date(2007, 8, 18)
952        self.assertEqual("The year is {0.year}".format(d),
953                         "The year is 2007")
954
955        # classes we'll use for testing
956        class C:
957            def __init__(self, x=100):
958                self._x = x
959            def __format__(self, spec):
960                return spec
961
962        class D:
963            def __init__(self, x):
964                self.x = x
965            def __format__(self, spec):
966                return str(self.x)
967
968        # class with __str__, but no __format__
969        class E:
970            def __init__(self, x):
971                self.x = x
972            def __str__(self):
973                return 'E(' + self.x + ')'
974
975        # class with __repr__, but no __format__ or __str__
976        class F:
977            def __init__(self, x):
978                self.x = x
979            def __repr__(self):
980                return 'F(' + self.x + ')'
981
982        # class with __format__ that forwards to string, for some format_spec's
983        class G:
984            def __init__(self, x):
985                self.x = x
986            def __str__(self):
987                return "string is " + self.x
988            def __format__(self, format_spec):
989                if format_spec == 'd':
990                    return 'G(' + self.x + ')'
991                return object.__format__(self, format_spec)
992
993        class I(datetime.date):
994            def __format__(self, format_spec):
995                return self.strftime(format_spec)
996
997        class J(int):
998            def __format__(self, format_spec):
999                return int.__format__(self * 2, format_spec)
1000
1001        class M:
1002            def __init__(self, x):
1003                self.x = x
1004            def __repr__(self):
1005                return 'M(' + self.x + ')'
1006            __str__ = None
1007
1008        class N:
1009            def __init__(self, x):
1010                self.x = x
1011            def __repr__(self):
1012                return 'N(' + self.x + ')'
1013            __format__ = None
1014
1015        self.assertEqual(''.format(), '')
1016        self.assertEqual('abc'.format(), 'abc')
1017        self.assertEqual('{0}'.format('abc'), 'abc')
1018        self.assertEqual('{0:}'.format('abc'), 'abc')
1019#        self.assertEqual('{ 0 }'.format('abc'), 'abc')
1020        self.assertEqual('X{0}'.format('abc'), 'Xabc')
1021        self.assertEqual('{0}X'.format('abc'), 'abcX')
1022        self.assertEqual('X{0}Y'.format('abc'), 'XabcY')
1023        self.assertEqual('{1}'.format(1, 'abc'), 'abc')
1024        self.assertEqual('X{1}'.format(1, 'abc'), 'Xabc')
1025        self.assertEqual('{1}X'.format(1, 'abc'), 'abcX')
1026        self.assertEqual('X{1}Y'.format(1, 'abc'), 'XabcY')
1027        self.assertEqual('{0}'.format(-15), '-15')
1028        self.assertEqual('{0}{1}'.format(-15, 'abc'), '-15abc')
1029        self.assertEqual('{0}X{1}'.format(-15, 'abc'), '-15Xabc')
1030        self.assertEqual('{{'.format(), '{')
1031        self.assertEqual('}}'.format(), '}')
1032        self.assertEqual('{{}}'.format(), '{}')
1033        self.assertEqual('{{x}}'.format(), '{x}')
1034        self.assertEqual('{{{0}}}'.format(123), '{123}')
1035        self.assertEqual('{{{{0}}}}'.format(), '{{0}}')
1036        self.assertEqual('}}{{'.format(), '}{')
1037        self.assertEqual('}}x{{'.format(), '}x{')
1038
1039        # weird field names
1040        self.assertEqual("{0[foo-bar]}".format({'foo-bar':'baz'}), 'baz')
1041        self.assertEqual("{0[foo bar]}".format({'foo bar':'baz'}), 'baz')
1042        self.assertEqual("{0[ ]}".format({' ':3}), '3')
1043
1044        self.assertEqual('{foo._x}'.format(foo=C(20)), '20')
1045        self.assertEqual('{1}{0}'.format(D(10), D(20)), '2010')
1046        self.assertEqual('{0._x.x}'.format(C(D('abc'))), 'abc')
1047        self.assertEqual('{0[0]}'.format(['abc', 'def']), 'abc')
1048        self.assertEqual('{0[1]}'.format(['abc', 'def']), 'def')
1049        self.assertEqual('{0[1][0]}'.format(['abc', ['def']]), 'def')
1050        self.assertEqual('{0[1][0].x}'.format(['abc', [D('def')]]), 'def')
1051
1052        # strings
1053        self.assertEqual('{0:.3s}'.format('abc'), 'abc')
1054        self.assertEqual('{0:.3s}'.format('ab'), 'ab')
1055        self.assertEqual('{0:.3s}'.format('abcdef'), 'abc')
1056        self.assertEqual('{0:.0s}'.format('abcdef'), '')
1057        self.assertEqual('{0:3.3s}'.format('abc'), 'abc')
1058        self.assertEqual('{0:2.3s}'.format('abc'), 'abc')
1059        self.assertEqual('{0:2.2s}'.format('abc'), 'ab')
1060        self.assertEqual('{0:3.2s}'.format('abc'), 'ab ')
1061        self.assertEqual('{0:x<0s}'.format('result'), 'result')
1062        self.assertEqual('{0:x<5s}'.format('result'), 'result')
1063        self.assertEqual('{0:x<6s}'.format('result'), 'result')
1064        self.assertEqual('{0:x<7s}'.format('result'), 'resultx')
1065        self.assertEqual('{0:x<8s}'.format('result'), 'resultxx')
1066        self.assertEqual('{0: <7s}'.format('result'), 'result ')
1067        self.assertEqual('{0:<7s}'.format('result'), 'result ')
1068        self.assertEqual('{0:>7s}'.format('result'), ' result')
1069        self.assertEqual('{0:>8s}'.format('result'), '  result')
1070        self.assertEqual('{0:^8s}'.format('result'), ' result ')
1071        self.assertEqual('{0:^9s}'.format('result'), ' result  ')
1072        self.assertEqual('{0:^10s}'.format('result'), '  result  ')
1073        self.assertEqual('{0:10000}'.format('a'), 'a' + ' ' * 9999)
1074        self.assertEqual('{0:10000}'.format(''), ' ' * 10000)
1075        self.assertEqual('{0:10000000}'.format(''), ' ' * 10000000)
1076
1077        # issue 12546: use \x00 as a fill character
1078        self.assertEqual('{0:\x00<6s}'.format('foo'), 'foo\x00\x00\x00')
1079        self.assertEqual('{0:\x01<6s}'.format('foo'), 'foo\x01\x01\x01')
1080        self.assertEqual('{0:\x00^6s}'.format('foo'), '\x00foo\x00\x00')
1081        self.assertEqual('{0:^6s}'.format('foo'), ' foo  ')
1082
1083        self.assertEqual('{0:\x00<6}'.format(3), '3\x00\x00\x00\x00\x00')
1084        self.assertEqual('{0:\x01<6}'.format(3), '3\x01\x01\x01\x01\x01')
1085        self.assertEqual('{0:\x00^6}'.format(3), '\x00\x003\x00\x00\x00')
1086        self.assertEqual('{0:<6}'.format(3), '3     ')
1087
1088        self.assertEqual('{0:\x00<6}'.format(3.14), '3.14\x00\x00')
1089        self.assertEqual('{0:\x01<6}'.format(3.14), '3.14\x01\x01')
1090        self.assertEqual('{0:\x00^6}'.format(3.14), '\x003.14\x00')
1091        self.assertEqual('{0:^6}'.format(3.14), ' 3.14 ')
1092
1093        self.assertEqual('{0:\x00<12}'.format(3+2.0j), '(3+2j)\x00\x00\x00\x00\x00\x00')
1094        self.assertEqual('{0:\x01<12}'.format(3+2.0j), '(3+2j)\x01\x01\x01\x01\x01\x01')
1095        self.assertEqual('{0:\x00^12}'.format(3+2.0j), '\x00\x00\x00(3+2j)\x00\x00\x00')
1096        self.assertEqual('{0:^12}'.format(3+2.0j), '   (3+2j)   ')
1097
1098        # format specifiers for user defined type
1099        self.assertEqual('{0:abc}'.format(C()), 'abc')
1100
1101        # !r, !s and !a coercions
1102        self.assertEqual('{0!s}'.format('Hello'), 'Hello')
1103        self.assertEqual('{0!s:}'.format('Hello'), 'Hello')
1104        self.assertEqual('{0!s:15}'.format('Hello'), 'Hello          ')
1105        self.assertEqual('{0!s:15s}'.format('Hello'), 'Hello          ')
1106        self.assertEqual('{0!r}'.format('Hello'), "'Hello'")
1107        self.assertEqual('{0!r:}'.format('Hello'), "'Hello'")
1108        self.assertEqual('{0!r}'.format(F('Hello')), 'F(Hello)')
1109        self.assertEqual('{0!r}'.format('\u0378'), "'\\u0378'") # nonprintable
1110        self.assertEqual('{0!r}'.format('\u0374'), "'\u0374'")  # printable
1111        self.assertEqual('{0!r}'.format(F('\u0374')), 'F(\u0374)')
1112        self.assertEqual('{0!a}'.format('Hello'), "'Hello'")
1113        self.assertEqual('{0!a}'.format('\u0378'), "'\\u0378'") # nonprintable
1114        self.assertEqual('{0!a}'.format('\u0374'), "'\\u0374'") # printable
1115        self.assertEqual('{0!a:}'.format('Hello'), "'Hello'")
1116        self.assertEqual('{0!a}'.format(F('Hello')), 'F(Hello)')
1117        self.assertEqual('{0!a}'.format(F('\u0374')), 'F(\\u0374)')
1118
1119        # test fallback to object.__format__
1120        self.assertEqual('{0}'.format({}), '{}')
1121        self.assertEqual('{0}'.format([]), '[]')
1122        self.assertEqual('{0}'.format([1]), '[1]')
1123
1124        self.assertEqual('{0:d}'.format(G('data')), 'G(data)')
1125        self.assertEqual('{0!s}'.format(G('data')), 'string is data')
1126
1127        self.assertRaises(TypeError, '{0:^10}'.format, E('data'))
1128        self.assertRaises(TypeError, '{0:^10s}'.format, E('data'))
1129        self.assertRaises(TypeError, '{0:>15s}'.format, G('data'))
1130
1131        self.assertEqual("{0:date: %Y-%m-%d}".format(I(year=2007,
1132                                                       month=8,
1133                                                       day=27)),
1134                         "date: 2007-08-27")
1135
1136        # test deriving from a builtin type and overriding __format__
1137        self.assertEqual("{0}".format(J(10)), "20")
1138
1139
1140        # string format specifiers
1141        self.assertEqual('{0:}'.format('a'), 'a')
1142
1143        # computed format specifiers
1144        self.assertEqual("{0:.{1}}".format('hello world', 5), 'hello')
1145        self.assertEqual("{0:.{1}s}".format('hello world', 5), 'hello')
1146        self.assertEqual("{0:.{precision}s}".format('hello world', precision=5), 'hello')
1147        self.assertEqual("{0:{width}.{precision}s}".format('hello world', width=10, precision=5), 'hello     ')
1148        self.assertEqual("{0:{width}.{precision}s}".format('hello world', width='10', precision='5'), 'hello     ')
1149
1150        # test various errors
1151        self.assertRaises(ValueError, '{'.format)
1152        self.assertRaises(ValueError, '}'.format)
1153        self.assertRaises(ValueError, 'a{'.format)
1154        self.assertRaises(ValueError, 'a}'.format)
1155        self.assertRaises(ValueError, '{a'.format)
1156        self.assertRaises(ValueError, '}a'.format)
1157        self.assertRaises(IndexError, '{0}'.format)
1158        self.assertRaises(IndexError, '{1}'.format, 'abc')
1159        self.assertRaises(KeyError,   '{x}'.format)
1160        self.assertRaises(ValueError, "}{".format)
1161        self.assertRaises(ValueError, "abc{0:{}".format)
1162        self.assertRaises(ValueError, "{0".format)
1163        self.assertRaises(IndexError, "{0.}".format)
1164        self.assertRaises(ValueError, "{0.}".format, 0)
1165        self.assertRaises(ValueError, "{0[}".format)
1166        self.assertRaises(ValueError, "{0[}".format, [])
1167        self.assertRaises(KeyError,   "{0]}".format)
1168        self.assertRaises(ValueError, "{0.[]}".format, 0)
1169        self.assertRaises(ValueError, "{0..foo}".format, 0)
1170        self.assertRaises(ValueError, "{0[0}".format, 0)
1171        self.assertRaises(ValueError, "{0[0:foo}".format, 0)
1172        self.assertRaises(KeyError,   "{c]}".format)
1173        self.assertRaises(ValueError, "{{ {{{0}}".format, 0)
1174        self.assertRaises(ValueError, "{0}}".format, 0)
1175        self.assertRaises(KeyError,   "{foo}".format, bar=3)
1176        self.assertRaises(ValueError, "{0!x}".format, 3)
1177        self.assertRaises(ValueError, "{0!}".format, 0)
1178        self.assertRaises(ValueError, "{0!rs}".format, 0)
1179        self.assertRaises(ValueError, "{!}".format)
1180        self.assertRaises(IndexError, "{:}".format)
1181        self.assertRaises(IndexError, "{:s}".format)
1182        self.assertRaises(IndexError, "{}".format)
1183        big = "23098475029384702983476098230754973209482573"
1184        self.assertRaises(ValueError, ("{" + big + "}").format)
1185        self.assertRaises(ValueError, ("{[" + big + "]}").format, [0])
1186
1187        # issue 6089
1188        self.assertRaises(ValueError, "{0[0]x}".format, [None])
1189        self.assertRaises(ValueError, "{0[0](10)}".format, [None])
1190
1191        # can't have a replacement on the field name portion
1192        self.assertRaises(TypeError, '{0[{1}]}'.format, 'abcdefg', 4)
1193
1194        # exceed maximum recursion depth
1195        self.assertRaises(ValueError, "{0:{1:{2}}}".format, 'abc', 's', '')
1196        self.assertRaises(ValueError, "{0:{1:{2:{3:{4:{5:{6}}}}}}}".format,
1197                          0, 1, 2, 3, 4, 5, 6, 7)
1198
1199        # string format spec errors
1200        self.assertRaises(ValueError, "{0:-s}".format, '')
1201        self.assertRaises(ValueError, format, "", "-")
1202        self.assertRaises(ValueError, "{0:=s}".format, '')
1203
1204        # Alternate formatting is not supported
1205        self.assertRaises(ValueError, format, '', '#')
1206        self.assertRaises(ValueError, format, '', '#20')
1207
1208        # Non-ASCII
1209        self.assertEqual("{0:s}{1:s}".format("ABC", "\u0410\u0411\u0412"),
1210                         'ABC\u0410\u0411\u0412')
1211        self.assertEqual("{0:.3s}".format("ABC\u0410\u0411\u0412"),
1212                         'ABC')
1213        self.assertEqual("{0:.0s}".format("ABC\u0410\u0411\u0412"),
1214                         '')
1215
1216        self.assertEqual("{[{}]}".format({"{}": 5}), "5")
1217        self.assertEqual("{[{}]}".format({"{}" : "a"}), "a")
1218        self.assertEqual("{[{]}".format({"{" : "a"}), "a")
1219        self.assertEqual("{[}]}".format({"}" : "a"}), "a")
1220        self.assertEqual("{[[]}".format({"[" : "a"}), "a")
1221        self.assertEqual("{[!]}".format({"!" : "a"}), "a")
1222        self.assertRaises(ValueError, "{a{}b}".format, 42)
1223        self.assertRaises(ValueError, "{a{b}".format, 42)
1224        self.assertRaises(ValueError, "{[}".format, 42)
1225
1226        self.assertEqual("0x{:0{:d}X}".format(0x0,16), "0x0000000000000000")
1227
1228        # Blocking fallback
1229        m = M('data')
1230        self.assertEqual("{!r}".format(m), 'M(data)')
1231        self.assertRaises(TypeError, "{!s}".format, m)
1232        self.assertRaises(TypeError, "{}".format, m)
1233        n = N('data')
1234        self.assertEqual("{!r}".format(n), 'N(data)')
1235        self.assertEqual("{!s}".format(n), 'N(data)')
1236        self.assertRaises(TypeError, "{}".format, n)
1237
1238    def test_format_map(self):
1239        self.assertEqual(''.format_map({}), '')
1240        self.assertEqual('a'.format_map({}), 'a')
1241        self.assertEqual('ab'.format_map({}), 'ab')
1242        self.assertEqual('a{{'.format_map({}), 'a{')
1243        self.assertEqual('a}}'.format_map({}), 'a}')
1244        self.assertEqual('{{b'.format_map({}), '{b')
1245        self.assertEqual('}}b'.format_map({}), '}b')
1246        self.assertEqual('a{{b'.format_map({}), 'a{b')
1247
1248        # using mappings
1249        class Mapping(dict):
1250            def __missing__(self, key):
1251                return key
1252        self.assertEqual('{hello}'.format_map(Mapping()), 'hello')
1253        self.assertEqual('{a} {world}'.format_map(Mapping(a='hello')), 'hello world')
1254
1255        class InternalMapping:
1256            def __init__(self):
1257                self.mapping = {'a': 'hello'}
1258            def __getitem__(self, key):
1259                return self.mapping[key]
1260        self.assertEqual('{a}'.format_map(InternalMapping()), 'hello')
1261
1262
1263        class C:
1264            def __init__(self, x=100):
1265                self._x = x
1266            def __format__(self, spec):
1267                return spec
1268        self.assertEqual('{foo._x}'.format_map({'foo': C(20)}), '20')
1269
1270        # test various errors
1271        self.assertRaises(TypeError, ''.format_map)
1272        self.assertRaises(TypeError, 'a'.format_map)
1273
1274        self.assertRaises(ValueError, '{'.format_map, {})
1275        self.assertRaises(ValueError, '}'.format_map, {})
1276        self.assertRaises(ValueError, 'a{'.format_map, {})
1277        self.assertRaises(ValueError, 'a}'.format_map, {})
1278        self.assertRaises(ValueError, '{a'.format_map, {})
1279        self.assertRaises(ValueError, '}a'.format_map, {})
1280
1281        # issue #12579: can't supply positional params to format_map
1282        self.assertRaises(ValueError, '{}'.format_map, {'a' : 2})
1283        self.assertRaises(ValueError, '{}'.format_map, 'a')
1284        self.assertRaises(ValueError, '{a} {}'.format_map, {"a" : 2, "b" : 1})
1285
1286        class BadMapping:
1287            def __getitem__(self, key):
1288                return 1/0
1289        self.assertRaises(KeyError, '{a}'.format_map, {})
1290        self.assertRaises(TypeError, '{a}'.format_map, [])
1291        self.assertRaises(ZeroDivisionError, '{a}'.format_map, BadMapping())
1292
1293    def test_format_huge_precision(self):
1294        format_string = ".{}f".format(sys.maxsize + 1)
1295        with self.assertRaises(ValueError):
1296            result = format(2.34, format_string)
1297
1298    def test_format_huge_width(self):
1299        format_string = "{}f".format(sys.maxsize + 1)
1300        with self.assertRaises(ValueError):
1301            result = format(2.34, format_string)
1302
1303    def test_format_huge_item_number(self):
1304        format_string = "{{{}:.6f}}".format(sys.maxsize + 1)
1305        with self.assertRaises(ValueError):
1306            result = format_string.format(2.34)
1307
1308    def test_format_auto_numbering(self):
1309        class C:
1310            def __init__(self, x=100):
1311                self._x = x
1312            def __format__(self, spec):
1313                return spec
1314
1315        self.assertEqual('{}'.format(10), '10')
1316        self.assertEqual('{:5}'.format('s'), 's    ')
1317        self.assertEqual('{!r}'.format('s'), "'s'")
1318        self.assertEqual('{._x}'.format(C(10)), '10')
1319        self.assertEqual('{[1]}'.format([1, 2]), '2')
1320        self.assertEqual('{[a]}'.format({'a':4, 'b':2}), '4')
1321        self.assertEqual('a{}b{}c'.format(0, 1), 'a0b1c')
1322
1323        self.assertEqual('a{:{}}b'.format('x', '^10'), 'a    x     b')
1324        self.assertEqual('a{:{}x}b'.format(20, '#'), 'a0x14b')
1325
1326        # can't mix and match numbering and auto-numbering
1327        self.assertRaises(ValueError, '{}{1}'.format, 1, 2)
1328        self.assertRaises(ValueError, '{1}{}'.format, 1, 2)
1329        self.assertRaises(ValueError, '{:{1}}'.format, 1, 2)
1330        self.assertRaises(ValueError, '{0:{}}'.format, 1, 2)
1331
1332        # can mix and match auto-numbering and named
1333        self.assertEqual('{f}{}'.format(4, f='test'), 'test4')
1334        self.assertEqual('{}{f}'.format(4, f='test'), '4test')
1335        self.assertEqual('{:{f}}{g}{}'.format(1, 3, g='g', f=2), ' 1g3')
1336        self.assertEqual('{f:{}}{}{g}'.format(2, 4, f=1, g='g'), ' 14g')
1337
1338    def test_formatting(self):
1339        string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
1340        # Testing Unicode formatting strings...
1341        self.assertEqual("%s, %s" % ("abc", "abc"), 'abc, abc')
1342        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, 2, 3), 'abc, abc, 1, 2.000000,  3.00')
1343        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, -2, 3), 'abc, abc, 1, -2.000000,  3.00')
1344        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.5), 'abc, abc, -1, -2.000000,  3.50')
1345        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.57), 'abc, abc, -1, -2.000000,  3.57')
1346        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 1003.57), 'abc, abc, -1, -2.000000, 1003.57')
1347        if not sys.platform.startswith('java'):
1348            self.assertEqual("%r, %r" % (b"abc", "abc"), "b'abc', 'abc'")
1349            self.assertEqual("%r" % ("\u1234",), "'\u1234'")
1350            self.assertEqual("%a" % ("\u1234",), "'\\u1234'")
1351        self.assertEqual("%(x)s, %(y)s" % {'x':"abc", 'y':"def"}, 'abc, def')
1352        self.assertEqual("%(x)s, %(\xfc)s" % {'x':"abc", '\xfc':"def"}, 'abc, def')
1353
1354        self.assertEqual('%c' % 0x1234, '\u1234')
1355        self.assertEqual('%c' % 0x21483, '\U00021483')
1356        self.assertRaises(OverflowError, "%c".__mod__, (0x110000,))
1357        self.assertEqual('%c' % '\U00021483', '\U00021483')
1358        self.assertRaises(TypeError, "%c".__mod__, "aa")
1359        self.assertRaises(ValueError, "%.1\u1032f".__mod__, (1.0/3))
1360        self.assertRaises(TypeError, "%i".__mod__, "aa")
1361
1362        # formatting jobs delegated from the string implementation:
1363        self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1364        self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1365        self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1366        self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1367        self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123},  '...abc...')
1368        self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...')
1369        self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,"abc"), '...1...2...3...abc...')
1370        self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,"abc"), '...%...%s...1...2...3...abc...')
1371        self.assertEqual('...%s...' % "abc", '...abc...')
1372        self.assertEqual('%*s' % (5,'abc',), '  abc')
1373        self.assertEqual('%*s' % (-5,'abc',), 'abc  ')
1374        self.assertEqual('%*.*s' % (5,2,'abc',), '   ab')
1375        self.assertEqual('%*.*s' % (5,3,'abc',), '  abc')
1376        self.assertEqual('%i %*.*s' % (10, 5,3,'abc',), '10   abc')
1377        self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, 'abc',), '103   abc')
1378        self.assertEqual('%c' % 'a', 'a')
1379        class Wrapper:
1380            def __str__(self):
1381                return '\u1234'
1382        self.assertEqual('%s' % Wrapper(), '\u1234')
1383
1384        # issue 3382
1385        NAN = float('nan')
1386        INF = float('inf')
1387        self.assertEqual('%f' % NAN, 'nan')
1388        self.assertEqual('%F' % NAN, 'NAN')
1389        self.assertEqual('%f' % INF, 'inf')
1390        self.assertEqual('%F' % INF, 'INF')
1391
1392        # PEP 393
1393        self.assertEqual('%.1s' % "a\xe9\u20ac", 'a')
1394        self.assertEqual('%.2s' % "a\xe9\u20ac", 'a\xe9')
1395
1396        #issue 19995
1397        class PseudoInt:
1398            def __init__(self, value):
1399                self.value = int(value)
1400            def __int__(self):
1401                return self.value
1402            def __index__(self):
1403                return self.value
1404        class PseudoFloat:
1405            def __init__(self, value):
1406                self.value = float(value)
1407            def __int__(self):
1408                return int(self.value)
1409        pi = PseudoFloat(3.1415)
1410        letter_m = PseudoInt(109)
1411        self.assertEqual('%x' % 42, '2a')
1412        self.assertEqual('%X' % 15, 'F')
1413        self.assertEqual('%o' % 9, '11')
1414        self.assertEqual('%c' % 109, 'm')
1415        self.assertEqual('%x' % letter_m, '6d')
1416        self.assertEqual('%X' % letter_m, '6D')
1417        self.assertEqual('%o' % letter_m, '155')
1418        self.assertEqual('%c' % letter_m, 'm')
1419        self.assertRaisesRegex(TypeError, '%x format: an integer is required, not float', operator.mod, '%x', 3.14),
1420        self.assertRaisesRegex(TypeError, '%X format: an integer is required, not float', operator.mod, '%X', 2.11),
1421        self.assertRaisesRegex(TypeError, '%o format: an integer is required, not float', operator.mod, '%o', 1.79),
1422        self.assertRaisesRegex(TypeError, '%x format: an integer is required, not PseudoFloat', operator.mod, '%x', pi),
1423        self.assertRaises(TypeError, operator.mod, '%c', pi),
1424
1425    def test_formatting_with_enum(self):
1426        # issue18780
1427        import enum
1428        class Float(float, enum.Enum):
1429            PI = 3.1415926
1430        class Int(enum.IntEnum):
1431            IDES = 15
1432        class Str(str, enum.Enum):
1433            ABC = 'abc'
1434        # Testing Unicode formatting strings...
1435        self.assertEqual("%s, %s" % (Str.ABC, Str.ABC),
1436                         'Str.ABC, Str.ABC')
1437        self.assertEqual("%s, %s, %d, %i, %u, %f, %5.2f" %
1438                        (Str.ABC, Str.ABC,
1439                         Int.IDES, Int.IDES, Int.IDES,
1440                         Float.PI, Float.PI),
1441                         'Str.ABC, Str.ABC, 15, 15, 15, 3.141593,  3.14')
1442
1443        # formatting jobs delegated from the string implementation:
1444        self.assertEqual('...%(foo)s...' % {'foo':Str.ABC},
1445                         '...Str.ABC...')
1446        self.assertEqual('...%(foo)s...' % {'foo':Int.IDES},
1447                         '...Int.IDES...')
1448        self.assertEqual('...%(foo)i...' % {'foo':Int.IDES},
1449                         '...15...')
1450        self.assertEqual('...%(foo)d...' % {'foo':Int.IDES},
1451                         '...15...')
1452        self.assertEqual('...%(foo)u...' % {'foo':Int.IDES, 'def':Float.PI},
1453                         '...15...')
1454        self.assertEqual('...%(foo)f...' % {'foo':Float.PI,'def':123},
1455                         '...3.141593...')
1456
1457    def test_formatting_huge_precision(self):
1458        format_string = "%.{}f".format(sys.maxsize + 1)
1459        with self.assertRaises(ValueError):
1460            result = format_string % 2.34
1461
1462    def test_issue28598_strsubclass_rhs(self):
1463        # A subclass of str with an __rmod__ method should be able to hook
1464        # into the % operator
1465        class SubclassedStr(str):
1466            def __rmod__(self, other):
1467                return 'Success, self.__rmod__({!r}) was called'.format(other)
1468        self.assertEqual('lhs %% %r' % SubclassedStr('rhs'),
1469                         "Success, self.__rmod__('lhs %% %r') was called")
1470
1471    @support.cpython_only
1472    def test_formatting_huge_precision_c_limits(self):
1473        from _testcapi import INT_MAX
1474        format_string = "%.{}f".format(INT_MAX + 1)
1475        with self.assertRaises(ValueError):
1476            result = format_string % 2.34
1477
1478    def test_formatting_huge_width(self):
1479        format_string = "%{}f".format(sys.maxsize + 1)
1480        with self.assertRaises(ValueError):
1481            result = format_string % 2.34
1482
1483    def test_startswith_endswith_errors(self):
1484        for meth in ('foo'.startswith, 'foo'.endswith):
1485            with self.assertRaises(TypeError) as cm:
1486                meth(['f'])
1487            exc = str(cm.exception)
1488            self.assertIn('str', exc)
1489            self.assertIn('tuple', exc)
1490
1491    @support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
1492    def test_format_float(self):
1493        # should not format with a comma, but always with C locale
1494        self.assertEqual('1.0', '%.1f' % 1.0)
1495
1496    def test_constructor(self):
1497        # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
1498
1499        self.assertEqual(
1500            str('unicode remains unicode'),
1501            'unicode remains unicode'
1502        )
1503
1504        for text in ('ascii', '\xe9', '\u20ac', '\U0010FFFF'):
1505            subclass = StrSubclass(text)
1506            self.assertEqual(str(subclass), text)
1507            self.assertEqual(len(subclass), len(text))
1508            if text == 'ascii':
1509                self.assertEqual(subclass.encode('ascii'), b'ascii')
1510                self.assertEqual(subclass.encode('utf-8'), b'ascii')
1511
1512        self.assertEqual(
1513            str('strings are converted to unicode'),
1514            'strings are converted to unicode'
1515        )
1516
1517        class StringCompat:
1518            def __init__(self, x):
1519                self.x = x
1520            def __str__(self):
1521                return self.x
1522
1523        self.assertEqual(
1524            str(StringCompat('__str__ compatible objects are recognized')),
1525            '__str__ compatible objects are recognized'
1526        )
1527
1528        # unicode(obj) is compatible to str():
1529
1530        o = StringCompat('unicode(obj) is compatible to str()')
1531        self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
1532        self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
1533
1534        for obj in (123, 123.45, 123):
1535            self.assertEqual(str(obj), str(str(obj)))
1536
1537        # unicode(obj, encoding, error) tests (this maps to
1538        # PyUnicode_FromEncodedObject() at C level)
1539
1540        if not sys.platform.startswith('java'):
1541            self.assertRaises(
1542                TypeError,
1543                str,
1544                'decoding unicode is not supported',
1545                'utf-8',
1546                'strict'
1547            )
1548
1549        self.assertEqual(
1550            str(b'strings are decoded to unicode', 'utf-8', 'strict'),
1551            'strings are decoded to unicode'
1552        )
1553
1554        if not sys.platform.startswith('java'):
1555            self.assertEqual(
1556                str(
1557                    memoryview(b'character buffers are decoded to unicode'),
1558                    'utf-8',
1559                    'strict'
1560                ),
1561                'character buffers are decoded to unicode'
1562            )
1563
1564        self.assertRaises(TypeError, str, 42, 42, 42)
1565
1566    def test_constructor_keyword_args(self):
1567        """Pass various keyword argument combinations to the constructor."""
1568        # The object argument can be passed as a keyword.
1569        self.assertEqual(str(object='foo'), 'foo')
1570        self.assertEqual(str(object=b'foo', encoding='utf-8'), 'foo')
1571        # The errors argument without encoding triggers "decode" mode.
1572        self.assertEqual(str(b'foo', errors='strict'), 'foo')  # not "b'foo'"
1573        self.assertEqual(str(object=b'foo', errors='strict'), 'foo')
1574
1575    def test_constructor_defaults(self):
1576        """Check the constructor argument defaults."""
1577        # The object argument defaults to '' or b''.
1578        self.assertEqual(str(), '')
1579        self.assertEqual(str(errors='strict'), '')
1580        utf8_cent = '¢'.encode('utf-8')
1581        # The encoding argument defaults to utf-8.
1582        self.assertEqual(str(utf8_cent, errors='strict'), '¢')
1583        # The errors argument defaults to strict.
1584        self.assertRaises(UnicodeDecodeError, str, utf8_cent, encoding='ascii')
1585
1586    def test_codecs_utf7(self):
1587        utfTests = [
1588            ('A\u2262\u0391.', b'A+ImIDkQ.'),             # RFC2152 example
1589            ('Hi Mom -\u263a-!', b'Hi Mom -+Jjo--!'),     # RFC2152 example
1590            ('\u65E5\u672C\u8A9E', b'+ZeVnLIqe-'),        # RFC2152 example
1591            ('Item 3 is \u00a31.', b'Item 3 is +AKM-1.'), # RFC2152 example
1592            ('+', b'+-'),
1593            ('+-', b'+--'),
1594            ('+?', b'+-?'),
1595            (r'\?', b'+AFw?'),
1596            ('+?', b'+-?'),
1597            (r'\\?', b'+AFwAXA?'),
1598            (r'\\\?', b'+AFwAXABc?'),
1599            (r'++--', b'+-+---'),
1600            ('\U000abcde', b'+2m/c3g-'),                  # surrogate pairs
1601            ('/', b'/'),
1602        ]
1603
1604        for (x, y) in utfTests:
1605            self.assertEqual(x.encode('utf-7'), y)
1606
1607        # Unpaired surrogates are passed through
1608        self.assertEqual('\uD801'.encode('utf-7'), b'+2AE-')
1609        self.assertEqual('\uD801x'.encode('utf-7'), b'+2AE-x')
1610        self.assertEqual('\uDC01'.encode('utf-7'), b'+3AE-')
1611        self.assertEqual('\uDC01x'.encode('utf-7'), b'+3AE-x')
1612        self.assertEqual(b'+2AE-'.decode('utf-7'), '\uD801')
1613        self.assertEqual(b'+2AE-x'.decode('utf-7'), '\uD801x')
1614        self.assertEqual(b'+3AE-'.decode('utf-7'), '\uDC01')
1615        self.assertEqual(b'+3AE-x'.decode('utf-7'), '\uDC01x')
1616
1617        self.assertEqual('\uD801\U000abcde'.encode('utf-7'), b'+2AHab9ze-')
1618        self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde')
1619
1620        # Issue #2242: crash on some Windows/MSVC versions
1621        self.assertEqual(b'+\xc1'.decode('utf-7', 'ignore'), '')
1622
1623        # Direct encoded characters
1624        set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
1625        # Optional direct characters
1626        set_o = '!"#$%&*;<=>@[]^_`{|}'
1627        for c in set_d:
1628            self.assertEqual(c.encode('utf7'), c.encode('ascii'))
1629            self.assertEqual(c.encode('ascii').decode('utf7'), c)
1630        for c in set_o:
1631            self.assertEqual(c.encode('ascii').decode('utf7'), c)
1632
1633    def test_codecs_utf8(self):
1634        self.assertEqual(''.encode('utf-8'), b'')
1635        self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac')
1636        self.assertEqual('\U00010002'.encode('utf-8'), b'\xf0\x90\x80\x82')
1637        self.assertEqual('\U00023456'.encode('utf-8'), b'\xf0\xa3\x91\x96')
1638        self.assertEqual('\ud800'.encode('utf-8', 'surrogatepass'), b'\xed\xa0\x80')
1639        self.assertEqual('\udc00'.encode('utf-8', 'surrogatepass'), b'\xed\xb0\x80')
1640        self.assertEqual(('\U00010002'*10).encode('utf-8'),
1641                         b'\xf0\x90\x80\x82'*10)
1642        self.assertEqual(
1643            '\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
1644            '\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
1645            '\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
1646            '\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
1647            '\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
1648            ' Nunstuck git und'.encode('utf-8'),
1649            b'\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
1650            b'\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
1651            b'\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
1652            b'\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
1653            b'\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
1654            b'\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
1655            b'\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
1656            b'\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
1657            b'\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
1658            b'\xe3\x80\x8cWenn ist das Nunstuck git und'
1659        )
1660
1661        # UTF-8 specific decoding tests
1662        self.assertEqual(str(b'\xf0\xa3\x91\x96', 'utf-8'), '\U00023456' )
1663        self.assertEqual(str(b'\xf0\x90\x80\x82', 'utf-8'), '\U00010002' )
1664        self.assertEqual(str(b'\xe2\x82\xac', 'utf-8'), '\u20ac' )
1665
1666        # Other possible utf-8 test cases:
1667        # * strict decoding testing for all of the
1668        #   UTF8_ERROR cases in PyUnicode_DecodeUTF8
1669
1670    def test_utf8_decode_valid_sequences(self):
1671        sequences = [
1672            # single byte
1673            (b'\x00', '\x00'), (b'a', 'a'), (b'\x7f', '\x7f'),
1674            # 2 bytes
1675            (b'\xc2\x80', '\x80'), (b'\xdf\xbf', '\u07ff'),
1676            # 3 bytes
1677            (b'\xe0\xa0\x80', '\u0800'), (b'\xed\x9f\xbf', '\ud7ff'),
1678            (b'\xee\x80\x80', '\uE000'), (b'\xef\xbf\xbf', '\uffff'),
1679            # 4 bytes
1680            (b'\xF0\x90\x80\x80', '\U00010000'),
1681            (b'\xf4\x8f\xbf\xbf', '\U0010FFFF')
1682        ]
1683        for seq, res in sequences:
1684            self.assertEqual(seq.decode('utf-8'), res)
1685
1686
1687    def test_utf8_decode_invalid_sequences(self):
1688        # continuation bytes in a sequence of 2, 3, or 4 bytes
1689        continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
1690        # start bytes of a 2-byte sequence equivalent to code points < 0x7F
1691        invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)]
1692        # start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF
1693        invalid_4B_seq_start_bytes = [bytes([x]) for x in range(0xF5, 0xF8)]
1694        invalid_start_bytes = (
1695            continuation_bytes + invalid_2B_seq_start_bytes +
1696            invalid_4B_seq_start_bytes + [bytes([x]) for x in range(0xF7, 0x100)]
1697        )
1698
1699        for byte in invalid_start_bytes:
1700            self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8')
1701
1702        for sb in invalid_2B_seq_start_bytes:
1703            for cb in continuation_bytes:
1704                self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8')
1705
1706        for sb in invalid_4B_seq_start_bytes:
1707            for cb1 in continuation_bytes[:3]:
1708                for cb3 in continuation_bytes[:3]:
1709                    self.assertRaises(UnicodeDecodeError,
1710                                      (sb+cb1+b'\x80'+cb3).decode, 'utf-8')
1711
1712        for cb in [bytes([x]) for x in range(0x80, 0xA0)]:
1713            self.assertRaises(UnicodeDecodeError,
1714                              (b'\xE0'+cb+b'\x80').decode, 'utf-8')
1715            self.assertRaises(UnicodeDecodeError,
1716                              (b'\xE0'+cb+b'\xBF').decode, 'utf-8')
1717        # surrogates
1718        for cb in [bytes([x]) for x in range(0xA0, 0xC0)]:
1719            self.assertRaises(UnicodeDecodeError,
1720                              (b'\xED'+cb+b'\x80').decode, 'utf-8')
1721            self.assertRaises(UnicodeDecodeError,
1722                              (b'\xED'+cb+b'\xBF').decode, 'utf-8')
1723        for cb in [bytes([x]) for x in range(0x80, 0x90)]:
1724            self.assertRaises(UnicodeDecodeError,
1725                              (b'\xF0'+cb+b'\x80\x80').decode, 'utf-8')
1726            self.assertRaises(UnicodeDecodeError,
1727                              (b'\xF0'+cb+b'\xBF\xBF').decode, 'utf-8')
1728        for cb in [bytes([x]) for x in range(0x90, 0xC0)]:
1729            self.assertRaises(UnicodeDecodeError,
1730                              (b'\xF4'+cb+b'\x80\x80').decode, 'utf-8')
1731            self.assertRaises(UnicodeDecodeError,
1732                              (b'\xF4'+cb+b'\xBF\xBF').decode, 'utf-8')
1733
1734    def test_issue8271(self):
1735        # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
1736        # only the start byte and the continuation byte(s) are now considered
1737        # invalid, instead of the number of bytes specified by the start byte.
1738        # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
1739        # table 3-8, Row 2) for more information about the algorithm used.
1740        FFFD = '\ufffd'
1741        sequences = [
1742            # invalid start bytes
1743            (b'\x80', FFFD), # continuation byte
1744            (b'\x80\x80', FFFD*2), # 2 continuation bytes
1745            (b'\xc0', FFFD),
1746            (b'\xc0\xc0', FFFD*2),
1747            (b'\xc1', FFFD),
1748            (b'\xc1\xc0', FFFD*2),
1749            (b'\xc0\xc1', FFFD*2),
1750            # with start byte of a 2-byte sequence
1751            (b'\xc2', FFFD), # only the start byte
1752            (b'\xc2\xc2', FFFD*2), # 2 start bytes
1753            (b'\xc2\xc2\xc2', FFFD*3), # 3 start bytes
1754            (b'\xc2\x41', FFFD+'A'), # invalid continuation byte
1755            # with start byte of a 3-byte sequence
1756            (b'\xe1', FFFD), # only the start byte
1757            (b'\xe1\xe1', FFFD*2), # 2 start bytes
1758            (b'\xe1\xe1\xe1', FFFD*3), # 3 start bytes
1759            (b'\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
1760            (b'\xe1\x80', FFFD), # only 1 continuation byte
1761            (b'\xe1\x41', FFFD+'A'), # invalid continuation byte
1762            (b'\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
1763            (b'\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
1764            (b'\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
1765            (b'\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
1766            (b'\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
1767            # with start byte of a 4-byte sequence
1768            (b'\xf1', FFFD), # only the start byte
1769            (b'\xf1\xf1', FFFD*2), # 2 start bytes
1770            (b'\xf1\xf1\xf1', FFFD*3), # 3 start bytes
1771            (b'\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
1772            (b'\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
1773            (b'\xf1\x80', FFFD), # only 1 continuation bytes
1774            (b'\xf1\x80\x80', FFFD), # only 2 continuation bytes
1775            (b'\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
1776            (b'\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 1 invalid
1777            (b'\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
1778            (b'\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cv and 1 valid
1779            (b'\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 invalid
1780            (b'\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 invalid
1781            (b'\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 1 valid cb and 1 invalid
1782            (b'\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
1783            (b'\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
1784            (b'\xf1\xf1\x80\x41', FFFD*2+'A'),
1785            (b'\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
1786            # with invalid start byte of a 4-byte sequence (rfc2279)
1787            (b'\xf5', FFFD), # only the start byte
1788            (b'\xf5\xf5', FFFD*2), # 2 start bytes
1789            (b'\xf5\x80', FFFD*2), # only 1 continuation byte
1790            (b'\xf5\x80\x80', FFFD*3), # only 2 continuation byte
1791            (b'\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
1792            (b'\xf5\x80\x41', FFFD*2+'A'), #  1 valid cb and 1 invalid
1793            (b'\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
1794            (b'\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
1795            # with invalid start byte of a 5-byte sequence (rfc2279)
1796            (b'\xf8', FFFD), # only the start byte
1797            (b'\xf8\xf8', FFFD*2), # 2 start bytes
1798            (b'\xf8\x80', FFFD*2), # only one continuation byte
1799            (b'\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
1800            (b'\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
1801            # with invalid start byte of a 6-byte sequence (rfc2279)
1802            (b'\xfc', FFFD), # only the start byte
1803            (b'\xfc\xfc', FFFD*2), # 2 start bytes
1804            (b'\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
1805            (b'\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes
1806            # invalid start byte
1807            (b'\xfe', FFFD),
1808            (b'\xfe\x80\x80', FFFD*3),
1809            # other sequences
1810            (b'\xf1\x80\x41\x42\x43', '\ufffd\x41\x42\x43'),
1811            (b'\xf1\x80\xff\x42\x43', '\ufffd\ufffd\x42\x43'),
1812            (b'\xf1\x80\xc2\x81\x43', '\ufffd\x81\x43'),
1813            (b'\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
1814             '\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
1815        ]
1816        for n, (seq, res) in enumerate(sequences):
1817            self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
1818            self.assertEqual(seq.decode('utf-8', 'replace'), res)
1819            self.assertEqual((seq+b'b').decode('utf-8', 'replace'), res+'b')
1820            self.assertEqual(seq.decode('utf-8', 'ignore'),
1821                             res.replace('\uFFFD', ''))
1822
1823    def assertCorrectUTF8Decoding(self, seq, res, err):
1824        """
1825        Check that an invalid UTF-8 sequence raises a UnicodeDecodeError when
1826        'strict' is used, returns res when 'replace' is used, and that doesn't
1827        return anything when 'ignore' is used.
1828        """
1829        with self.assertRaises(UnicodeDecodeError) as cm:
1830            seq.decode('utf-8')
1831        exc = cm.exception
1832
1833        self.assertIn(err, str(exc))
1834        self.assertEqual(seq.decode('utf-8', 'replace'), res)
1835        self.assertEqual((b'aaaa' + seq + b'bbbb').decode('utf-8', 'replace'),
1836                         'aaaa' + res + 'bbbb')
1837        res = res.replace('\ufffd', '')
1838        self.assertEqual(seq.decode('utf-8', 'ignore'), res)
1839        self.assertEqual((b'aaaa' + seq + b'bbbb').decode('utf-8', 'ignore'),
1840                          'aaaa' + res + 'bbbb')
1841
1842    def test_invalid_start_byte(self):
1843        """
1844        Test that an 'invalid start byte' error is raised when the first byte
1845        is not in the ASCII range or is not a valid start byte of a 2-, 3-, or
1846        4-bytes sequence. The invalid start byte is replaced with a single
1847        U+FFFD when errors='replace'.
1848        E.g. <80> is a continuation byte and can appear only after a start byte.
1849        """
1850        FFFD = '\ufffd'
1851        for byte in b'\x80\xA0\x9F\xBF\xC0\xC1\xF5\xFF':
1852            self.assertCorrectUTF8Decoding(bytes([byte]), '\ufffd',
1853                                           'invalid start byte')
1854
1855    def test_unexpected_end_of_data(self):
1856        """
1857        Test that an 'unexpected end of data' error is raised when the string
1858        ends after a start byte of a 2-, 3-, or 4-bytes sequence without having
1859        enough continuation bytes.  The incomplete sequence is replaced with a
1860        single U+FFFD when errors='replace'.
1861        E.g. in the sequence <F3 80 80>, F3 is the start byte of a 4-bytes
1862        sequence, but it's followed by only 2 valid continuation bytes and the
1863        last continuation bytes is missing.
1864        Note: the continuation bytes must be all valid, if one of them is
1865        invalid another error will be raised.
1866        """
1867        sequences = [
1868            'C2', 'DF',
1869            'E0 A0', 'E0 BF', 'E1 80', 'E1 BF', 'EC 80', 'EC BF',
1870            'ED 80', 'ED 9F', 'EE 80', 'EE BF', 'EF 80', 'EF BF',
1871            'F0 90', 'F0 BF', 'F0 90 80', 'F0 90 BF', 'F0 BF 80', 'F0 BF BF',
1872            'F1 80', 'F1 BF', 'F1 80 80', 'F1 80 BF', 'F1 BF 80', 'F1 BF BF',
1873            'F3 80', 'F3 BF', 'F3 80 80', 'F3 80 BF', 'F3 BF 80', 'F3 BF BF',
1874            'F4 80', 'F4 8F', 'F4 80 80', 'F4 80 BF', 'F4 8F 80', 'F4 8F BF'
1875        ]
1876        FFFD = '\ufffd'
1877        for seq in sequences:
1878            self.assertCorrectUTF8Decoding(bytes.fromhex(seq), '\ufffd',
1879                                           'unexpected end of data')
1880
1881    def test_invalid_cb_for_2bytes_seq(self):
1882        """
1883        Test that an 'invalid continuation byte' error is raised when the
1884        continuation byte of a 2-bytes sequence is invalid.  The start byte
1885        is replaced by a single U+FFFD and the second byte is handled
1886        separately when errors='replace'.
1887        E.g. in the sequence <C2 41>, C2 is the start byte of a 2-bytes
1888        sequence, but 41 is not a valid continuation byte because it's the
1889        ASCII letter 'A'.
1890        """
1891        FFFD = '\ufffd'
1892        FFFDx2 = FFFD * 2
1893        sequences = [
1894            ('C2 00', FFFD+'\x00'), ('C2 7F', FFFD+'\x7f'),
1895            ('C2 C0', FFFDx2), ('C2 FF', FFFDx2),
1896            ('DF 00', FFFD+'\x00'), ('DF 7F', FFFD+'\x7f'),
1897            ('DF C0', FFFDx2), ('DF FF', FFFDx2),
1898        ]
1899        for seq, res in sequences:
1900            self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res,
1901                                           'invalid continuation byte')
1902
1903    def test_invalid_cb_for_3bytes_seq(self):
1904        """
1905        Test that an 'invalid continuation byte' error is raised when the
1906        continuation byte(s) of a 3-bytes sequence are invalid.  When
1907        errors='replace', if the first continuation byte is valid, the first
1908        two bytes (start byte + 1st cb) are replaced by a single U+FFFD and the
1909        third byte is handled separately, otherwise only the start byte is
1910        replaced with a U+FFFD and the other continuation bytes are handled
1911        separately.
1912        E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
1913        sequence, 80 is a valid continuation byte, but 41 is not a valid cb
1914        because it's the ASCII letter 'A'.
1915        Note: when the start byte is E0 or ED, the valid ranges for the first
1916        continuation byte are limited to A0..BF and 80..9F respectively.
1917        Python 2 used to consider all the bytes in range 80..BF valid when the
1918        start byte was ED.  This is fixed in Python 3.
1919        """
1920        FFFD = '\ufffd'
1921        FFFDx2 = FFFD * 2
1922        sequences = [
1923            ('E0 00', FFFD+'\x00'), ('E0 7F', FFFD+'\x7f'), ('E0 80', FFFDx2),
1924            ('E0 9F', FFFDx2), ('E0 C0', FFFDx2), ('E0 FF', FFFDx2),
1925            ('E0 A0 00', FFFD+'\x00'), ('E0 A0 7F', FFFD+'\x7f'),
1926            ('E0 A0 C0', FFFDx2), ('E0 A0 FF', FFFDx2),
1927            ('E0 BF 00', FFFD+'\x00'), ('E0 BF 7F', FFFD+'\x7f'),
1928            ('E0 BF C0', FFFDx2), ('E0 BF FF', FFFDx2), ('E1 00', FFFD+'\x00'),
1929            ('E1 7F', FFFD+'\x7f'), ('E1 C0', FFFDx2), ('E1 FF', FFFDx2),
1930            ('E1 80 00', FFFD+'\x00'), ('E1 80 7F', FFFD+'\x7f'),
1931            ('E1 80 C0', FFFDx2), ('E1 80 FF', FFFDx2),
1932            ('E1 BF 00', FFFD+'\x00'), ('E1 BF 7F', FFFD+'\x7f'),
1933            ('E1 BF C0', FFFDx2), ('E1 BF FF', FFFDx2), ('EC 00', FFFD+'\x00'),
1934            ('EC 7F', FFFD+'\x7f'), ('EC C0', FFFDx2), ('EC FF', FFFDx2),
1935            ('EC 80 00', FFFD+'\x00'), ('EC 80 7F', FFFD+'\x7f'),
1936            ('EC 80 C0', FFFDx2), ('EC 80 FF', FFFDx2),
1937            ('EC BF 00', FFFD+'\x00'), ('EC BF 7F', FFFD+'\x7f'),
1938            ('EC BF C0', FFFDx2), ('EC BF FF', FFFDx2), ('ED 00', FFFD+'\x00'),
1939            ('ED 7F', FFFD+'\x7f'),
1940            ('ED A0', FFFDx2), ('ED BF', FFFDx2), # see note ^
1941            ('ED C0', FFFDx2), ('ED FF', FFFDx2), ('ED 80 00', FFFD+'\x00'),
1942            ('ED 80 7F', FFFD+'\x7f'), ('ED 80 C0', FFFDx2),
1943            ('ED 80 FF', FFFDx2), ('ED 9F 00', FFFD+'\x00'),
1944            ('ED 9F 7F', FFFD+'\x7f'), ('ED 9F C0', FFFDx2),
1945            ('ED 9F FF', FFFDx2), ('EE 00', FFFD+'\x00'),
1946            ('EE 7F', FFFD+'\x7f'), ('EE C0', FFFDx2), ('EE FF', FFFDx2),
1947            ('EE 80 00', FFFD+'\x00'), ('EE 80 7F', FFFD+'\x7f'),
1948            ('EE 80 C0', FFFDx2), ('EE 80 FF', FFFDx2),
1949            ('EE BF 00', FFFD+'\x00'), ('EE BF 7F', FFFD+'\x7f'),
1950            ('EE BF C0', FFFDx2), ('EE BF FF', FFFDx2), ('EF 00', FFFD+'\x00'),
1951            ('EF 7F', FFFD+'\x7f'), ('EF C0', FFFDx2), ('EF FF', FFFDx2),
1952            ('EF 80 00', FFFD+'\x00'), ('EF 80 7F', FFFD+'\x7f'),
1953            ('EF 80 C0', FFFDx2), ('EF 80 FF', FFFDx2),
1954            ('EF BF 00', FFFD+'\x00'), ('EF BF 7F', FFFD+'\x7f'),
1955            ('EF BF C0', FFFDx2), ('EF BF FF', FFFDx2),
1956        ]
1957        for seq, res in sequences:
1958            self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res,
1959                                           'invalid continuation byte')
1960
1961    def test_invalid_cb_for_4bytes_seq(self):
1962        """
1963        Test that an 'invalid continuation byte' error is raised when the
1964        continuation byte(s) of a 4-bytes sequence are invalid.  When
1965        errors='replace',the start byte and all the following valid
1966        continuation bytes are replaced with a single U+FFFD, and all the bytes
1967        starting from the first invalid continuation bytes (included) are
1968        handled separately.
1969        E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
1970        sequence, 80 is a valid continuation byte, but 41 is not a valid cb
1971        because it's the ASCII letter 'A'.
1972        Note: when the start byte is E0 or ED, the valid ranges for the first
1973        continuation byte are limited to A0..BF and 80..9F respectively.
1974        However, when the start byte is ED, Python 2 considers all the bytes
1975        in range 80..BF valid.  This is fixed in Python 3.
1976        """
1977        FFFD = '\ufffd'
1978        FFFDx2 = FFFD * 2
1979        sequences = [
1980            ('F0 00', FFFD+'\x00'), ('F0 7F', FFFD+'\x7f'), ('F0 80', FFFDx2),
1981            ('F0 8F', FFFDx2), ('F0 C0', FFFDx2), ('F0 FF', FFFDx2),
1982            ('F0 90 00', FFFD+'\x00'), ('F0 90 7F', FFFD+'\x7f'),
1983            ('F0 90 C0', FFFDx2), ('F0 90 FF', FFFDx2),
1984            ('F0 BF 00', FFFD+'\x00'), ('F0 BF 7F', FFFD+'\x7f'),
1985            ('F0 BF C0', FFFDx2), ('F0 BF FF', FFFDx2),
1986            ('F0 90 80 00', FFFD+'\x00'), ('F0 90 80 7F', FFFD+'\x7f'),
1987            ('F0 90 80 C0', FFFDx2), ('F0 90 80 FF', FFFDx2),
1988            ('F0 90 BF 00', FFFD+'\x00'), ('F0 90 BF 7F', FFFD+'\x7f'),
1989            ('F0 90 BF C0', FFFDx2), ('F0 90 BF FF', FFFDx2),
1990            ('F0 BF 80 00', FFFD+'\x00'), ('F0 BF 80 7F', FFFD+'\x7f'),
1991            ('F0 BF 80 C0', FFFDx2), ('F0 BF 80 FF', FFFDx2),
1992            ('F0 BF BF 00', FFFD+'\x00'), ('F0 BF BF 7F', FFFD+'\x7f'),
1993            ('F0 BF BF C0', FFFDx2), ('F0 BF BF FF', FFFDx2),
1994            ('F1 00', FFFD+'\x00'), ('F1 7F', FFFD+'\x7f'), ('F1 C0', FFFDx2),
1995            ('F1 FF', FFFDx2), ('F1 80 00', FFFD+'\x00'),
1996            ('F1 80 7F', FFFD+'\x7f'), ('F1 80 C0', FFFDx2),
1997            ('F1 80 FF', FFFDx2), ('F1 BF 00', FFFD+'\x00'),
1998            ('F1 BF 7F', FFFD+'\x7f'), ('F1 BF C0', FFFDx2),
1999            ('F1 BF FF', FFFDx2), ('F1 80 80 00', FFFD+'\x00'),
2000            ('F1 80 80 7F', FFFD+'\x7f'), ('F1 80 80 C0', FFFDx2),
2001            ('F1 80 80 FF', FFFDx2), ('F1 80 BF 00', FFFD+'\x00'),
2002            ('F1 80 BF 7F', FFFD+'\x7f'), ('F1 80 BF C0', FFFDx2),
2003            ('F1 80 BF FF', FFFDx2), ('F1 BF 80 00', FFFD+'\x00'),
2004            ('F1 BF 80 7F', FFFD+'\x7f'), ('F1 BF 80 C0', FFFDx2),
2005            ('F1 BF 80 FF', FFFDx2), ('F1 BF BF 00', FFFD+'\x00'),
2006            ('F1 BF BF 7F', FFFD+'\x7f'), ('F1 BF BF C0', FFFDx2),
2007            ('F1 BF BF FF', FFFDx2), ('F3 00', FFFD+'\x00'),
2008            ('F3 7F', FFFD+'\x7f'), ('F3 C0', FFFDx2), ('F3 FF', FFFDx2),
2009            ('F3 80 00', FFFD+'\x00'), ('F3 80 7F', FFFD+'\x7f'),
2010            ('F3 80 C0', FFFDx2), ('F3 80 FF', FFFDx2),
2011            ('F3 BF 00', FFFD+'\x00'), ('F3 BF 7F', FFFD+'\x7f'),
2012            ('F3 BF C0', FFFDx2), ('F3 BF FF', FFFDx2),
2013            ('F3 80 80 00', FFFD+'\x00'), ('F3 80 80 7F', FFFD+'\x7f'),
2014            ('F3 80 80 C0', FFFDx2), ('F3 80 80 FF', FFFDx2),
2015            ('F3 80 BF 00', FFFD+'\x00'), ('F3 80 BF 7F', FFFD+'\x7f'),
2016            ('F3 80 BF C0', FFFDx2), ('F3 80 BF FF', FFFDx2),
2017            ('F3 BF 80 00', FFFD+'\x00'), ('F3 BF 80 7F', FFFD+'\x7f'),
2018            ('F3 BF 80 C0', FFFDx2), ('F3 BF 80 FF', FFFDx2),
2019            ('F3 BF BF 00', FFFD+'\x00'), ('F3 BF BF 7F', FFFD+'\x7f'),
2020            ('F3 BF BF C0', FFFDx2), ('F3 BF BF FF', FFFDx2),
2021            ('F4 00', FFFD+'\x00'), ('F4 7F', FFFD+'\x7f'), ('F4 90', FFFDx2),
2022            ('F4 BF', FFFDx2), ('F4 C0', FFFDx2), ('F4 FF', FFFDx2),
2023            ('F4 80 00', FFFD+'\x00'), ('F4 80 7F', FFFD+'\x7f'),
2024            ('F4 80 C0', FFFDx2), ('F4 80 FF', FFFDx2),
2025            ('F4 8F 00', FFFD+'\x00'), ('F4 8F 7F', FFFD+'\x7f'),
2026            ('F4 8F C0', FFFDx2), ('F4 8F FF', FFFDx2),
2027            ('F4 80 80 00', FFFD+'\x00'), ('F4 80 80 7F', FFFD+'\x7f'),
2028            ('F4 80 80 C0', FFFDx2), ('F4 80 80 FF', FFFDx2),
2029            ('F4 80 BF 00', FFFD+'\x00'), ('F4 80 BF 7F', FFFD+'\x7f'),
2030            ('F4 80 BF C0', FFFDx2), ('F4 80 BF FF', FFFDx2),
2031            ('F4 8F 80 00', FFFD+'\x00'), ('F4 8F 80 7F', FFFD+'\x7f'),
2032            ('F4 8F 80 C0', FFFDx2), ('F4 8F 80 FF', FFFDx2),
2033            ('F4 8F BF 00', FFFD+'\x00'), ('F4 8F BF 7F', FFFD+'\x7f'),
2034            ('F4 8F BF C0', FFFDx2), ('F4 8F BF FF', FFFDx2)
2035        ]
2036        for seq, res in sequences:
2037            self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res,
2038                                           'invalid continuation byte')
2039
2040    def test_codecs_idna(self):
2041        # Test whether trailing dot is preserved
2042        self.assertEqual("www.python.org.".encode("idna"), b"www.python.org.")
2043
2044    def test_codecs_errors(self):
2045        # Error handling (encoding)
2046        self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii')
2047        self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii','strict')
2048        self.assertEqual('Andr\202 x'.encode('ascii','ignore'), b"Andr x")
2049        self.assertEqual('Andr\202 x'.encode('ascii','replace'), b"Andr? x")
2050        self.assertEqual('Andr\202 x'.encode('ascii', 'replace'),
2051                         'Andr\202 x'.encode('ascii', errors='replace'))
2052        self.assertEqual('Andr\202 x'.encode('ascii', 'ignore'),
2053                         'Andr\202 x'.encode(encoding='ascii', errors='ignore'))
2054
2055        # Error handling (decoding)
2056        self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii')
2057        self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii', 'strict')
2058        self.assertEqual(str(b'Andr\202 x', 'ascii', 'ignore'), "Andr x")
2059        self.assertEqual(str(b'Andr\202 x', 'ascii', 'replace'), 'Andr\uFFFD x')
2060        self.assertEqual(str(b'\202 x', 'ascii', 'replace'), '\uFFFD x')
2061
2062        # Error handling (unknown character names)
2063        self.assertEqual(b"\\N{foo}xx".decode("unicode-escape", "ignore"), "xx")
2064
2065        # Error handling (truncated escape sequence)
2066        self.assertRaises(UnicodeError, b"\\".decode, "unicode-escape")
2067
2068        self.assertRaises(TypeError, b"hello".decode, "test.unicode1")
2069        self.assertRaises(TypeError, str, b"hello", "test.unicode2")
2070        self.assertRaises(TypeError, "hello".encode, "test.unicode1")
2071        self.assertRaises(TypeError, "hello".encode, "test.unicode2")
2072
2073        # Error handling (wrong arguments)
2074        self.assertRaises(TypeError, "hello".encode, 42, 42, 42)
2075
2076        # Error handling (lone surrogate in
2077        # _PyUnicode_TransformDecimalAndSpaceToASCII())
2078        self.assertRaises(ValueError, int, "\ud800")
2079        self.assertRaises(ValueError, int, "\udf00")
2080        self.assertRaises(ValueError, float, "\ud800")
2081        self.assertRaises(ValueError, float, "\udf00")
2082        self.assertRaises(ValueError, complex, "\ud800")
2083        self.assertRaises(ValueError, complex, "\udf00")
2084
2085    def test_codecs(self):
2086        # Encoding
2087        self.assertEqual('hello'.encode('ascii'), b'hello')
2088        self.assertEqual('hello'.encode('utf-7'), b'hello')
2089        self.assertEqual('hello'.encode('utf-8'), b'hello')
2090        self.assertEqual('hello'.encode('utf-8'), b'hello')
2091        self.assertEqual('hello'.encode('utf-16-le'), b'h\000e\000l\000l\000o\000')
2092        self.assertEqual('hello'.encode('utf-16-be'), b'\000h\000e\000l\000l\000o')
2093        self.assertEqual('hello'.encode('latin-1'), b'hello')
2094
2095        # Default encoding is utf-8
2096        self.assertEqual('\u2603'.encode(), b'\xe2\x98\x83')
2097
2098        # Roundtrip safety for BMP (just the first 1024 chars)
2099        for c in range(1024):
2100            u = chr(c)
2101            for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
2102                             'utf-16-be', 'raw_unicode_escape',
2103                             'unicode_escape', 'unicode_internal'):
2104                with warnings.catch_warnings():
2105                    # unicode-internal has been deprecated
2106                    warnings.simplefilter("ignore", DeprecationWarning)
2107
2108                    self.assertEqual(str(u.encode(encoding),encoding), u)
2109
2110        # Roundtrip safety for BMP (just the first 256 chars)
2111        for c in range(256):
2112            u = chr(c)
2113            for encoding in ('latin-1',):
2114                self.assertEqual(str(u.encode(encoding),encoding), u)
2115
2116        # Roundtrip safety for BMP (just the first 128 chars)
2117        for c in range(128):
2118            u = chr(c)
2119            for encoding in ('ascii',):
2120                self.assertEqual(str(u.encode(encoding),encoding), u)
2121
2122        # Roundtrip safety for non-BMP (just a few chars)
2123        with warnings.catch_warnings():
2124            # unicode-internal has been deprecated
2125            warnings.simplefilter("ignore", DeprecationWarning)
2126
2127            u = '\U00010001\U00020002\U00030003\U00040004\U00050005'
2128            for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
2129                             'raw_unicode_escape',
2130                             'unicode_escape', 'unicode_internal'):
2131                self.assertEqual(str(u.encode(encoding),encoding), u)
2132
2133        # UTF-8 must be roundtrip safe for all code points
2134        # (except surrogates, which are forbidden).
2135        u = ''.join(map(chr, list(range(0, 0xd800)) +
2136                             list(range(0xe000, 0x110000))))
2137        for encoding in ('utf-8',):
2138            self.assertEqual(str(u.encode(encoding),encoding), u)
2139
2140    def test_codecs_charmap(self):
2141        # 0-127
2142        s = bytes(range(128))
2143        for encoding in (
2144            'cp037', 'cp1026', 'cp273',
2145            'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
2146            'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
2147            'cp863', 'cp865', 'cp866', 'cp1125',
2148            'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
2149            'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
2150            'iso8859_7', 'iso8859_9',
2151            'koi8_r', 'koi8_t', 'koi8_u', 'kz1048', 'latin_1',
2152            'mac_cyrillic', 'mac_latin2',
2153
2154            'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
2155            'cp1256', 'cp1257', 'cp1258',
2156            'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
2157
2158            'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
2159            'cp1006', 'iso8859_8',
2160
2161            ### These have undefined mappings:
2162            #'cp424',
2163
2164            ### These fail the round-trip:
2165            #'cp875'
2166
2167            ):
2168            self.assertEqual(str(s, encoding).encode(encoding), s)
2169
2170        # 128-255
2171        s = bytes(range(128, 256))
2172        for encoding in (
2173            'cp037', 'cp1026', 'cp273',
2174            'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
2175            'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
2176            'cp863', 'cp865', 'cp866', 'cp1125',
2177            'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
2178            'iso8859_2', 'iso8859_4', 'iso8859_5',
2179            'iso8859_9', 'koi8_r', 'koi8_u', 'latin_1',
2180            'mac_cyrillic', 'mac_latin2',
2181
2182            ### These have undefined mappings:
2183            #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
2184            #'cp1256', 'cp1257', 'cp1258',
2185            #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
2186            #'iso8859_3', 'iso8859_6', 'iso8859_7', 'koi8_t', 'kz1048',
2187            #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
2188
2189            ### These fail the round-trip:
2190            #'cp1006', 'cp875', 'iso8859_8',
2191
2192            ):
2193            self.assertEqual(str(s, encoding).encode(encoding), s)
2194
2195    def test_concatenation(self):
2196        self.assertEqual(("abc" "def"), "abcdef")
2197        self.assertEqual(("abc" "def"), "abcdef")
2198        self.assertEqual(("abc" "def"), "abcdef")
2199        self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
2200        self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
2201
2202    def test_printing(self):
2203        class BitBucket:
2204            def write(self, text):
2205                pass
2206
2207        out = BitBucket()
2208        print('abc', file=out)
2209        print('abc', 'def', file=out)
2210        print('abc', 'def', file=out)
2211        print('abc', 'def', file=out)
2212        print('abc\n', file=out)
2213        print('abc\n', end=' ', file=out)
2214        print('abc\n', end=' ', file=out)
2215        print('def\n', file=out)
2216        print('def\n', file=out)
2217
2218    def test_ucs4(self):
2219        x = '\U00100000'
2220        y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
2221        self.assertEqual(x, y)
2222
2223        y = br'\U00100000'
2224        x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2225        self.assertEqual(x, y)
2226        y = br'\U00010000'
2227        x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2228        self.assertEqual(x, y)
2229
2230        try:
2231            br'\U11111111'.decode("raw-unicode-escape")
2232        except UnicodeDecodeError as e:
2233            self.assertEqual(e.start, 0)
2234            self.assertEqual(e.end, 10)
2235        else:
2236            self.fail("Should have raised UnicodeDecodeError")
2237
2238    def test_conversion(self):
2239        # Make sure __str__() works properly
2240        class ObjectToStr:
2241            def __str__(self):
2242                return "foo"
2243
2244        class StrSubclassToStr(str):
2245            def __str__(self):
2246                return "foo"
2247
2248        class StrSubclassToStrSubclass(str):
2249            def __new__(cls, content=""):
2250                return str.__new__(cls, 2*content)
2251            def __str__(self):
2252                return self
2253
2254        self.assertEqual(str(ObjectToStr()), "foo")
2255        self.assertEqual(str(StrSubclassToStr("bar")), "foo")
2256        s = str(StrSubclassToStrSubclass("foo"))
2257        self.assertEqual(s, "foofoo")
2258        self.assertIs(type(s), StrSubclassToStrSubclass)
2259        s = StrSubclass(StrSubclassToStrSubclass("foo"))
2260        self.assertEqual(s, "foofoo")
2261        self.assertIs(type(s), StrSubclass)
2262
2263    def test_unicode_repr(self):
2264        class s1:
2265            def __repr__(self):
2266                return '\\n'
2267
2268        class s2:
2269            def __repr__(self):
2270                return '\\n'
2271
2272        self.assertEqual(repr(s1()), '\\n')
2273        self.assertEqual(repr(s2()), '\\n')
2274
2275    def test_printable_repr(self):
2276        self.assertEqual(repr('\U00010000'), "'%c'" % (0x10000,)) # printable
2277        self.assertEqual(repr('\U00014000'), "'\\U00014000'")     # nonprintable
2278
2279    # This test only affects 32-bit platforms because expandtabs can only take
2280    # an int as the max value, not a 64-bit C long.  If expandtabs is changed
2281    # to take a 64-bit long, this test should apply to all platforms.
2282    @unittest.skipIf(sys.maxsize > (1 << 32) or struct.calcsize('P') != 4,
2283                     'only applies to 32-bit platforms')
2284    def test_expandtabs_overflows_gracefully(self):
2285        self.assertRaises(OverflowError, 't\tt\t'.expandtabs, sys.maxsize)
2286
2287    @support.cpython_only
2288    def test_expandtabs_optimization(self):
2289        s = 'abc'
2290        self.assertIs(s.expandtabs(), s)
2291
2292    def test_raiseMemError(self):
2293        if struct.calcsize('P') == 8:
2294            # 64 bits pointers
2295            ascii_struct_size = 48
2296            compact_struct_size = 72
2297        else:
2298            # 32 bits pointers
2299            ascii_struct_size = 24
2300            compact_struct_size = 36
2301
2302        for char in ('a', '\xe9', '\u20ac', '\U0010ffff'):
2303            code = ord(char)
2304            if code < 0x100:
2305                char_size = 1  # sizeof(Py_UCS1)
2306                struct_size = ascii_struct_size
2307            elif code < 0x10000:
2308                char_size = 2  # sizeof(Py_UCS2)
2309                struct_size = compact_struct_size
2310            else:
2311                char_size = 4  # sizeof(Py_UCS4)
2312                struct_size = compact_struct_size
2313            # Note: sys.maxsize is half of the actual max allocation because of
2314            # the signedness of Py_ssize_t. Strings of maxlen-1 should in principle
2315            # be allocatable, given enough memory.
2316            maxlen = ((sys.maxsize - struct_size) // char_size)
2317            alloc = lambda: char * maxlen
2318            self.assertRaises(MemoryError, alloc)
2319            self.assertRaises(MemoryError, alloc)
2320
2321    def test_format_subclass(self):
2322        class S(str):
2323            def __str__(self):
2324                return '__str__ overridden'
2325        s = S('xxx')
2326        self.assertEqual("%s" % s, '__str__ overridden')
2327        self.assertEqual("{}".format(s), '__str__ overridden')
2328
2329    def test_subclass_add(self):
2330        class S(str):
2331            def __add__(self, o):
2332                return "3"
2333        self.assertEqual(S("4") + S("5"), "3")
2334        class S(str):
2335            def __iadd__(self, o):
2336                return "3"
2337        s = S("1")
2338        s += "4"
2339        self.assertEqual(s, "3")
2340
2341    def test_getnewargs(self):
2342        text = 'abc'
2343        args = text.__getnewargs__()
2344        self.assertIsNot(args[0], text)
2345        self.assertEqual(args[0], text)
2346        self.assertEqual(len(args), 1)
2347
2348    def test_resize(self):
2349        for length in range(1, 100, 7):
2350            # generate a fresh string (refcount=1)
2351            text = 'a' * length + 'b'
2352
2353            with support.check_warnings(('unicode_internal codec has been '
2354                                         'deprecated', DeprecationWarning)):
2355                # fill wstr internal field
2356                abc = text.encode('unicode_internal')
2357                self.assertEqual(abc.decode('unicode_internal'), text)
2358
2359                # resize text: wstr field must be cleared and then recomputed
2360                text += 'c'
2361                abcdef = text.encode('unicode_internal')
2362                self.assertNotEqual(abc, abcdef)
2363                self.assertEqual(abcdef.decode('unicode_internal'), text)
2364
2365    def test_compare(self):
2366        # Issue #17615
2367        N = 10
2368        ascii = 'a' * N
2369        ascii2 = 'z' * N
2370        latin = '\x80' * N
2371        latin2 = '\xff' * N
2372        bmp = '\u0100' * N
2373        bmp2 = '\uffff' * N
2374        astral = '\U00100000' * N
2375        astral2 = '\U0010ffff' * N
2376        strings = (
2377            ascii, ascii2,
2378            latin, latin2,
2379            bmp, bmp2,
2380            astral, astral2)
2381        for text1, text2 in itertools.combinations(strings, 2):
2382            equal = (text1 is text2)
2383            self.assertEqual(text1 == text2, equal)
2384            self.assertEqual(text1 != text2, not equal)
2385
2386            if equal:
2387                self.assertTrue(text1 <= text2)
2388                self.assertTrue(text1 >= text2)
2389
2390                # text1 is text2: duplicate strings to skip the "str1 == str2"
2391                # optimization in unicode_compare_eq() and really compare
2392                # character per character
2393                copy1 = duplicate_string(text1)
2394                copy2 = duplicate_string(text2)
2395                self.assertIsNot(copy1, copy2)
2396
2397                self.assertTrue(copy1 == copy2)
2398                self.assertFalse(copy1 != copy2)
2399
2400                self.assertTrue(copy1 <= copy2)
2401                self.assertTrue(copy2 >= copy2)
2402
2403        self.assertTrue(ascii < ascii2)
2404        self.assertTrue(ascii < latin)
2405        self.assertTrue(ascii < bmp)
2406        self.assertTrue(ascii < astral)
2407        self.assertFalse(ascii >= ascii2)
2408        self.assertFalse(ascii >= latin)
2409        self.assertFalse(ascii >= bmp)
2410        self.assertFalse(ascii >= astral)
2411
2412        self.assertFalse(latin < ascii)
2413        self.assertTrue(latin < latin2)
2414        self.assertTrue(latin < bmp)
2415        self.assertTrue(latin < astral)
2416        self.assertTrue(latin >= ascii)
2417        self.assertFalse(latin >= latin2)
2418        self.assertFalse(latin >= bmp)
2419        self.assertFalse(latin >= astral)
2420
2421        self.assertFalse(bmp < ascii)
2422        self.assertFalse(bmp < latin)
2423        self.assertTrue(bmp < bmp2)
2424        self.assertTrue(bmp < astral)
2425        self.assertTrue(bmp >= ascii)
2426        self.assertTrue(bmp >= latin)
2427        self.assertFalse(bmp >= bmp2)
2428        self.assertFalse(bmp >= astral)
2429
2430        self.assertFalse(astral < ascii)
2431        self.assertFalse(astral < latin)
2432        self.assertFalse(astral < bmp2)
2433        self.assertTrue(astral < astral2)
2434        self.assertTrue(astral >= ascii)
2435        self.assertTrue(astral >= latin)
2436        self.assertTrue(astral >= bmp2)
2437        self.assertFalse(astral >= astral2)
2438
2439    def test_free_after_iterating(self):
2440        support.check_free_after_iterating(self, iter, str)
2441        support.check_free_after_iterating(self, reversed, str)
2442
2443
2444class CAPITest(unittest.TestCase):
2445
2446    # Test PyUnicode_FromFormat()
2447    def test_from_format(self):
2448        support.import_module('ctypes')
2449        from ctypes import (
2450            pythonapi, py_object, sizeof,
2451            c_int, c_long, c_longlong, c_ssize_t,
2452            c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p)
2453        name = "PyUnicode_FromFormat"
2454        _PyUnicode_FromFormat = getattr(pythonapi, name)
2455        _PyUnicode_FromFormat.restype = py_object
2456
2457        def PyUnicode_FromFormat(format, *args):
2458            cargs = tuple(
2459                py_object(arg) if isinstance(arg, str) else arg
2460                for arg in args)
2461            return _PyUnicode_FromFormat(format, *cargs)
2462
2463        def check_format(expected, format, *args):
2464            text = PyUnicode_FromFormat(format, *args)
2465            self.assertEqual(expected, text)
2466
2467        # ascii format, non-ascii argument
2468        check_format('ascii\x7f=unicode\xe9',
2469                     b'ascii\x7f=%U', 'unicode\xe9')
2470
2471        # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
2472        # raises an error
2473        self.assertRaisesRegex(ValueError,
2474            r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
2475            'string, got a non-ASCII byte: 0xe9$',
2476            PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')
2477
2478        # test "%c"
2479        check_format('\uabcd',
2480                     b'%c', c_int(0xabcd))
2481        check_format('\U0010ffff',
2482                     b'%c', c_int(0x10ffff))
2483        with self.assertRaises(OverflowError):
2484            PyUnicode_FromFormat(b'%c', c_int(0x110000))
2485        # Issue #18183
2486        check_format('\U00010000\U00100000',
2487                     b'%c%c', c_int(0x10000), c_int(0x100000))
2488
2489        # test "%"
2490        check_format('%',
2491                     b'%')
2492        check_format('%',
2493                     b'%%')
2494        check_format('%s',
2495                     b'%%s')
2496        check_format('[%]',
2497                     b'[%%]')
2498        check_format('%abc',
2499                     b'%%%s', b'abc')
2500
2501        # truncated string
2502        check_format('abc',
2503                     b'%.3s', b'abcdef')
2504        check_format('abc[\ufffd',
2505                     b'%.5s', 'abc[\u20ac]'.encode('utf8'))
2506        check_format("'\\u20acABC'",
2507                     b'%A', '\u20acABC')
2508        check_format("'\\u20",
2509                     b'%.5A', '\u20acABCDEF')
2510        check_format("'\u20acABC'",
2511                     b'%R', '\u20acABC')
2512        check_format("'\u20acA",
2513                     b'%.3R', '\u20acABCDEF')
2514        check_format('\u20acAB',
2515                     b'%.3S', '\u20acABCDEF')
2516        check_format('\u20acAB',
2517                     b'%.3U', '\u20acABCDEF')
2518        check_format('\u20acAB',
2519                     b'%.3V', '\u20acABCDEF', None)
2520        check_format('abc[\ufffd',
2521                     b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))
2522
2523        # following tests comes from #7330
2524        # test width modifier and precision modifier with %S
2525        check_format("repr=  abc",
2526                     b'repr=%5S', 'abc')
2527        check_format("repr=ab",
2528                     b'repr=%.2S', 'abc')
2529        check_format("repr=   ab",
2530                     b'repr=%5.2S', 'abc')
2531
2532        # test width modifier and precision modifier with %R
2533        check_format("repr=   'abc'",
2534                     b'repr=%8R', 'abc')
2535        check_format("repr='ab",
2536                     b'repr=%.3R', 'abc')
2537        check_format("repr=  'ab",
2538                     b'repr=%5.3R', 'abc')
2539
2540        # test width modifier and precision modifier with %A
2541        check_format("repr=   'abc'",
2542                     b'repr=%8A', 'abc')
2543        check_format("repr='ab",
2544                     b'repr=%.3A', 'abc')
2545        check_format("repr=  'ab",
2546                     b'repr=%5.3A', 'abc')
2547
2548        # test width modifier and precision modifier with %s
2549        check_format("repr=  abc",
2550                     b'repr=%5s', b'abc')
2551        check_format("repr=ab",
2552                     b'repr=%.2s', b'abc')
2553        check_format("repr=   ab",
2554                     b'repr=%5.2s', b'abc')
2555
2556        # test width modifier and precision modifier with %U
2557        check_format("repr=  abc",
2558                     b'repr=%5U', 'abc')
2559        check_format("repr=ab",
2560                     b'repr=%.2U', 'abc')
2561        check_format("repr=   ab",
2562                     b'repr=%5.2U', 'abc')
2563
2564        # test width modifier and precision modifier with %V
2565        check_format("repr=  abc",
2566                     b'repr=%5V', 'abc', b'123')
2567        check_format("repr=ab",
2568                     b'repr=%.2V', 'abc', b'123')
2569        check_format("repr=   ab",
2570                     b'repr=%5.2V', 'abc', b'123')
2571        check_format("repr=  123",
2572                     b'repr=%5V', None, b'123')
2573        check_format("repr=12",
2574                     b'repr=%.2V', None, b'123')
2575        check_format("repr=   12",
2576                     b'repr=%5.2V', None, b'123')
2577
2578        # test integer formats (%i, %d, %u)
2579        check_format('010',
2580                     b'%03i', c_int(10))
2581        check_format('0010',
2582                     b'%0.4i', c_int(10))
2583        check_format('-123',
2584                     b'%i', c_int(-123))
2585        check_format('-123',
2586                     b'%li', c_long(-123))
2587        check_format('-123',
2588                     b'%lli', c_longlong(-123))
2589        check_format('-123',
2590                     b'%zi', c_ssize_t(-123))
2591
2592        check_format('-123',
2593                     b'%d', c_int(-123))
2594        check_format('-123',
2595                     b'%ld', c_long(-123))
2596        check_format('-123',
2597                     b'%lld', c_longlong(-123))
2598        check_format('-123',
2599                     b'%zd', c_ssize_t(-123))
2600
2601        check_format('123',
2602                     b'%u', c_uint(123))
2603        check_format('123',
2604                     b'%lu', c_ulong(123))
2605        check_format('123',
2606                     b'%llu', c_ulonglong(123))
2607        check_format('123',
2608                     b'%zu', c_size_t(123))
2609
2610        # test long output
2611        min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1))
2612        max_longlong = -min_longlong - 1
2613        check_format(str(min_longlong),
2614                     b'%lld', c_longlong(min_longlong))
2615        check_format(str(max_longlong),
2616                     b'%lld', c_longlong(max_longlong))
2617        max_ulonglong = 2 ** (8 * sizeof(c_ulonglong)) - 1
2618        check_format(str(max_ulonglong),
2619                     b'%llu', c_ulonglong(max_ulonglong))
2620        PyUnicode_FromFormat(b'%p', c_void_p(-1))
2621
2622        # test padding (width and/or precision)
2623        check_format('123'.rjust(10, '0'),
2624                     b'%010i', c_int(123))
2625        check_format('123'.rjust(100),
2626                     b'%100i', c_int(123))
2627        check_format('123'.rjust(100, '0'),
2628                     b'%.100i', c_int(123))
2629        check_format('123'.rjust(80, '0').rjust(100),
2630                     b'%100.80i', c_int(123))
2631
2632        check_format('123'.rjust(10, '0'),
2633                     b'%010u', c_uint(123))
2634        check_format('123'.rjust(100),
2635                     b'%100u', c_uint(123))
2636        check_format('123'.rjust(100, '0'),
2637                     b'%.100u', c_uint(123))
2638        check_format('123'.rjust(80, '0').rjust(100),
2639                     b'%100.80u', c_uint(123))
2640
2641        check_format('123'.rjust(10, '0'),
2642                     b'%010x', c_int(0x123))
2643        check_format('123'.rjust(100),
2644                     b'%100x', c_int(0x123))
2645        check_format('123'.rjust(100, '0'),
2646                     b'%.100x', c_int(0x123))
2647        check_format('123'.rjust(80, '0').rjust(100),
2648                     b'%100.80x', c_int(0x123))
2649
2650        # test %A
2651        check_format(r"%A:'abc\xe9\uabcd\U0010ffff'",
2652                     b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')
2653
2654        # test %V
2655        check_format('repr=abc',
2656                     b'repr=%V', 'abc', b'xyz')
2657
2658        # Test string decode from parameter of %s using utf-8.
2659        # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of
2660        # '\u4eba\u6c11'
2661        check_format('repr=\u4eba\u6c11',
2662                     b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')
2663
2664        #Test replace error handler.
2665        check_format('repr=abc\ufffd',
2666                     b'repr=%V', None, b'abc\xff')
2667
2668        # not supported: copy the raw format string. these tests are just here
2669        # to check for crashes and should not be considered as specifications
2670        check_format('%s',
2671                     b'%1%s', b'abc')
2672        check_format('%1abc',
2673                     b'%1abc')
2674        check_format('%+i',
2675                     b'%+i', c_int(10))
2676        check_format('%.%s',
2677                     b'%.%s', b'abc')
2678
2679        # Issue #33817: empty strings
2680        check_format('',
2681                     b'')
2682        check_format('',
2683                     b'%s', b'')
2684
2685    # Test PyUnicode_AsWideChar()
2686    @support.cpython_only
2687    def test_aswidechar(self):
2688        from _testcapi import unicode_aswidechar
2689        support.import_module('ctypes')
2690        from ctypes import c_wchar, sizeof
2691
2692        wchar, size = unicode_aswidechar('abcdef', 2)
2693        self.assertEqual(size, 2)
2694        self.assertEqual(wchar, 'ab')
2695
2696        wchar, size = unicode_aswidechar('abc', 3)
2697        self.assertEqual(size, 3)
2698        self.assertEqual(wchar, 'abc')
2699
2700        wchar, size = unicode_aswidechar('abc', 4)
2701        self.assertEqual(size, 3)
2702        self.assertEqual(wchar, 'abc\0')
2703
2704        wchar, size = unicode_aswidechar('abc', 10)
2705        self.assertEqual(size, 3)
2706        self.assertEqual(wchar, 'abc\0')
2707
2708        wchar, size = unicode_aswidechar('abc\0def', 20)
2709        self.assertEqual(size, 7)
2710        self.assertEqual(wchar, 'abc\0def\0')
2711
2712        nonbmp = chr(0x10ffff)
2713        if sizeof(c_wchar) == 2:
2714            buflen = 3
2715            nchar = 2
2716        else: # sizeof(c_wchar) == 4
2717            buflen = 2
2718            nchar = 1
2719        wchar, size = unicode_aswidechar(nonbmp, buflen)
2720        self.assertEqual(size, nchar)
2721        self.assertEqual(wchar, nonbmp + '\0')
2722
2723    # Test PyUnicode_AsWideCharString()
2724    @support.cpython_only
2725    def test_aswidecharstring(self):
2726        from _testcapi import unicode_aswidecharstring
2727        support.import_module('ctypes')
2728        from ctypes import c_wchar, sizeof
2729
2730        wchar, size = unicode_aswidecharstring('abc')
2731        self.assertEqual(size, 3)
2732        self.assertEqual(wchar, 'abc\0')
2733
2734        wchar, size = unicode_aswidecharstring('abc\0def')
2735        self.assertEqual(size, 7)
2736        self.assertEqual(wchar, 'abc\0def\0')
2737
2738        nonbmp = chr(0x10ffff)
2739        if sizeof(c_wchar) == 2:
2740            nchar = 2
2741        else: # sizeof(c_wchar) == 4
2742            nchar = 1
2743        wchar, size = unicode_aswidecharstring(nonbmp)
2744        self.assertEqual(size, nchar)
2745        self.assertEqual(wchar, nonbmp + '\0')
2746
2747    # Test PyUnicode_AsUCS4()
2748    @support.cpython_only
2749    def test_asucs4(self):
2750        from _testcapi import unicode_asucs4
2751        for s in ['abc', '\xa1\xa2', '\u4f60\u597d', 'a\U0001f600',
2752                  'a\ud800b\udfffc', '\ud834\udd1e']:
2753            l = len(s)
2754            self.assertEqual(unicode_asucs4(s, l, 1), s+'\0')
2755            self.assertEqual(unicode_asucs4(s, l, 0), s+'\uffff')
2756            self.assertEqual(unicode_asucs4(s, l+1, 1), s+'\0\uffff')
2757            self.assertEqual(unicode_asucs4(s, l+1, 0), s+'\0\uffff')
2758            self.assertRaises(SystemError, unicode_asucs4, s, l-1, 1)
2759            self.assertRaises(SystemError, unicode_asucs4, s, l-2, 0)
2760            s = '\0'.join([s, s])
2761            self.assertEqual(unicode_asucs4(s, len(s), 1), s+'\0')
2762            self.assertEqual(unicode_asucs4(s, len(s), 0), s+'\uffff')
2763
2764    # Test PyUnicode_FindChar()
2765    @support.cpython_only
2766    def test_findchar(self):
2767        from _testcapi import unicode_findchar
2768
2769        for str in "\xa1", "\u8000\u8080", "\ud800\udc02", "\U0001f100\U0001f1f1":
2770            for i, ch in enumerate(str):
2771                self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), 1), i)
2772                self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), -1), i)
2773
2774        str = "!>_<!"
2775        self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), 1), -1)
2776        self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), -1), -1)
2777        # start < end
2778        self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, 1), 4)
2779        self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, -1), 4)
2780        # start >= end
2781        self.assertEqual(unicode_findchar(str, ord('!'), 0, 0, 1), -1)
2782        self.assertEqual(unicode_findchar(str, ord('!'), len(str), 0, 1), -1)
2783        # negative
2784        self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, 1), 0)
2785        self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, -1), 0)
2786
2787    # Test PyUnicode_CopyCharacters()
2788    @support.cpython_only
2789    def test_copycharacters(self):
2790        from _testcapi import unicode_copycharacters
2791
2792        strings = [
2793            'abcde', '\xa1\xa2\xa3\xa4\xa5',
2794            '\u4f60\u597d\u4e16\u754c\uff01',
2795            '\U0001f600\U0001f601\U0001f602\U0001f603\U0001f604'
2796        ]
2797
2798        for idx, from_ in enumerate(strings):
2799            # wide -> narrow: exceed maxchar limitation
2800            for to in strings[:idx]:
2801                self.assertRaises(
2802                    SystemError,
2803                    unicode_copycharacters, to, 0, from_, 0, 5
2804                )
2805            # same kind
2806            for from_start in range(5):
2807                self.assertEqual(
2808                    unicode_copycharacters(from_, 0, from_, from_start, 5),
2809                    (from_[from_start:from_start+5].ljust(5, '\0'),
2810                     5-from_start)
2811                )
2812            for to_start in range(5):
2813                self.assertEqual(
2814                    unicode_copycharacters(from_, to_start, from_, to_start, 5),
2815                    (from_[to_start:to_start+5].rjust(5, '\0'),
2816                     5-to_start)
2817                )
2818            # narrow -> wide
2819            # Tests omitted since this creates invalid strings.
2820
2821        s = strings[0]
2822        self.assertRaises(IndexError, unicode_copycharacters, s, 6, s, 0, 5)
2823        self.assertRaises(IndexError, unicode_copycharacters, s, -1, s, 0, 5)
2824        self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, 6, 5)
2825        self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, -1, 5)
2826        self.assertRaises(SystemError, unicode_copycharacters, s, 1, s, 0, 5)
2827        self.assertRaises(SystemError, unicode_copycharacters, s, 0, s, 0, -1)
2828        self.assertRaises(SystemError, unicode_copycharacters, s, 0, b'', 0, 0)
2829
2830    @support.cpython_only
2831    def test_encode_decimal(self):
2832        from _testcapi import unicode_encodedecimal
2833        self.assertEqual(unicode_encodedecimal('123'),
2834                         b'123')
2835        self.assertEqual(unicode_encodedecimal('\u0663.\u0661\u0664'),
2836                         b'3.14')
2837        self.assertEqual(unicode_encodedecimal("\N{EM SPACE}3.14\N{EN SPACE}"),
2838                         b' 3.14 ')
2839        self.assertRaises(UnicodeEncodeError,
2840                          unicode_encodedecimal, "123\u20ac", "strict")
2841        self.assertRaisesRegex(
2842            ValueError,
2843            "^'decimal' codec can't encode character",
2844            unicode_encodedecimal, "123\u20ac", "replace")
2845
2846    @support.cpython_only
2847    def test_transform_decimal(self):
2848        from _testcapi import unicode_transformdecimaltoascii as transform_decimal
2849        self.assertEqual(transform_decimal('123'),
2850                         '123')
2851        self.assertEqual(transform_decimal('\u0663.\u0661\u0664'),
2852                         '3.14')
2853        self.assertEqual(transform_decimal("\N{EM SPACE}3.14\N{EN SPACE}"),
2854                         "\N{EM SPACE}3.14\N{EN SPACE}")
2855        self.assertEqual(transform_decimal('123\u20ac'),
2856                         '123\u20ac')
2857
2858    @support.cpython_only
2859    def test_pep393_utf8_caching_bug(self):
2860        # Issue #25709: Problem with string concatenation and utf-8 cache
2861        from _testcapi import getargs_s_hash
2862        for k in 0x24, 0xa4, 0x20ac, 0x1f40d:
2863            s = ''
2864            for i in range(5):
2865                # Due to CPython specific optimization the 's' string can be
2866                # resized in-place.
2867                s += chr(k)
2868                # Parsing with the "s#" format code calls indirectly
2869                # PyUnicode_AsUTF8AndSize() which creates the UTF-8
2870                # encoded string cached in the Unicode object.
2871                self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
2872                # Check that the second call returns the same result
2873                self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
2874
class StringModuleTest(unittest.TestCase):
    # Tests for the helpers exposed by the private _string module.

    def test_formatter_parser(self):
        # formatter_parser() yields one
        # (literal text, field name, format spec, conversion) tuple per
        # replacement field, plus one for any trailing literal text.
        cases = (
            ("prefix {2!s}xxx{0:^+10.3f}{obj.attr!s} {z[0]!s:10}", [
                ('prefix ', '2', '', 's'),
                ('xxx', '0', '^+10.3f', None),
                ('', 'obj.attr', '', 's'),
                (' ', 'z[0]', '10', 's'),
            ]),
            ("prefix {} suffix", [
                ('prefix ', '', '', None),
                (' suffix', None, None, None),
            ]),
            ("str", [
                ('str', None, None, None),
            ]),
            ("", []),
            ("{0}", [
                ('', '0', '', None),
            ]),
        )
        for template, expected in cases:
            parsed = list(_string.formatter_parser(template))
            self.assertEqual(parsed, expected)

        # Only strings are accepted.
        with self.assertRaises(TypeError):
            _string.formatter_parser(1)

    def test_formatter_field_name_split(self):
        # formatter_field_name_split() returns the leading name together with
        # an iterable of (is_attribute, name) pairs for what follows it.
        def split(name):
            first, rest = _string.formatter_field_name_split(name)
            return [first, list(rest)]

        cases = (
            ("obj", ["obj", []]),
            ("obj.arg", ["obj", [(True, 'arg')]]),
            ("obj[key]", ["obj", [(False, 'key')]]),
            ("obj.arg[key1][key2]", [
                "obj",
                [(True, 'arg'),
                 (False, 'key1'),
                 (False, 'key2'),
                ]]),
        )
        for field_name, expected in cases:
            self.assertEqual(split(field_name), expected)

        # Only strings are accepted.
        with self.assertRaises(TypeError):
            _string.formatter_field_name_split(1)
2924
2925
# Run every test case defined in this module when the file is executed
# directly as a script (as opposed to being imported by a test runner).
if __name__ == "__main__":
    unittest.main()
2928