• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1""" Test script for the Unicode implementation.
2
3Written by Marc-Andre Lemburg (mal@lemburg.com).
4
5(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6
7"""
8import _string
9import codecs
10import itertools
11import operator
12import struct
13import sys
14import unicodedata
15import unittest
16import warnings
17from test import support, string_tests
18
19# Error handling (bad decoder return)
def search_function(encoding):
    """Codec search hook that hands back deliberately malformed codecs.

    "test.unicode1" maps to an encoder/decoder pair returning a bare int
    rather than a tuple; "test.unicode2" maps to a pair returning a tuple
    whose first element is not a string.  Every other name yields None.
    """
    def encode_scalar(input, errors="strict"):
        return 42 # not a tuple

    def decode_scalar(input, errors="strict"):
        return 42 # not a tuple

    def encode_pair(input, errors="strict"):
        return (42, 42) # no unicode

    def decode_pair(input, errors="strict"):
        return (42, 42) # no unicode

    if encoding == "test.unicode1":
        return (encode_scalar, decode_scalar, None, None)
    if encoding == "test.unicode2":
        return (encode_pair, decode_pair, None, None)
    return None
codecs.register(search_function)
36
def duplicate_string(text):
    """
    Try to get a fresh clone of the specified text:
    new object with a reference count of 1.

    This is a best-effort: latin1 single letters and the empty
    string ('') are singletons and cannot be cloned.

    The copy is made by a UTF-8 encode/decode round trip.  The
    'surrogatepass' error handler is used in both directions so that
    text containing lone surrogates is cloned as well instead of
    raising UnicodeEncodeError.
    """
    return text.encode('utf-8', 'surrogatepass').decode('utf-8', 'surrogatepass')
46
class StrSubclass(str):
    """Plain subclass of str that adds no attributes or methods."""
    pass
49
50class UnicodeTest(string_tests.CommonTest,
51        string_tests.MixinStrUnicodeUserStringTest,
52        string_tests.MixinStrUnicodeTest,
53        unittest.TestCase):
54
55    type2test = str
56
57    def checkequalnofix(self, result, object, methodname, *args):
58        method = getattr(object, methodname)
59        realresult = method(*args)
60        self.assertEqual(realresult, result)
61        self.assertTrue(type(realresult) is type(result))
62
63        # if the original is returned make sure that
64        # this doesn't happen with subclasses
65        if realresult is object:
66            class usub(str):
67                def __repr__(self):
68                    return 'usub(%r)' % str.__repr__(self)
69            object = usub(object)
70            method = getattr(object, methodname)
71            realresult = method(*args)
72            self.assertEqual(realresult, result)
73            self.assertTrue(object is not realresult)
74
75    def test_literals(self):
76        self.assertEqual('\xff', '\u00ff')
77        self.assertEqual('\uffff', '\U0000ffff')
78        self.assertRaises(SyntaxError, eval, '\'\\Ufffffffe\'')
79        self.assertRaises(SyntaxError, eval, '\'\\Uffffffff\'')
80        self.assertRaises(SyntaxError, eval, '\'\\U%08x\'' % 0x110000)
81        # raw strings should not have unicode escapes
82        self.assertNotEqual(r"\u0020", " ")
83
84    def test_ascii(self):
85        if not sys.platform.startswith('java'):
86            # Test basic sanity of repr()
87            self.assertEqual(ascii('abc'), "'abc'")
88            self.assertEqual(ascii('ab\\c'), "'ab\\\\c'")
89            self.assertEqual(ascii('ab\\'), "'ab\\\\'")
90            self.assertEqual(ascii('\\c'), "'\\\\c'")
91            self.assertEqual(ascii('\\'), "'\\\\'")
92            self.assertEqual(ascii('\n'), "'\\n'")
93            self.assertEqual(ascii('\r'), "'\\r'")
94            self.assertEqual(ascii('\t'), "'\\t'")
95            self.assertEqual(ascii('\b'), "'\\x08'")
96            self.assertEqual(ascii("'\""), """'\\'"'""")
97            self.assertEqual(ascii("'\""), """'\\'"'""")
98            self.assertEqual(ascii("'"), '''"'"''')
99            self.assertEqual(ascii('"'), """'"'""")
100            latin1repr = (
101                "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
102                "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
103                "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
104                "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
105                "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
106                "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
107                "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
108                "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
109                "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
110                "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
111                "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
112                "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
113                "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
114                "\\xfe\\xff'")
115            testrepr = ascii(''.join(map(chr, range(256))))
116            self.assertEqual(testrepr, latin1repr)
117            # Test ascii works on wide unicode escapes without overflow.
118            self.assertEqual(ascii("\U00010000" * 39 + "\uffff" * 4096),
119                             ascii("\U00010000" * 39 + "\uffff" * 4096))
120
121            class WrongRepr:
122                def __repr__(self):
123                    return b'byte-repr'
124            self.assertRaises(TypeError, ascii, WrongRepr())
125
126    def test_repr(self):
127        if not sys.platform.startswith('java'):
128            # Test basic sanity of repr()
129            self.assertEqual(repr('abc'), "'abc'")
130            self.assertEqual(repr('ab\\c'), "'ab\\\\c'")
131            self.assertEqual(repr('ab\\'), "'ab\\\\'")
132            self.assertEqual(repr('\\c'), "'\\\\c'")
133            self.assertEqual(repr('\\'), "'\\\\'")
134            self.assertEqual(repr('\n'), "'\\n'")
135            self.assertEqual(repr('\r'), "'\\r'")
136            self.assertEqual(repr('\t'), "'\\t'")
137            self.assertEqual(repr('\b'), "'\\x08'")
138            self.assertEqual(repr("'\""), """'\\'"'""")
139            self.assertEqual(repr("'\""), """'\\'"'""")
140            self.assertEqual(repr("'"), '''"'"''')
141            self.assertEqual(repr('"'), """'"'""")
142            latin1repr = (
143                "'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
144                "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
145                "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
146                "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
147                "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
148                "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
149                "\\x9c\\x9d\\x9e\\x9f\\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9"
150                "\xaa\xab\xac\\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7"
151                "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5"
152                "\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3"
153                "\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1"
154                "\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
155                "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd"
156                "\xfe\xff'")
157            testrepr = repr(''.join(map(chr, range(256))))
158            self.assertEqual(testrepr, latin1repr)
159            # Test repr works on wide unicode escapes without overflow.
160            self.assertEqual(repr("\U00010000" * 39 + "\uffff" * 4096),
161                             repr("\U00010000" * 39 + "\uffff" * 4096))
162
163            class WrongRepr:
164                def __repr__(self):
165                    return b'byte-repr'
166            self.assertRaises(TypeError, repr, WrongRepr())
167
168    def test_iterators(self):
169        # Make sure unicode objects have an __iter__ method
170        it = "\u1111\u2222\u3333".__iter__()
171        self.assertEqual(next(it), "\u1111")
172        self.assertEqual(next(it), "\u2222")
173        self.assertEqual(next(it), "\u3333")
174        self.assertRaises(StopIteration, next, it)
175
176    def test_count(self):
177        string_tests.CommonTest.test_count(self)
178        # check mixed argument types
179        self.checkequalnofix(3,  'aaa', 'count', 'a')
180        self.checkequalnofix(0,  'aaa', 'count', 'b')
181        self.checkequalnofix(3, 'aaa', 'count',  'a')
182        self.checkequalnofix(0, 'aaa', 'count',  'b')
183        self.checkequalnofix(0, 'aaa', 'count',  'b')
184        self.checkequalnofix(1, 'aaa', 'count',  'a', -1)
185        self.checkequalnofix(3, 'aaa', 'count',  'a', -10)
186        self.checkequalnofix(2, 'aaa', 'count',  'a', 0, -1)
187        self.checkequalnofix(0, 'aaa', 'count',  'a', 0, -10)
188        # test mixed kinds
189        self.checkequal(10, '\u0102' + 'a' * 10, 'count', 'a')
190        self.checkequal(10, '\U00100304' + 'a' * 10, 'count', 'a')
191        self.checkequal(10, '\U00100304' + '\u0102' * 10, 'count', '\u0102')
192        self.checkequal(0, 'a' * 10, 'count', '\u0102')
193        self.checkequal(0, 'a' * 10, 'count', '\U00100304')
194        self.checkequal(0, '\u0102' * 10, 'count', '\U00100304')
195        self.checkequal(10, '\u0102' + 'a_' * 10, 'count', 'a_')
196        self.checkequal(10, '\U00100304' + 'a_' * 10, 'count', 'a_')
197        self.checkequal(10, '\U00100304' + '\u0102_' * 10, 'count', '\u0102_')
198        self.checkequal(0, 'a' * 10, 'count', 'a\u0102')
199        self.checkequal(0, 'a' * 10, 'count', 'a\U00100304')
200        self.checkequal(0, '\u0102' * 10, 'count', '\u0102\U00100304')
201
202    def test_find(self):
203        string_tests.CommonTest.test_find(self)
204        # test implementation details of the memchr fast path
205        self.checkequal(100, 'a' * 100 + '\u0102', 'find', '\u0102')
206        self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0201')
207        self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0120')
208        self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0220')
209        self.checkequal(100, 'a' * 100 + '\U00100304', 'find', '\U00100304')
210        self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00100204')
211        self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00102004')
212        # check mixed argument types
213        self.checkequalnofix(0,  'abcdefghiabc', 'find', 'abc')
214        self.checkequalnofix(9,  'abcdefghiabc', 'find', 'abc', 1)
215        self.checkequalnofix(-1, 'abcdefghiabc', 'find', 'def', 4)
216
217        self.assertRaises(TypeError, 'hello'.find)
218        self.assertRaises(TypeError, 'hello'.find, 42)
219        # test mixed kinds
220        self.checkequal(100, '\u0102' * 100 + 'a', 'find', 'a')
221        self.checkequal(100, '\U00100304' * 100 + 'a', 'find', 'a')
222        self.checkequal(100, '\U00100304' * 100 + '\u0102', 'find', '\u0102')
223        self.checkequal(-1, 'a' * 100, 'find', '\u0102')
224        self.checkequal(-1, 'a' * 100, 'find', '\U00100304')
225        self.checkequal(-1, '\u0102' * 100, 'find', '\U00100304')
226        self.checkequal(100, '\u0102' * 100 + 'a_', 'find', 'a_')
227        self.checkequal(100, '\U00100304' * 100 + 'a_', 'find', 'a_')
228        self.checkequal(100, '\U00100304' * 100 + '\u0102_', 'find', '\u0102_')
229        self.checkequal(-1, 'a' * 100, 'find', 'a\u0102')
230        self.checkequal(-1, 'a' * 100, 'find', 'a\U00100304')
231        self.checkequal(-1, '\u0102' * 100, 'find', '\u0102\U00100304')
232
233    def test_rfind(self):
234        string_tests.CommonTest.test_rfind(self)
235        # test implementation details of the memrchr fast path
236        self.checkequal(0, '\u0102' + 'a' * 100 , 'rfind', '\u0102')
237        self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0201')
238        self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0120')
239        self.checkequal(-1, '\u0102' + 'a' * 100 , 'rfind', '\u0220')
240        self.checkequal(0, '\U00100304' + 'a' * 100, 'rfind', '\U00100304')
241        self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00100204')
242        self.checkequal(-1, '\U00100304' + 'a' * 100, 'rfind', '\U00102004')
243        # check mixed argument types
244        self.checkequalnofix(9,   'abcdefghiabc', 'rfind', 'abc')
245        self.checkequalnofix(12,  'abcdefghiabc', 'rfind', '')
246        self.checkequalnofix(12, 'abcdefghiabc', 'rfind',  '')
247        # test mixed kinds
248        self.checkequal(0, 'a' + '\u0102' * 100, 'rfind', 'a')
249        self.checkequal(0, 'a' + '\U00100304' * 100, 'rfind', 'a')
250        self.checkequal(0, '\u0102' + '\U00100304' * 100, 'rfind', '\u0102')
251        self.checkequal(-1, 'a' * 100, 'rfind', '\u0102')
252        self.checkequal(-1, 'a' * 100, 'rfind', '\U00100304')
253        self.checkequal(-1, '\u0102' * 100, 'rfind', '\U00100304')
254        self.checkequal(0, '_a' + '\u0102' * 100, 'rfind', '_a')
255        self.checkequal(0, '_a' + '\U00100304' * 100, 'rfind', '_a')
256        self.checkequal(0, '_\u0102' + '\U00100304' * 100, 'rfind', '_\u0102')
257        self.checkequal(-1, 'a' * 100, 'rfind', '\u0102a')
258        self.checkequal(-1, 'a' * 100, 'rfind', '\U00100304a')
259        self.checkequal(-1, '\u0102' * 100, 'rfind', '\U00100304\u0102')
260
261    def test_index(self):
262        string_tests.CommonTest.test_index(self)
263        self.checkequalnofix(0, 'abcdefghiabc', 'index',  '')
264        self.checkequalnofix(3, 'abcdefghiabc', 'index',  'def')
265        self.checkequalnofix(0, 'abcdefghiabc', 'index',  'abc')
266        self.checkequalnofix(9, 'abcdefghiabc', 'index',  'abc', 1)
267        self.assertRaises(ValueError, 'abcdefghiabc'.index, 'hib')
268        self.assertRaises(ValueError, 'abcdefghiab'.index,  'abc', 1)
269        self.assertRaises(ValueError, 'abcdefghi'.index,  'ghi', 8)
270        self.assertRaises(ValueError, 'abcdefghi'.index,  'ghi', -1)
271        # test mixed kinds
272        self.checkequal(100, '\u0102' * 100 + 'a', 'index', 'a')
273        self.checkequal(100, '\U00100304' * 100 + 'a', 'index', 'a')
274        self.checkequal(100, '\U00100304' * 100 + '\u0102', 'index', '\u0102')
275        self.assertRaises(ValueError, ('a' * 100).index, '\u0102')
276        self.assertRaises(ValueError, ('a' * 100).index, '\U00100304')
277        self.assertRaises(ValueError, ('\u0102' * 100).index, '\U00100304')
278        self.checkequal(100, '\u0102' * 100 + 'a_', 'index', 'a_')
279        self.checkequal(100, '\U00100304' * 100 + 'a_', 'index', 'a_')
280        self.checkequal(100, '\U00100304' * 100 + '\u0102_', 'index', '\u0102_')
281        self.assertRaises(ValueError, ('a' * 100).index, 'a\u0102')
282        self.assertRaises(ValueError, ('a' * 100).index, 'a\U00100304')
283        self.assertRaises(ValueError, ('\u0102' * 100).index, '\u0102\U00100304')
284
285    def test_rindex(self):
286        string_tests.CommonTest.test_rindex(self)
287        self.checkequalnofix(12, 'abcdefghiabc', 'rindex',  '')
288        self.checkequalnofix(3,  'abcdefghiabc', 'rindex',  'def')
289        self.checkequalnofix(9,  'abcdefghiabc', 'rindex',  'abc')
290        self.checkequalnofix(0,  'abcdefghiabc', 'rindex',  'abc', 0, -1)
291
292        self.assertRaises(ValueError, 'abcdefghiabc'.rindex,  'hib')
293        self.assertRaises(ValueError, 'defghiabc'.rindex,  'def', 1)
294        self.assertRaises(ValueError, 'defghiabc'.rindex,  'abc', 0, -1)
295        self.assertRaises(ValueError, 'abcdefghi'.rindex,  'ghi', 0, 8)
296        self.assertRaises(ValueError, 'abcdefghi'.rindex,  'ghi', 0, -1)
297        # test mixed kinds
298        self.checkequal(0, 'a' + '\u0102' * 100, 'rindex', 'a')
299        self.checkequal(0, 'a' + '\U00100304' * 100, 'rindex', 'a')
300        self.checkequal(0, '\u0102' + '\U00100304' * 100, 'rindex', '\u0102')
301        self.assertRaises(ValueError, ('a' * 100).rindex, '\u0102')
302        self.assertRaises(ValueError, ('a' * 100).rindex, '\U00100304')
303        self.assertRaises(ValueError, ('\u0102' * 100).rindex, '\U00100304')
304        self.checkequal(0, '_a' + '\u0102' * 100, 'rindex', '_a')
305        self.checkequal(0, '_a' + '\U00100304' * 100, 'rindex', '_a')
306        self.checkequal(0, '_\u0102' + '\U00100304' * 100, 'rindex', '_\u0102')
307        self.assertRaises(ValueError, ('a' * 100).rindex, '\u0102a')
308        self.assertRaises(ValueError, ('a' * 100).rindex, '\U00100304a')
309        self.assertRaises(ValueError, ('\u0102' * 100).rindex, '\U00100304\u0102')
310
311    def test_maketrans_translate(self):
312        # these work with plain translate()
313        self.checkequalnofix('bbbc', 'abababc', 'translate',
314                             {ord('a'): None})
315        self.checkequalnofix('iiic', 'abababc', 'translate',
316                             {ord('a'): None, ord('b'): ord('i')})
317        self.checkequalnofix('iiix', 'abababc', 'translate',
318                             {ord('a'): None, ord('b'): ord('i'), ord('c'): 'x'})
319        self.checkequalnofix('c', 'abababc', 'translate',
320                             {ord('a'): None, ord('b'): ''})
321        self.checkequalnofix('xyyx', 'xzx', 'translate',
322                             {ord('z'): 'yy'})
323
324        # this needs maketrans()
325        self.checkequalnofix('abababc', 'abababc', 'translate',
326                             {'b': '<i>'})
327        tbl = self.type2test.maketrans({'a': None, 'b': '<i>'})
328        self.checkequalnofix('<i><i><i>c', 'abababc', 'translate', tbl)
329        # test alternative way of calling maketrans()
330        tbl = self.type2test.maketrans('abc', 'xyz', 'd')
331        self.checkequalnofix('xyzzy', 'abdcdcbdddd', 'translate', tbl)
332
333        # various tests switching from ASCII to latin1 or the opposite;
334        # same length, remove a letter, or replace with a longer string.
335        self.assertEqual("[a]".translate(str.maketrans('a', 'X')),
336                         "[X]")
337        self.assertEqual("[a]".translate(str.maketrans({'a': 'X'})),
338                         "[X]")
339        self.assertEqual("[a]".translate(str.maketrans({'a': None})),
340                         "[]")
341        self.assertEqual("[a]".translate(str.maketrans({'a': 'XXX'})),
342                         "[XXX]")
343        self.assertEqual("[a]".translate(str.maketrans({'a': '\xe9'})),
344                         "[\xe9]")
345        self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '123'})),
346                         "x123")
347        self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '\xe9'})),
348                         "x\xe9")
349
350        # test non-ASCII (don't take the fast-path)
351        self.assertEqual("[a]".translate(str.maketrans({'a': '<\xe9>'})),
352                         "[<\xe9>]")
353        self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': 'a'})),
354                         "[a]")
355        self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': None})),
356                         "[]")
357        self.assertEqual("[\xe9]".translate(str.maketrans({'\xe9': '123'})),
358                         "[123]")
359        self.assertEqual("[a\xe9]".translate(str.maketrans({'a': '<\u20ac>'})),
360                         "[<\u20ac>\xe9]")
361
362        # invalid Unicode characters
363        invalid_char = 0x10ffff+1
364        for before in "a\xe9\u20ac\U0010ffff":
365            mapping = str.maketrans({before: invalid_char})
366            text = "[%s]" % before
367            self.assertRaises(ValueError, text.translate, mapping)
368
369        # errors
370        self.assertRaises(TypeError, self.type2test.maketrans)
371        self.assertRaises(ValueError, self.type2test.maketrans, 'abc', 'defg')
372        self.assertRaises(TypeError, self.type2test.maketrans, 2, 'def')
373        self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 2)
374        self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 'def', 2)
375        self.assertRaises(ValueError, self.type2test.maketrans, {'xy': 2})
376        self.assertRaises(TypeError, self.type2test.maketrans, {(1,): 2})
377
378        self.assertRaises(TypeError, 'hello'.translate)
379        self.assertRaises(TypeError, 'abababc'.translate, 'abc', 'xyz')
380
381    def test_split(self):
382        string_tests.CommonTest.test_split(self)
383
384        # test mixed kinds
385        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
386            left *= 9
387            right *= 9
388            for delim in ('c', '\u0102', '\U00010302'):
389                self.checkequal([left + right],
390                                left + right, 'split', delim)
391                self.checkequal([left, right],
392                                left + delim + right, 'split', delim)
393                self.checkequal([left + right],
394                                left + right, 'split', delim * 2)
395                self.checkequal([left, right],
396                                left + delim * 2 + right, 'split', delim *2)
397
398    def test_rsplit(self):
399        string_tests.CommonTest.test_rsplit(self)
400        # test mixed kinds
401        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
402            left *= 9
403            right *= 9
404            for delim in ('c', '\u0102', '\U00010302'):
405                self.checkequal([left + right],
406                                left + right, 'rsplit', delim)
407                self.checkequal([left, right],
408                                left + delim + right, 'rsplit', delim)
409                self.checkequal([left + right],
410                                left + right, 'rsplit', delim * 2)
411                self.checkequal([left, right],
412                                left + delim * 2 + right, 'rsplit', delim *2)
413
414    def test_partition(self):
415        string_tests.MixinStrUnicodeUserStringTest.test_partition(self)
416        # test mixed kinds
417        self.checkequal(('ABCDEFGH', '', ''), 'ABCDEFGH', 'partition', '\u4200')
418        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
419            left *= 9
420            right *= 9
421            for delim in ('c', '\u0102', '\U00010302'):
422                self.checkequal((left + right, '', ''),
423                                left + right, 'partition', delim)
424                self.checkequal((left, delim, right),
425                                left + delim + right, 'partition', delim)
426                self.checkequal((left + right, '', ''),
427                                left + right, 'partition', delim * 2)
428                self.checkequal((left, delim * 2, right),
429                                left + delim * 2 + right, 'partition', delim * 2)
430
431    def test_rpartition(self):
432        string_tests.MixinStrUnicodeUserStringTest.test_rpartition(self)
433        # test mixed kinds
434        self.checkequal(('', '', 'ABCDEFGH'), 'ABCDEFGH', 'rpartition', '\u4200')
435        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
436            left *= 9
437            right *= 9
438            for delim in ('c', '\u0102', '\U00010302'):
439                self.checkequal(('', '', left + right),
440                                left + right, 'rpartition', delim)
441                self.checkequal((left, delim, right),
442                                left + delim + right, 'rpartition', delim)
443                self.checkequal(('', '', left + right),
444                                left + right, 'rpartition', delim * 2)
445                self.checkequal((left, delim * 2, right),
446                                left + delim * 2 + right, 'rpartition', delim * 2)
447
448    def test_join(self):
449        string_tests.MixinStrUnicodeUserStringTest.test_join(self)
450
451        class MyWrapper:
452            def __init__(self, sval): self.sval = sval
453            def __str__(self): return self.sval
454
455        # mixed arguments
456        self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
457        self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
458        self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
459        self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
460        self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
461        self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
462        self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
463        self.checkraises(TypeError, ' ', 'join', ['1', '2', MyWrapper('foo')])
464        self.checkraises(TypeError, ' ', 'join', ['1', '2', '3', bytes()])
465        self.checkraises(TypeError, ' ', 'join', [1, 2, 3])
466        self.checkraises(TypeError, ' ', 'join', ['1', '2', 3])
467
468    @unittest.skipIf(sys.maxsize > 2**32,
469        'needs too much memory on a 64-bit platform')
470    def test_join_overflow(self):
471        size = int(sys.maxsize**0.5) + 1
472        seq = ('A' * size,) * size
473        self.assertRaises(OverflowError, ''.join, seq)
474
475    def test_replace(self):
476        string_tests.CommonTest.test_replace(self)
477
478        # method call forwarded from str implementation because of unicode argument
479        self.checkequalnofix('one@two!three!', 'one!two!three!', 'replace', '!', '@', 1)
480        self.assertRaises(TypeError, 'replace'.replace, "r", 42)
481        # test mixed kinds
482        for left, right in ('ba', '\u0101\u0100', '\U00010301\U00010300'):
483            left *= 9
484            right *= 9
485            for delim in ('c', '\u0102', '\U00010302'):
486                for repl in ('d', '\u0103', '\U00010303'):
487                    self.checkequal(left + right,
488                                    left + right, 'replace', delim, repl)
489                    self.checkequal(left + repl + right,
490                                    left + delim + right,
491                                    'replace', delim, repl)
492                    self.checkequal(left + right,
493                                    left + right, 'replace', delim * 2, repl)
494                    self.checkequal(left + repl + right,
495                                    left + delim * 2 + right,
496                                    'replace', delim * 2, repl)
497
498    @support.cpython_only
499    def test_replace_id(self):
500        pattern = 'abc'
501        text = 'abc def'
502        self.assertIs(text.replace(pattern, pattern), text)
503
504    def test_bytes_comparison(self):
505        with support.check_warnings():
506            warnings.simplefilter('ignore', BytesWarning)
507            self.assertEqual('abc' == b'abc', False)
508            self.assertEqual('abc' != b'abc', True)
509            self.assertEqual('abc' == bytearray(b'abc'), False)
510            self.assertEqual('abc' != bytearray(b'abc'), True)
511
    def test_comparison(self):
        """Check equality and ordering comparisons between str objects."""
        # Comparisons:
        self.assertEqual('abc', 'abc')
        # A string orders after its own proper prefix.
        self.assertTrue('abcd' > 'abc')
        self.assertTrue('abc' < 'abcd')

        # The condition below is constant-false, so nothing inside this
        # block runs; it is parked here until it is moved elsewhere.
        if 0:
            # Move these tests to a Unicode collation module test...
            # Testing UTF-16 code point order comparisons...

            # No surrogates, no fixup required.
            self.assertTrue('\u0061' < '\u20ac')
            # Non surrogate below surrogate value, no fixup required
            self.assertTrue('\u0061' < '\ud800\udc02')

            # Non surrogate above surrogate value, fixup required
            def test_lecmp(s, s2):
                self.assertTrue(s < s2)

            def test_fixup(s):
                s2 = '\ud800\udc01'
                test_lecmp(s, s2)
                s2 = '\ud900\udc01'
                test_lecmp(s, s2)
                s2 = '\uda00\udc01'
                test_lecmp(s, s2)
                s2 = '\udb00\udc01'
                test_lecmp(s, s2)
                s2 = '\ud800\udd01'
                test_lecmp(s, s2)
                s2 = '\ud900\udd01'
                test_lecmp(s, s2)
                s2 = '\uda00\udd01'
                test_lecmp(s, s2)
                s2 = '\udb00\udd01'
                test_lecmp(s, s2)
                s2 = '\ud800\ude01'
                test_lecmp(s, s2)
                s2 = '\ud900\ude01'
                test_lecmp(s, s2)
                s2 = '\uda00\ude01'
                test_lecmp(s, s2)
                s2 = '\udb00\ude01'
                test_lecmp(s, s2)
                s2 = '\ud800\udfff'
                test_lecmp(s, s2)
                s2 = '\ud900\udfff'
                test_lecmp(s, s2)
                s2 = '\uda00\udfff'
                test_lecmp(s, s2)
                s2 = '\udb00\udfff'
                test_lecmp(s, s2)

                # NOTE(review): as indented, these two calls are part of
                # test_fixup's own body, so test_fixup would call itself if
                # the block were ever enabled -- confirm intended placement.
                test_fixup('\ue000')
                test_fixup('\uff61')

        # Surrogates on both sides, no fixup required
        self.assertTrue('\ud800\udc02' < '\ud84d\udc56')
570
571    def test_islower(self):
572        super().test_islower()
573        self.checkequalnofix(False, '\u1FFc', 'islower')
574        self.assertFalse('\u2167'.islower())
575        self.assertTrue('\u2177'.islower())
576        # non-BMP, uppercase
577        self.assertFalse('\U00010401'.islower())
578        self.assertFalse('\U00010427'.islower())
579        # non-BMP, lowercase
580        self.assertTrue('\U00010429'.islower())
581        self.assertTrue('\U0001044E'.islower())
582        # non-BMP, non-cased
583        self.assertFalse('\U0001F40D'.islower())
584        self.assertFalse('\U0001F46F'.islower())
585
586    def test_isupper(self):
587        super().test_isupper()
588        if not sys.platform.startswith('java'):
589            self.checkequalnofix(False, '\u1FFc', 'isupper')
590        self.assertTrue('\u2167'.isupper())
591        self.assertFalse('\u2177'.isupper())
592        # non-BMP, uppercase
593        self.assertTrue('\U00010401'.isupper())
594        self.assertTrue('\U00010427'.isupper())
595        # non-BMP, lowercase
596        self.assertFalse('\U00010429'.isupper())
597        self.assertFalse('\U0001044E'.isupper())
598        # non-BMP, non-cased
599        self.assertFalse('\U0001F40D'.isupper())
600        self.assertFalse('\U0001F46F'.isupper())
601
602    def test_istitle(self):
603        super().test_istitle()
604        self.checkequalnofix(True, '\u1FFc', 'istitle')
605        self.checkequalnofix(True, 'Greek \u1FFcitlecases ...', 'istitle')
606
607        # non-BMP, uppercase + lowercase
608        self.assertTrue('\U00010401\U00010429'.istitle())
609        self.assertTrue('\U00010427\U0001044E'.istitle())
610        # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6
611        for ch in ['\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F']:
612            self.assertFalse(ch.istitle(), '{!a} is not title'.format(ch))
613
614    def test_isspace(self):
615        super().test_isspace()
616        self.checkequalnofix(True, '\u2000', 'isspace')
617        self.checkequalnofix(True, '\u200a', 'isspace')
618        self.checkequalnofix(False, '\u2014', 'isspace')
619        # There are no non-BMP whitespace chars as of Unicode 12.
620        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
621                   '\U0001F40D', '\U0001F46F']:
622            self.assertFalse(ch.isspace(), '{!a} is not space.'.format(ch))
623
624    @support.requires_resource('cpu')
625    def test_isspace_invariant(self):
626        for codepoint in range(sys.maxunicode + 1):
627            char = chr(codepoint)
628            bidirectional = unicodedata.bidirectional(char)
629            category = unicodedata.category(char)
630            self.assertEqual(char.isspace(),
631                             (bidirectional in ('WS', 'B', 'S')
632                              or category == 'Zs'))
633
634    def test_isalnum(self):
635        super().test_isalnum()
636        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
637                   '\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']:
638            self.assertTrue(ch.isalnum(), '{!a} is alnum.'.format(ch))
639
640    def test_isalpha(self):
641        super().test_isalpha()
642        self.checkequalnofix(True, '\u1FFc', 'isalpha')
643        # non-BMP, cased
644        self.assertTrue('\U00010401'.isalpha())
645        self.assertTrue('\U00010427'.isalpha())
646        self.assertTrue('\U00010429'.isalpha())
647        self.assertTrue('\U0001044E'.isalpha())
648        # non-BMP, non-cased
649        self.assertFalse('\U0001F40D'.isalpha())
650        self.assertFalse('\U0001F46F'.isalpha())
651
652    def test_isascii(self):
653        super().test_isascii()
654        self.assertFalse("\u20ac".isascii())
655        self.assertFalse("\U0010ffff".isascii())
656
657    def test_isdecimal(self):
658        self.checkequalnofix(False, '', 'isdecimal')
659        self.checkequalnofix(False, 'a', 'isdecimal')
660        self.checkequalnofix(True, '0', 'isdecimal')
661        self.checkequalnofix(False, '\u2460', 'isdecimal') # CIRCLED DIGIT ONE
662        self.checkequalnofix(False, '\xbc', 'isdecimal') # VULGAR FRACTION ONE QUARTER
663        self.checkequalnofix(True, '\u0660', 'isdecimal') # ARABIC-INDIC DIGIT ZERO
664        self.checkequalnofix(True, '0123456789', 'isdecimal')
665        self.checkequalnofix(False, '0123456789a', 'isdecimal')
666
667        self.checkraises(TypeError, 'abc', 'isdecimal', 42)
668
669        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
670                   '\U0001F40D', '\U0001F46F', '\U00011065', '\U0001F107']:
671            self.assertFalse(ch.isdecimal(), '{!a} is not decimal.'.format(ch))
672        for ch in ['\U0001D7F6', '\U00011066', '\U000104A0']:
673            self.assertTrue(ch.isdecimal(), '{!a} is decimal.'.format(ch))
674
675    def test_isdigit(self):
676        super().test_isdigit()
677        self.checkequalnofix(True, '\u2460', 'isdigit')
678        self.checkequalnofix(False, '\xbc', 'isdigit')
679        self.checkequalnofix(True, '\u0660', 'isdigit')
680
681        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
682                   '\U0001F40D', '\U0001F46F', '\U00011065']:
683            self.assertFalse(ch.isdigit(), '{!a} is not a digit.'.format(ch))
684        for ch in ['\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']:
685            self.assertTrue(ch.isdigit(), '{!a} is a digit.'.format(ch))
686
687    def test_isnumeric(self):
688        self.checkequalnofix(False, '', 'isnumeric')
689        self.checkequalnofix(False, 'a', 'isnumeric')
690        self.checkequalnofix(True, '0', 'isnumeric')
691        self.checkequalnofix(True, '\u2460', 'isnumeric')
692        self.checkequalnofix(True, '\xbc', 'isnumeric')
693        self.checkequalnofix(True, '\u0660', 'isnumeric')
694        self.checkequalnofix(True, '0123456789', 'isnumeric')
695        self.checkequalnofix(False, '0123456789a', 'isnumeric')
696
697        self.assertRaises(TypeError, "abc".isnumeric, 42)
698
699        for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
700                   '\U0001F40D', '\U0001F46F']:
701            self.assertFalse(ch.isnumeric(), '{!a} is not numeric.'.format(ch))
702        for ch in ['\U00011065', '\U0001D7F6', '\U00011066',
703                   '\U000104A0', '\U0001F107']:
704            self.assertTrue(ch.isnumeric(), '{!a} is numeric.'.format(ch))
705
706    def test_isidentifier(self):
707        self.assertTrue("a".isidentifier())
708        self.assertTrue("Z".isidentifier())
709        self.assertTrue("_".isidentifier())
710        self.assertTrue("b0".isidentifier())
711        self.assertTrue("bc".isidentifier())
712        self.assertTrue("b_".isidentifier())
713        self.assertTrue("µ".isidentifier())
714        self.assertTrue("��������������".isidentifier())
715
716        self.assertFalse(" ".isidentifier())
717        self.assertFalse("[".isidentifier())
718        self.assertFalse("©".isidentifier())
719        self.assertFalse("0".isidentifier())
720
721    def test_isprintable(self):
722        self.assertTrue("".isprintable())
723        self.assertTrue(" ".isprintable())
724        self.assertTrue("abcdefg".isprintable())
725        self.assertFalse("abcdefg\n".isprintable())
726        # some defined Unicode character
727        self.assertTrue("\u0374".isprintable())
728        # undefined character
729        self.assertFalse("\u0378".isprintable())
730        # single surrogate character
731        self.assertFalse("\ud800".isprintable())
732
733        self.assertTrue('\U0001F46F'.isprintable())
734        self.assertFalse('\U000E0020'.isprintable())
735
736    def test_surrogates(self):
737        for s in ('a\uD800b\uDFFF', 'a\uDFFFb\uD800',
738                  'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
739            self.assertTrue(s.islower())
740            self.assertFalse(s.isupper())
741            self.assertFalse(s.istitle())
742        for s in ('A\uD800B\uDFFF', 'A\uDFFFB\uD800',
743                  'A\uD800B\uDFFFA', 'A\uDFFFB\uD800A'):
744            self.assertFalse(s.islower())
745            self.assertTrue(s.isupper())
746            self.assertTrue(s.istitle())
747
748        for meth_name in ('islower', 'isupper', 'istitle'):
749            meth = getattr(str, meth_name)
750            for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF'):
751                self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
752
753        for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace',
754                          'isdecimal', 'isnumeric',
755                          'isidentifier', 'isprintable'):
756            meth = getattr(str, meth_name)
757            for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF',
758                      'a\uD800b\uDFFF', 'a\uDFFFb\uD800',
759                      'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
760                self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
761
762
763    def test_lower(self):
764        string_tests.CommonTest.test_lower(self)
765        self.assertEqual('\U00010427'.lower(), '\U0001044F')
766        self.assertEqual('\U00010427\U00010427'.lower(),
767                         '\U0001044F\U0001044F')
768        self.assertEqual('\U00010427\U0001044F'.lower(),
769                         '\U0001044F\U0001044F')
770        self.assertEqual('X\U00010427x\U0001044F'.lower(),
771                         'x\U0001044Fx\U0001044F')
772        self.assertEqual('fi'.lower(), 'fi')
773        self.assertEqual('\u0130'.lower(), '\u0069\u0307')
774        # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
775        self.assertEqual('\u03a3'.lower(), '\u03c3')
776        self.assertEqual('\u0345\u03a3'.lower(), '\u0345\u03c3')
777        self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
778        self.assertEqual('A\u0345\u03a3a'.lower(), 'a\u0345\u03c3a')
779        self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
780        self.assertEqual('A\u03a3\u0345'.lower(), 'a\u03c2\u0345')
781        self.assertEqual('\u03a3\u0345 '.lower(), '\u03c3\u0345 ')
782        self.assertEqual('\U0008fffe'.lower(), '\U0008fffe')
783        self.assertEqual('\u2177'.lower(), '\u2177')
784
785    def test_casefold(self):
786        self.assertEqual('hello'.casefold(), 'hello')
787        self.assertEqual('hELlo'.casefold(), 'hello')
788        self.assertEqual('ß'.casefold(), 'ss')
789        self.assertEqual('fi'.casefold(), 'fi')
790        self.assertEqual('\u03a3'.casefold(), '\u03c3')
791        self.assertEqual('A\u0345\u03a3'.casefold(), 'a\u03b9\u03c3')
792        self.assertEqual('\u00b5'.casefold(), '\u03bc')
793
794    def test_upper(self):
795        string_tests.CommonTest.test_upper(self)
796        self.assertEqual('\U0001044F'.upper(), '\U00010427')
797        self.assertEqual('\U0001044F\U0001044F'.upper(),
798                         '\U00010427\U00010427')
799        self.assertEqual('\U00010427\U0001044F'.upper(),
800                         '\U00010427\U00010427')
801        self.assertEqual('X\U00010427x\U0001044F'.upper(),
802                         'X\U00010427X\U00010427')
803        self.assertEqual('fi'.upper(), 'FI')
804        self.assertEqual('\u0130'.upper(), '\u0130')
805        self.assertEqual('\u03a3'.upper(), '\u03a3')
806        self.assertEqual('ß'.upper(), 'SS')
807        self.assertEqual('\u1fd2'.upper(), '\u0399\u0308\u0300')
808        self.assertEqual('\U0008fffe'.upper(), '\U0008fffe')
809        self.assertEqual('\u2177'.upper(), '\u2167')
810
811    def test_capitalize(self):
812        string_tests.CommonTest.test_capitalize(self)
813        self.assertEqual('\U0001044F'.capitalize(), '\U00010427')
814        self.assertEqual('\U0001044F\U0001044F'.capitalize(),
815                         '\U00010427\U0001044F')
816        self.assertEqual('\U00010427\U0001044F'.capitalize(),
817                         '\U00010427\U0001044F')
818        self.assertEqual('\U0001044F\U00010427'.capitalize(),
819                         '\U00010427\U0001044F')
820        self.assertEqual('X\U00010427x\U0001044F'.capitalize(),
821                         'X\U0001044Fx\U0001044F')
822        self.assertEqual('h\u0130'.capitalize(), 'H\u0069\u0307')
823        exp = '\u0399\u0308\u0300\u0069\u0307'
824        self.assertEqual('\u1fd2\u0130'.capitalize(), exp)
825        self.assertEqual('finnish'.capitalize(), 'Finnish')
826        self.assertEqual('A\u0345\u03a3'.capitalize(), 'A\u0345\u03c2')
827
828    def test_title(self):
829        super().test_title()
830        self.assertEqual('\U0001044F'.title(), '\U00010427')
831        self.assertEqual('\U0001044F\U0001044F'.title(),
832                         '\U00010427\U0001044F')
833        self.assertEqual('\U0001044F\U0001044F \U0001044F\U0001044F'.title(),
834                         '\U00010427\U0001044F \U00010427\U0001044F')
835        self.assertEqual('\U00010427\U0001044F \U00010427\U0001044F'.title(),
836                         '\U00010427\U0001044F \U00010427\U0001044F')
837        self.assertEqual('\U0001044F\U00010427 \U0001044F\U00010427'.title(),
838                         '\U00010427\U0001044F \U00010427\U0001044F')
839        self.assertEqual('X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(),
840                         'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F')
841        self.assertEqual('fiNNISH'.title(), 'Finnish')
842        self.assertEqual('A\u03a3 \u1fa1xy'.title(), 'A\u03c2 \u1fa9xy')
843        self.assertEqual('A\u03a3A'.title(), 'A\u03c3a')
844
845    def test_swapcase(self):
846        string_tests.CommonTest.test_swapcase(self)
847        self.assertEqual('\U0001044F'.swapcase(), '\U00010427')
848        self.assertEqual('\U00010427'.swapcase(), '\U0001044F')
849        self.assertEqual('\U0001044F\U0001044F'.swapcase(),
850                         '\U00010427\U00010427')
851        self.assertEqual('\U00010427\U0001044F'.swapcase(),
852                         '\U0001044F\U00010427')
853        self.assertEqual('\U0001044F\U00010427'.swapcase(),
854                         '\U00010427\U0001044F')
855        self.assertEqual('X\U00010427x\U0001044F'.swapcase(),
856                         'x\U0001044FX\U00010427')
857        self.assertEqual('fi'.swapcase(), 'FI')
858        self.assertEqual('\u0130'.swapcase(), '\u0069\u0307')
859        # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
860        self.assertEqual('\u03a3'.swapcase(), '\u03c3')
861        self.assertEqual('\u0345\u03a3'.swapcase(), '\u0399\u03c3')
862        self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
863        self.assertEqual('A\u0345\u03a3a'.swapcase(), 'a\u0399\u03c3A')
864        self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
865        self.assertEqual('A\u03a3\u0345'.swapcase(), 'a\u03c2\u0399')
866        self.assertEqual('\u03a3\u0345 '.swapcase(), '\u03c3\u0399 ')
867        self.assertEqual('\u03a3'.swapcase(), '\u03c3')
868        self.assertEqual('ß'.swapcase(), 'SS')
869        self.assertEqual('\u1fd2'.swapcase(), '\u0399\u0308\u0300')
870
871    def test_center(self):
872        string_tests.CommonTest.test_center(self)
873        self.assertEqual('x'.center(2, '\U0010FFFF'),
874                         'x\U0010FFFF')
875        self.assertEqual('x'.center(3, '\U0010FFFF'),
876                         '\U0010FFFFx\U0010FFFF')
877        self.assertEqual('x'.center(4, '\U0010FFFF'),
878                         '\U0010FFFFx\U0010FFFF\U0010FFFF')
879
880    @unittest.skipUnless(sys.maxsize == 2**31 - 1, "requires 32-bit system")
881    @support.cpython_only
882    def test_case_operation_overflow(self):
883        # Issue #22643
884        size = 2**32//12 + 1
885        try:
886            s = "ü" * size
887        except MemoryError:
888            self.skipTest('no enough memory (%.0f MiB required)' % (size / 2**20))
889        try:
890            self.assertRaises(OverflowError, s.upper)
891        finally:
892            del s
893
894    def test_contains(self):
895        # Testing Unicode contains method
896        self.assertIn('a', 'abdb')
897        self.assertIn('a', 'bdab')
898        self.assertIn('a', 'bdaba')
899        self.assertIn('a', 'bdba')
900        self.assertNotIn('a', 'bdb')
901        self.assertIn('a', 'bdba')
902        self.assertIn('a', ('a',1,None))
903        self.assertIn('a', (1,None,'a'))
904        self.assertIn('a', ('a',1,None))
905        self.assertIn('a', (1,None,'a'))
906        self.assertNotIn('a', ('x',1,'y'))
907        self.assertNotIn('a', ('x',1,None))
908        self.assertNotIn('abcd', 'abcxxxx')
909        self.assertIn('ab', 'abcd')
910        self.assertIn('ab', 'abc')
911        self.assertIn('ab', (1,None,'ab'))
912        self.assertIn('', 'abc')
913        self.assertIn('', '')
914        self.assertIn('', 'abc')
915        self.assertNotIn('\0', 'abc')
916        self.assertIn('\0', '\0abc')
917        self.assertIn('\0', 'abc\0')
918        self.assertIn('a', '\0abc')
919        self.assertIn('asdf', 'asdf')
920        self.assertNotIn('asdf', 'asd')
921        self.assertNotIn('asdf', '')
922
923        self.assertRaises(TypeError, "abc".__contains__)
924        # test mixed kinds
925        for fill in ('a', '\u0100', '\U00010300'):
926            fill *= 9
927            for delim in ('c', '\u0102', '\U00010302'):
928                self.assertNotIn(delim, fill)
929                self.assertIn(delim, fill + delim)
930                self.assertNotIn(delim * 2, fill)
931                self.assertIn(delim * 2, fill + delim * 2)
932
933    def test_issue18183(self):
934        '\U00010000\U00100000'.lower()
935        '\U00010000\U00100000'.casefold()
936        '\U00010000\U00100000'.upper()
937        '\U00010000\U00100000'.capitalize()
938        '\U00010000\U00100000'.title()
939        '\U00010000\U00100000'.swapcase()
940        '\U00100000'.center(3, '\U00010000')
941        '\U00100000'.ljust(3, '\U00010000')
942        '\U00100000'.rjust(3, '\U00010000')
943
    def test_format(self):
        # Exercises str.format(): literal text and brace escaping, field
        # lookup by index/attribute/key, format specs (width, precision,
        # alignment, fill), !r/!s/!a conversions, nested specs, and the
        # exceptions raised for malformed templates.
        self.assertEqual(''.format(), '')
        self.assertEqual('a'.format(), 'a')
        self.assertEqual('ab'.format(), 'ab')
        self.assertEqual('a{{'.format(), 'a{')
        self.assertEqual('a}}'.format(), 'a}')
        self.assertEqual('{{b'.format(), '{b')
        self.assertEqual('}}b'.format(), '}b')
        self.assertEqual('a{{b'.format(), 'a{b')

        # examples from the PEP:
        import datetime
        self.assertEqual("My name is {0}".format('Fred'), "My name is Fred")
        self.assertEqual("My name is {0[name]}".format(dict(name='Fred')),
                         "My name is Fred")
        self.assertEqual("My name is {0} :-{{}}".format('Fred'),
                         "My name is Fred :-{}")

        d = datetime.date(2007, 8, 18)
        self.assertEqual("The year is {0.year}".format(d),
                         "The year is 2007")

        # classes we'll use for testing
        # C: __format__ returns the format spec itself.
        class C:
            def __init__(self, x=100):
                self._x = x
            def __format__(self, spec):
                return spec

        # D: __format__ ignores the spec and returns str(self.x).
        class D:
            def __init__(self, x):
                self.x = x
            def __format__(self, spec):
                return str(self.x)

        # class with __str__, but no __format__
        class E:
            def __init__(self, x):
                self.x = x
            def __str__(self):
                return 'E(' + self.x + ')'

        # class with __repr__, but no __format__ or __str__
        class F:
            def __init__(self, x):
                self.x = x
            def __repr__(self):
                return 'F(' + self.x + ')'

        # class with __format__ that forwards to string, for some format_spec's
        class G:
            def __init__(self, x):
                self.x = x
            def __str__(self):
                return "string is " + self.x
            def __format__(self, format_spec):
                if format_spec == 'd':
                    return 'G(' + self.x + ')'
                return object.__format__(self, format_spec)

        # I: date subclass whose __format__ passes the spec to strftime().
        class I(datetime.date):
            def __format__(self, format_spec):
                return self.strftime(format_spec)

        # J: int subclass that formats twice its own value.
        class J(int):
            def __format__(self, format_spec):
                return int.__format__(self * 2, format_spec)

        # M: has __repr__ but sets __str__ to None (see "Blocking fallback").
        class M:
            def __init__(self, x):
                self.x = x
            def __repr__(self):
                return 'M(' + self.x + ')'
            __str__ = None

        # N: has __repr__ but sets __format__ to None (see "Blocking fallback").
        class N:
            def __init__(self, x):
                self.x = x
            def __repr__(self):
                return 'N(' + self.x + ')'
            __format__ = None

        self.assertEqual(''.format(), '')
        self.assertEqual('abc'.format(), 'abc')
        self.assertEqual('{0}'.format('abc'), 'abc')
        self.assertEqual('{0:}'.format('abc'), 'abc')
#        self.assertEqual('{ 0 }'.format('abc'), 'abc')
        self.assertEqual('X{0}'.format('abc'), 'Xabc')
        self.assertEqual('{0}X'.format('abc'), 'abcX')
        self.assertEqual('X{0}Y'.format('abc'), 'XabcY')
        self.assertEqual('{1}'.format(1, 'abc'), 'abc')
        self.assertEqual('X{1}'.format(1, 'abc'), 'Xabc')
        self.assertEqual('{1}X'.format(1, 'abc'), 'abcX')
        self.assertEqual('X{1}Y'.format(1, 'abc'), 'XabcY')
        self.assertEqual('{0}'.format(-15), '-15')
        self.assertEqual('{0}{1}'.format(-15, 'abc'), '-15abc')
        self.assertEqual('{0}X{1}'.format(-15, 'abc'), '-15Xabc')
        self.assertEqual('{{'.format(), '{')
        self.assertEqual('}}'.format(), '}')
        self.assertEqual('{{}}'.format(), '{}')
        self.assertEqual('{{x}}'.format(), '{x}')
        self.assertEqual('{{{0}}}'.format(123), '{123}')
        self.assertEqual('{{{{0}}}}'.format(), '{{0}}')
        self.assertEqual('}}{{'.format(), '}{')
        self.assertEqual('}}x{{'.format(), '}x{')

        # weird field names
        self.assertEqual("{0[foo-bar]}".format({'foo-bar':'baz'}), 'baz')
        self.assertEqual("{0[foo bar]}".format({'foo bar':'baz'}), 'baz')
        self.assertEqual("{0[ ]}".format({' ':3}), '3')

        # attribute and index lookups, including chained ones
        self.assertEqual('{foo._x}'.format(foo=C(20)), '20')
        self.assertEqual('{1}{0}'.format(D(10), D(20)), '2010')
        self.assertEqual('{0._x.x}'.format(C(D('abc'))), 'abc')
        self.assertEqual('{0[0]}'.format(['abc', 'def']), 'abc')
        self.assertEqual('{0[1]}'.format(['abc', 'def']), 'def')
        self.assertEqual('{0[1][0]}'.format(['abc', ['def']]), 'def')
        self.assertEqual('{0[1][0].x}'.format(['abc', [D('def')]]), 'def')

        # strings
        self.assertEqual('{0:.3s}'.format('abc'), 'abc')
        self.assertEqual('{0:.3s}'.format('ab'), 'ab')
        self.assertEqual('{0:.3s}'.format('abcdef'), 'abc')
        self.assertEqual('{0:.0s}'.format('abcdef'), '')
        self.assertEqual('{0:3.3s}'.format('abc'), 'abc')
        self.assertEqual('{0:2.3s}'.format('abc'), 'abc')
        self.assertEqual('{0:2.2s}'.format('abc'), 'ab')
        self.assertEqual('{0:3.2s}'.format('abc'), 'ab ')
        self.assertEqual('{0:x<0s}'.format('result'), 'result')
        self.assertEqual('{0:x<5s}'.format('result'), 'result')
        self.assertEqual('{0:x<6s}'.format('result'), 'result')
        self.assertEqual('{0:x<7s}'.format('result'), 'resultx')
        self.assertEqual('{0:x<8s}'.format('result'), 'resultxx')
        self.assertEqual('{0: <7s}'.format('result'), 'result ')
        self.assertEqual('{0:<7s}'.format('result'), 'result ')
        self.assertEqual('{0:>7s}'.format('result'), ' result')
        self.assertEqual('{0:>8s}'.format('result'), '  result')
        self.assertEqual('{0:^8s}'.format('result'), ' result ')
        self.assertEqual('{0:^9s}'.format('result'), ' result  ')
        self.assertEqual('{0:^10s}'.format('result'), '  result  ')
        self.assertEqual('{0:10000}'.format('a'), 'a' + ' ' * 9999)
        self.assertEqual('{0:10000}'.format(''), ' ' * 10000)
        self.assertEqual('{0:10000000}'.format(''), ' ' * 10000000)

        # issue 12546: use \x00 as a fill character
        self.assertEqual('{0:\x00<6s}'.format('foo'), 'foo\x00\x00\x00')
        self.assertEqual('{0:\x01<6s}'.format('foo'), 'foo\x01\x01\x01')
        self.assertEqual('{0:\x00^6s}'.format('foo'), '\x00foo\x00\x00')
        self.assertEqual('{0:^6s}'.format('foo'), ' foo  ')

        self.assertEqual('{0:\x00<6}'.format(3), '3\x00\x00\x00\x00\x00')
        self.assertEqual('{0:\x01<6}'.format(3), '3\x01\x01\x01\x01\x01')
        self.assertEqual('{0:\x00^6}'.format(3), '\x00\x003\x00\x00\x00')
        self.assertEqual('{0:<6}'.format(3), '3     ')

        self.assertEqual('{0:\x00<6}'.format(3.14), '3.14\x00\x00')
        self.assertEqual('{0:\x01<6}'.format(3.14), '3.14\x01\x01')
        self.assertEqual('{0:\x00^6}'.format(3.14), '\x003.14\x00')
        self.assertEqual('{0:^6}'.format(3.14), ' 3.14 ')

        self.assertEqual('{0:\x00<12}'.format(3+2.0j), '(3+2j)\x00\x00\x00\x00\x00\x00')
        self.assertEqual('{0:\x01<12}'.format(3+2.0j), '(3+2j)\x01\x01\x01\x01\x01\x01')
        self.assertEqual('{0:\x00^12}'.format(3+2.0j), '\x00\x00\x00(3+2j)\x00\x00\x00')
        self.assertEqual('{0:^12}'.format(3+2.0j), '   (3+2j)   ')

        # format specifiers for user defined type
        self.assertEqual('{0:abc}'.format(C()), 'abc')

        # !r, !s and !a coercions
        self.assertEqual('{0!s}'.format('Hello'), 'Hello')
        self.assertEqual('{0!s:}'.format('Hello'), 'Hello')
        self.assertEqual('{0!s:15}'.format('Hello'), 'Hello          ')
        self.assertEqual('{0!s:15s}'.format('Hello'), 'Hello          ')
        self.assertEqual('{0!r}'.format('Hello'), "'Hello'")
        self.assertEqual('{0!r:}'.format('Hello'), "'Hello'")
        self.assertEqual('{0!r}'.format(F('Hello')), 'F(Hello)')
        self.assertEqual('{0!r}'.format('\u0378'), "'\\u0378'") # nonprintable
        self.assertEqual('{0!r}'.format('\u0374'), "'\u0374'")  # printable
        self.assertEqual('{0!r}'.format(F('\u0374')), 'F(\u0374)')
        self.assertEqual('{0!a}'.format('Hello'), "'Hello'")
        self.assertEqual('{0!a}'.format('\u0378'), "'\\u0378'") # nonprintable
        self.assertEqual('{0!a}'.format('\u0374'), "'\\u0374'") # printable
        self.assertEqual('{0!a:}'.format('Hello'), "'Hello'")
        self.assertEqual('{0!a}'.format(F('Hello')), 'F(Hello)')
        self.assertEqual('{0!a}'.format(F('\u0374')), 'F(\\u0374)')

        # test fallback to object.__format__
        self.assertEqual('{0}'.format({}), '{}')
        self.assertEqual('{0}'.format([]), '[]')
        self.assertEqual('{0}'.format([1]), '[1]')

        self.assertEqual('{0:d}'.format(G('data')), 'G(data)')
        self.assertEqual('{0!s}'.format(G('data')), 'string is data')

        # a non-empty format spec reaching object.__format__ is a TypeError
        self.assertRaises(TypeError, '{0:^10}'.format, E('data'))
        self.assertRaises(TypeError, '{0:^10s}'.format, E('data'))
        self.assertRaises(TypeError, '{0:>15s}'.format, G('data'))

        self.assertEqual("{0:date: %Y-%m-%d}".format(I(year=2007,
                                                       month=8,
                                                       day=27)),
                         "date: 2007-08-27")

        # test deriving from a builtin type and overriding __format__
        self.assertEqual("{0}".format(J(10)), "20")


        # string format specifiers
        self.assertEqual('{0:}'.format('a'), 'a')

        # computed format specifiers
        self.assertEqual("{0:.{1}}".format('hello world', 5), 'hello')
        self.assertEqual("{0:.{1}s}".format('hello world', 5), 'hello')
        self.assertEqual("{0:.{precision}s}".format('hello world', precision=5), 'hello')
        self.assertEqual("{0:{width}.{precision}s}".format('hello world', width=10, precision=5), 'hello     ')
        self.assertEqual("{0:{width}.{precision}s}".format('hello world', width='10', precision='5'), 'hello     ')

        # test various errors
        self.assertRaises(ValueError, '{'.format)
        self.assertRaises(ValueError, '}'.format)
        self.assertRaises(ValueError, 'a{'.format)
        self.assertRaises(ValueError, 'a}'.format)
        self.assertRaises(ValueError, '{a'.format)
        self.assertRaises(ValueError, '}a'.format)
        self.assertRaises(IndexError, '{0}'.format)
        self.assertRaises(IndexError, '{1}'.format, 'abc')
        self.assertRaises(KeyError,   '{x}'.format)
        self.assertRaises(ValueError, "}{".format)
        self.assertRaises(ValueError, "abc{0:{}".format)
        self.assertRaises(ValueError, "{0".format)
        self.assertRaises(IndexError, "{0.}".format)
        self.assertRaises(ValueError, "{0.}".format, 0)
        self.assertRaises(ValueError, "{0[}".format)
        self.assertRaises(ValueError, "{0[}".format, [])
        self.assertRaises(KeyError,   "{0]}".format)
        self.assertRaises(ValueError, "{0.[]}".format, 0)
        self.assertRaises(ValueError, "{0..foo}".format, 0)
        self.assertRaises(ValueError, "{0[0}".format, 0)
        self.assertRaises(ValueError, "{0[0:foo}".format, 0)
        self.assertRaises(KeyError,   "{c]}".format)
        self.assertRaises(ValueError, "{{ {{{0}}".format, 0)
        self.assertRaises(ValueError, "{0}}".format, 0)
        self.assertRaises(KeyError,   "{foo}".format, bar=3)
        self.assertRaises(ValueError, "{0!x}".format, 3)
        self.assertRaises(ValueError, "{0!}".format, 0)
        self.assertRaises(ValueError, "{0!rs}".format, 0)
        self.assertRaises(ValueError, "{!}".format)
        self.assertRaises(IndexError, "{:}".format)
        self.assertRaises(IndexError, "{:s}".format)
        self.assertRaises(IndexError, "{}".format)
        # an index too large to be used as a field number or subscript
        big = "23098475029384702983476098230754973209482573"
        self.assertRaises(ValueError, ("{" + big + "}").format)
        self.assertRaises(ValueError, ("{[" + big + "]}").format, [0])

        # issue 6089
        self.assertRaises(ValueError, "{0[0]x}".format, [None])
        self.assertRaises(ValueError, "{0[0](10)}".format, [None])

        # can't have a replacement on the field name portion
        self.assertRaises(TypeError, '{0[{1}]}'.format, 'abcdefg', 4)

        # exceed maximum recursion depth
        self.assertRaises(ValueError, "{0:{1:{2}}}".format, 'abc', 's', '')
        self.assertRaises(ValueError, "{0:{1:{2:{3:{4:{5:{6}}}}}}}".format,
                          0, 1, 2, 3, 4, 5, 6, 7)

        # string format spec errors
        self.assertRaises(ValueError, "{0:-s}".format, '')
        self.assertRaises(ValueError, format, "", "-")
        self.assertRaises(ValueError, "{0:=s}".format, '')

        # Alternate formatting is not supported
        self.assertRaises(ValueError, format, '', '#')
        self.assertRaises(ValueError, format, '', '#20')

        # Non-ASCII
        self.assertEqual("{0:s}{1:s}".format("ABC", "\u0410\u0411\u0412"),
                         'ABC\u0410\u0411\u0412')
        self.assertEqual("{0:.3s}".format("ABC\u0410\u0411\u0412"),
                         'ABC')
        self.assertEqual("{0:.0s}".format("ABC\u0410\u0411\u0412"),
                         '')

        # braces, brackets and '!' used as subscript keys
        self.assertEqual("{[{}]}".format({"{}": 5}), "5")
        self.assertEqual("{[{}]}".format({"{}" : "a"}), "a")
        self.assertEqual("{[{]}".format({"{" : "a"}), "a")
        self.assertEqual("{[}]}".format({"}" : "a"}), "a")
        self.assertEqual("{[[]}".format({"[" : "a"}), "a")
        self.assertEqual("{[!]}".format({"!" : "a"}), "a")
        self.assertRaises(ValueError, "{a{}b}".format, 42)
        self.assertRaises(ValueError, "{a{b}".format, 42)
        self.assertRaises(ValueError, "{[}".format, 42)

        # auto-numbered field nested inside a format spec
        self.assertEqual("0x{:0{:d}X}".format(0x0,16), "0x0000000000000000")

        # Blocking fallback
        m = M('data')
        self.assertEqual("{!r}".format(m), 'M(data)')
        self.assertRaises(TypeError, "{!s}".format, m)
        self.assertRaises(TypeError, "{}".format, m)
        n = N('data')
        self.assertEqual("{!r}".format(n), 'N(data)')
        self.assertEqual("{!s}".format(n), 'N(data)')
        self.assertRaises(TypeError, "{}".format, n)
1248
1249    def test_format_map(self):
1250        self.assertEqual(''.format_map({}), '')
1251        self.assertEqual('a'.format_map({}), 'a')
1252        self.assertEqual('ab'.format_map({}), 'ab')
1253        self.assertEqual('a{{'.format_map({}), 'a{')
1254        self.assertEqual('a}}'.format_map({}), 'a}')
1255        self.assertEqual('{{b'.format_map({}), '{b')
1256        self.assertEqual('}}b'.format_map({}), '}b')
1257        self.assertEqual('a{{b'.format_map({}), 'a{b')
1258
1259        # using mappings
1260        class Mapping(dict):
1261            def __missing__(self, key):
1262                return key
1263        self.assertEqual('{hello}'.format_map(Mapping()), 'hello')
1264        self.assertEqual('{a} {world}'.format_map(Mapping(a='hello')), 'hello world')
1265
1266        class InternalMapping:
1267            def __init__(self):
1268                self.mapping = {'a': 'hello'}
1269            def __getitem__(self, key):
1270                return self.mapping[key]
1271        self.assertEqual('{a}'.format_map(InternalMapping()), 'hello')
1272
1273
1274        class C:
1275            def __init__(self, x=100):
1276                self._x = x
1277            def __format__(self, spec):
1278                return spec
1279        self.assertEqual('{foo._x}'.format_map({'foo': C(20)}), '20')
1280
1281        # test various errors
1282        self.assertRaises(TypeError, ''.format_map)
1283        self.assertRaises(TypeError, 'a'.format_map)
1284
1285        self.assertRaises(ValueError, '{'.format_map, {})
1286        self.assertRaises(ValueError, '}'.format_map, {})
1287        self.assertRaises(ValueError, 'a{'.format_map, {})
1288        self.assertRaises(ValueError, 'a}'.format_map, {})
1289        self.assertRaises(ValueError, '{a'.format_map, {})
1290        self.assertRaises(ValueError, '}a'.format_map, {})
1291
1292        # issue #12579: can't supply positional params to format_map
1293        self.assertRaises(ValueError, '{}'.format_map, {'a' : 2})
1294        self.assertRaises(ValueError, '{}'.format_map, 'a')
1295        self.assertRaises(ValueError, '{a} {}'.format_map, {"a" : 2, "b" : 1})
1296
1297        class BadMapping:
1298            def __getitem__(self, key):
1299                return 1/0
1300        self.assertRaises(KeyError, '{a}'.format_map, {})
1301        self.assertRaises(TypeError, '{a}'.format_map, [])
1302        self.assertRaises(ZeroDivisionError, '{a}'.format_map, BadMapping())
1303
1304    def test_format_huge_precision(self):
1305        format_string = ".{}f".format(sys.maxsize + 1)
1306        with self.assertRaises(ValueError):
1307            result = format(2.34, format_string)
1308
1309    def test_format_huge_width(self):
1310        format_string = "{}f".format(sys.maxsize + 1)
1311        with self.assertRaises(ValueError):
1312            result = format(2.34, format_string)
1313
1314    def test_format_huge_item_number(self):
1315        format_string = "{{{}:.6f}}".format(sys.maxsize + 1)
1316        with self.assertRaises(ValueError):
1317            result = format_string.format(2.34)
1318
1319    def test_format_auto_numbering(self):
1320        class C:
1321            def __init__(self, x=100):
1322                self._x = x
1323            def __format__(self, spec):
1324                return spec
1325
1326        self.assertEqual('{}'.format(10), '10')
1327        self.assertEqual('{:5}'.format('s'), 's    ')
1328        self.assertEqual('{!r}'.format('s'), "'s'")
1329        self.assertEqual('{._x}'.format(C(10)), '10')
1330        self.assertEqual('{[1]}'.format([1, 2]), '2')
1331        self.assertEqual('{[a]}'.format({'a':4, 'b':2}), '4')
1332        self.assertEqual('a{}b{}c'.format(0, 1), 'a0b1c')
1333
1334        self.assertEqual('a{:{}}b'.format('x', '^10'), 'a    x     b')
1335        self.assertEqual('a{:{}x}b'.format(20, '#'), 'a0x14b')
1336
1337        # can't mix and match numbering and auto-numbering
1338        self.assertRaises(ValueError, '{}{1}'.format, 1, 2)
1339        self.assertRaises(ValueError, '{1}{}'.format, 1, 2)
1340        self.assertRaises(ValueError, '{:{1}}'.format, 1, 2)
1341        self.assertRaises(ValueError, '{0:{}}'.format, 1, 2)
1342
1343        # can mix and match auto-numbering and named
1344        self.assertEqual('{f}{}'.format(4, f='test'), 'test4')
1345        self.assertEqual('{}{f}'.format(4, f='test'), '4test')
1346        self.assertEqual('{:{f}}{g}{}'.format(1, 3, g='g', f=2), ' 1g3')
1347        self.assertEqual('{f:{}}{}{g}'.format(2, 4, f=1, g='g'), ' 14g')
1348
1349    def test_formatting(self):
1350        string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
1351        # Testing Unicode formatting strings...
1352        self.assertEqual("%s, %s" % ("abc", "abc"), 'abc, abc')
1353        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, 2, 3), 'abc, abc, 1, 2.000000,  3.00')
1354        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, -2, 3), 'abc, abc, 1, -2.000000,  3.00')
1355        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.5), 'abc, abc, -1, -2.000000,  3.50')
1356        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.57), 'abc, abc, -1, -2.000000,  3.57')
1357        self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 1003.57), 'abc, abc, -1, -2.000000, 1003.57')
1358        if not sys.platform.startswith('java'):
1359            self.assertEqual("%r, %r" % (b"abc", "abc"), "b'abc', 'abc'")
1360            self.assertEqual("%r" % ("\u1234",), "'\u1234'")
1361            self.assertEqual("%a" % ("\u1234",), "'\\u1234'")
1362        self.assertEqual("%(x)s, %(y)s" % {'x':"abc", 'y':"def"}, 'abc, def')
1363        self.assertEqual("%(x)s, %(\xfc)s" % {'x':"abc", '\xfc':"def"}, 'abc, def')
1364
1365        self.assertEqual('%c' % 0x1234, '\u1234')
1366        self.assertEqual('%c' % 0x21483, '\U00021483')
1367        self.assertRaises(OverflowError, "%c".__mod__, (0x110000,))
1368        self.assertEqual('%c' % '\U00021483', '\U00021483')
1369        self.assertRaises(TypeError, "%c".__mod__, "aa")
1370        self.assertRaises(ValueError, "%.1\u1032f".__mod__, (1.0/3))
1371        self.assertRaises(TypeError, "%i".__mod__, "aa")
1372
1373        # formatting jobs delegated from the string implementation:
1374        self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1375        self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1376        self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1377        self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1378        self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123},  '...abc...')
1379        self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...')
1380        self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,"abc"), '...1...2...3...abc...')
1381        self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,"abc"), '...%...%s...1...2...3...abc...')
1382        self.assertEqual('...%s...' % "abc", '...abc...')
1383        self.assertEqual('%*s' % (5,'abc',), '  abc')
1384        self.assertEqual('%*s' % (-5,'abc',), 'abc  ')
1385        self.assertEqual('%*.*s' % (5,2,'abc',), '   ab')
1386        self.assertEqual('%*.*s' % (5,3,'abc',), '  abc')
1387        self.assertEqual('%i %*.*s' % (10, 5,3,'abc',), '10   abc')
1388        self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, 'abc',), '103   abc')
1389        self.assertEqual('%c' % 'a', 'a')
1390        class Wrapper:
1391            def __str__(self):
1392                return '\u1234'
1393        self.assertEqual('%s' % Wrapper(), '\u1234')
1394
1395        # issue 3382
1396        NAN = float('nan')
1397        INF = float('inf')
1398        self.assertEqual('%f' % NAN, 'nan')
1399        self.assertEqual('%F' % NAN, 'NAN')
1400        self.assertEqual('%f' % INF, 'inf')
1401        self.assertEqual('%F' % INF, 'INF')
1402
1403        # PEP 393
1404        self.assertEqual('%.1s' % "a\xe9\u20ac", 'a')
1405        self.assertEqual('%.2s' % "a\xe9\u20ac", 'a\xe9')
1406
1407        #issue 19995
1408        class PseudoInt:
1409            def __init__(self, value):
1410                self.value = int(value)
1411            def __int__(self):
1412                return self.value
1413            def __index__(self):
1414                return self.value
1415        class PseudoFloat:
1416            def __init__(self, value):
1417                self.value = float(value)
1418            def __int__(self):
1419                return int(self.value)
1420        pi = PseudoFloat(3.1415)
1421        letter_m = PseudoInt(109)
1422        self.assertEqual('%x' % 42, '2a')
1423        self.assertEqual('%X' % 15, 'F')
1424        self.assertEqual('%o' % 9, '11')
1425        self.assertEqual('%c' % 109, 'm')
1426        self.assertEqual('%x' % letter_m, '6d')
1427        self.assertEqual('%X' % letter_m, '6D')
1428        self.assertEqual('%o' % letter_m, '155')
1429        self.assertEqual('%c' % letter_m, 'm')
1430        self.assertRaisesRegex(TypeError, '%x format: an integer is required, not float', operator.mod, '%x', 3.14),
1431        self.assertRaisesRegex(TypeError, '%X format: an integer is required, not float', operator.mod, '%X', 2.11),
1432        self.assertRaisesRegex(TypeError, '%o format: an integer is required, not float', operator.mod, '%o', 1.79),
1433        self.assertRaisesRegex(TypeError, '%x format: an integer is required, not PseudoFloat', operator.mod, '%x', pi),
1434        self.assertRaises(TypeError, operator.mod, '%c', pi),
1435
1436    def test_formatting_with_enum(self):
1437        # issue18780
1438        import enum
1439        class Float(float, enum.Enum):
1440            PI = 3.1415926
1441        class Int(enum.IntEnum):
1442            IDES = 15
1443        class Str(str, enum.Enum):
1444            ABC = 'abc'
1445        # Testing Unicode formatting strings...
1446        self.assertEqual("%s, %s" % (Str.ABC, Str.ABC),
1447                         'Str.ABC, Str.ABC')
1448        self.assertEqual("%s, %s, %d, %i, %u, %f, %5.2f" %
1449                        (Str.ABC, Str.ABC,
1450                         Int.IDES, Int.IDES, Int.IDES,
1451                         Float.PI, Float.PI),
1452                         'Str.ABC, Str.ABC, 15, 15, 15, 3.141593,  3.14')
1453
1454        # formatting jobs delegated from the string implementation:
1455        self.assertEqual('...%(foo)s...' % {'foo':Str.ABC},
1456                         '...Str.ABC...')
1457        self.assertEqual('...%(foo)s...' % {'foo':Int.IDES},
1458                         '...Int.IDES...')
1459        self.assertEqual('...%(foo)i...' % {'foo':Int.IDES},
1460                         '...15...')
1461        self.assertEqual('...%(foo)d...' % {'foo':Int.IDES},
1462                         '...15...')
1463        self.assertEqual('...%(foo)u...' % {'foo':Int.IDES, 'def':Float.PI},
1464                         '...15...')
1465        self.assertEqual('...%(foo)f...' % {'foo':Float.PI,'def':123},
1466                         '...3.141593...')
1467
1468    def test_formatting_huge_precision(self):
1469        format_string = "%.{}f".format(sys.maxsize + 1)
1470        with self.assertRaises(ValueError):
1471            result = format_string % 2.34
1472
1473    def test_issue28598_strsubclass_rhs(self):
1474        # A subclass of str with an __rmod__ method should be able to hook
1475        # into the % operator
1476        class SubclassedStr(str):
1477            def __rmod__(self, other):
1478                return 'Success, self.__rmod__({!r}) was called'.format(other)
1479        self.assertEqual('lhs %% %r' % SubclassedStr('rhs'),
1480                         "Success, self.__rmod__('lhs %% %r') was called")
1481
1482    @support.cpython_only
1483    def test_formatting_huge_precision_c_limits(self):
1484        from _testcapi import INT_MAX
1485        format_string = "%.{}f".format(INT_MAX + 1)
1486        with self.assertRaises(ValueError):
1487            result = format_string % 2.34
1488
1489    def test_formatting_huge_width(self):
1490        format_string = "%{}f".format(sys.maxsize + 1)
1491        with self.assertRaises(ValueError):
1492            result = format_string % 2.34
1493
1494    def test_startswith_endswith_errors(self):
1495        for meth in ('foo'.startswith, 'foo'.endswith):
1496            with self.assertRaises(TypeError) as cm:
1497                meth(['f'])
1498            exc = str(cm.exception)
1499            self.assertIn('str', exc)
1500            self.assertIn('tuple', exc)
1501
1502    @support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
1503    def test_format_float(self):
1504        # should not format with a comma, but always with C locale
1505        self.assertEqual('1.0', '%.1f' % 1.0)
1506
1507    def test_constructor(self):
1508        # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
1509
1510        self.assertEqual(
1511            str('unicode remains unicode'),
1512            'unicode remains unicode'
1513        )
1514
1515        for text in ('ascii', '\xe9', '\u20ac', '\U0010FFFF'):
1516            subclass = StrSubclass(text)
1517            self.assertEqual(str(subclass), text)
1518            self.assertEqual(len(subclass), len(text))
1519            if text == 'ascii':
1520                self.assertEqual(subclass.encode('ascii'), b'ascii')
1521                self.assertEqual(subclass.encode('utf-8'), b'ascii')
1522
1523        self.assertEqual(
1524            str('strings are converted to unicode'),
1525            'strings are converted to unicode'
1526        )
1527
1528        class StringCompat:
1529            def __init__(self, x):
1530                self.x = x
1531            def __str__(self):
1532                return self.x
1533
1534        self.assertEqual(
1535            str(StringCompat('__str__ compatible objects are recognized')),
1536            '__str__ compatible objects are recognized'
1537        )
1538
1539        # unicode(obj) is compatible to str():
1540
1541        o = StringCompat('unicode(obj) is compatible to str()')
1542        self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
1543        self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
1544
1545        for obj in (123, 123.45, 123):
1546            self.assertEqual(str(obj), str(str(obj)))
1547
1548        # unicode(obj, encoding, error) tests (this maps to
1549        # PyUnicode_FromEncodedObject() at C level)
1550
1551        if not sys.platform.startswith('java'):
1552            self.assertRaises(
1553                TypeError,
1554                str,
1555                'decoding unicode is not supported',
1556                'utf-8',
1557                'strict'
1558            )
1559
1560        self.assertEqual(
1561            str(b'strings are decoded to unicode', 'utf-8', 'strict'),
1562            'strings are decoded to unicode'
1563        )
1564
1565        if not sys.platform.startswith('java'):
1566            self.assertEqual(
1567                str(
1568                    memoryview(b'character buffers are decoded to unicode'),
1569                    'utf-8',
1570                    'strict'
1571                ),
1572                'character buffers are decoded to unicode'
1573            )
1574
1575        self.assertRaises(TypeError, str, 42, 42, 42)
1576
1577    def test_constructor_keyword_args(self):
1578        """Pass various keyword argument combinations to the constructor."""
1579        # The object argument can be passed as a keyword.
1580        self.assertEqual(str(object='foo'), 'foo')
1581        self.assertEqual(str(object=b'foo', encoding='utf-8'), 'foo')
1582        # The errors argument without encoding triggers "decode" mode.
1583        self.assertEqual(str(b'foo', errors='strict'), 'foo')  # not "b'foo'"
1584        self.assertEqual(str(object=b'foo', errors='strict'), 'foo')
1585
1586    def test_constructor_defaults(self):
1587        """Check the constructor argument defaults."""
1588        # The object argument defaults to '' or b''.
1589        self.assertEqual(str(), '')
1590        self.assertEqual(str(errors='strict'), '')
1591        utf8_cent = '¢'.encode('utf-8')
1592        # The encoding argument defaults to utf-8.
1593        self.assertEqual(str(utf8_cent, errors='strict'), '¢')
1594        # The errors argument defaults to strict.
1595        self.assertRaises(UnicodeDecodeError, str, utf8_cent, encoding='ascii')
1596
1597    def test_codecs_utf7(self):
1598        utfTests = [
1599            ('A\u2262\u0391.', b'A+ImIDkQ.'),             # RFC2152 example
1600            ('Hi Mom -\u263a-!', b'Hi Mom -+Jjo--!'),     # RFC2152 example
1601            ('\u65E5\u672C\u8A9E', b'+ZeVnLIqe-'),        # RFC2152 example
1602            ('Item 3 is \u00a31.', b'Item 3 is +AKM-1.'), # RFC2152 example
1603            ('+', b'+-'),
1604            ('+-', b'+--'),
1605            ('+?', b'+-?'),
1606            (r'\?', b'+AFw?'),
1607            ('+?', b'+-?'),
1608            (r'\\?', b'+AFwAXA?'),
1609            (r'\\\?', b'+AFwAXABc?'),
1610            (r'++--', b'+-+---'),
1611            ('\U000abcde', b'+2m/c3g-'),                  # surrogate pairs
1612            ('/', b'/'),
1613        ]
1614
1615        for (x, y) in utfTests:
1616            self.assertEqual(x.encode('utf-7'), y)
1617
1618        # Unpaired surrogates are passed through
1619        self.assertEqual('\uD801'.encode('utf-7'), b'+2AE-')
1620        self.assertEqual('\uD801x'.encode('utf-7'), b'+2AE-x')
1621        self.assertEqual('\uDC01'.encode('utf-7'), b'+3AE-')
1622        self.assertEqual('\uDC01x'.encode('utf-7'), b'+3AE-x')
1623        self.assertEqual(b'+2AE-'.decode('utf-7'), '\uD801')
1624        self.assertEqual(b'+2AE-x'.decode('utf-7'), '\uD801x')
1625        self.assertEqual(b'+3AE-'.decode('utf-7'), '\uDC01')
1626        self.assertEqual(b'+3AE-x'.decode('utf-7'), '\uDC01x')
1627
1628        self.assertEqual('\uD801\U000abcde'.encode('utf-7'), b'+2AHab9ze-')
1629        self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde')
1630
1631        # Issue #2242: crash on some Windows/MSVC versions
1632        self.assertEqual(b'+\xc1'.decode('utf-7', 'ignore'), '')
1633
1634        # Direct encoded characters
1635        set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
1636        # Optional direct characters
1637        set_o = '!"#$%&*;<=>@[]^_`{|}'
1638        for c in set_d:
1639            self.assertEqual(c.encode('utf7'), c.encode('ascii'))
1640            self.assertEqual(c.encode('ascii').decode('utf7'), c)
1641        for c in set_o:
1642            self.assertEqual(c.encode('ascii').decode('utf7'), c)
1643
1644        with self.assertRaisesRegex(UnicodeDecodeError,
1645                                    'ill-formed sequence'):
1646            b'+@'.decode('utf-7')
1647
1648    def test_codecs_utf8(self):
1649        self.assertEqual(''.encode('utf-8'), b'')
1650        self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac')
1651        self.assertEqual('\U00010002'.encode('utf-8'), b'\xf0\x90\x80\x82')
1652        self.assertEqual('\U00023456'.encode('utf-8'), b'\xf0\xa3\x91\x96')
1653        self.assertEqual('\ud800'.encode('utf-8', 'surrogatepass'), b'\xed\xa0\x80')
1654        self.assertEqual('\udc00'.encode('utf-8', 'surrogatepass'), b'\xed\xb0\x80')
1655        self.assertEqual(('\U00010002'*10).encode('utf-8'),
1656                         b'\xf0\x90\x80\x82'*10)
1657        self.assertEqual(
1658            '\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
1659            '\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
1660            '\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
1661            '\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
1662            '\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
1663            ' Nunstuck git und'.encode('utf-8'),
1664            b'\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
1665            b'\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
1666            b'\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
1667            b'\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
1668            b'\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
1669            b'\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
1670            b'\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
1671            b'\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
1672            b'\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
1673            b'\xe3\x80\x8cWenn ist das Nunstuck git und'
1674        )
1675
1676        # UTF-8 specific decoding tests
1677        self.assertEqual(str(b'\xf0\xa3\x91\x96', 'utf-8'), '\U00023456' )
1678        self.assertEqual(str(b'\xf0\x90\x80\x82', 'utf-8'), '\U00010002' )
1679        self.assertEqual(str(b'\xe2\x82\xac', 'utf-8'), '\u20ac' )
1680
1681        # Other possible utf-8 test cases:
1682        # * strict decoding testing for all of the
1683        #   UTF8_ERROR cases in PyUnicode_DecodeUTF8
1684
1685    def test_utf8_decode_valid_sequences(self):
1686        sequences = [
1687            # single byte
1688            (b'\x00', '\x00'), (b'a', 'a'), (b'\x7f', '\x7f'),
1689            # 2 bytes
1690            (b'\xc2\x80', '\x80'), (b'\xdf\xbf', '\u07ff'),
1691            # 3 bytes
1692            (b'\xe0\xa0\x80', '\u0800'), (b'\xed\x9f\xbf', '\ud7ff'),
1693            (b'\xee\x80\x80', '\uE000'), (b'\xef\xbf\xbf', '\uffff'),
1694            # 4 bytes
1695            (b'\xF0\x90\x80\x80', '\U00010000'),
1696            (b'\xf4\x8f\xbf\xbf', '\U0010FFFF')
1697        ]
1698        for seq, res in sequences:
1699            self.assertEqual(seq.decode('utf-8'), res)
1700
1701
1702    def test_utf8_decode_invalid_sequences(self):
1703        # continuation bytes in a sequence of 2, 3, or 4 bytes
1704        continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
1705        # start bytes of a 2-byte sequence equivalent to code points < 0x7F
1706        invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)]
1707        # start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF
1708        invalid_4B_seq_start_bytes = [bytes([x]) for x in range(0xF5, 0xF8)]
1709        invalid_start_bytes = (
1710            continuation_bytes + invalid_2B_seq_start_bytes +
1711            invalid_4B_seq_start_bytes + [bytes([x]) for x in range(0xF7, 0x100)]
1712        )
1713
1714        for byte in invalid_start_bytes:
1715            self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8')
1716
1717        for sb in invalid_2B_seq_start_bytes:
1718            for cb in continuation_bytes:
1719                self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8')
1720
1721        for sb in invalid_4B_seq_start_bytes:
1722            for cb1 in continuation_bytes[:3]:
1723                for cb3 in continuation_bytes[:3]:
1724                    self.assertRaises(UnicodeDecodeError,
1725                                      (sb+cb1+b'\x80'+cb3).decode, 'utf-8')
1726
1727        for cb in [bytes([x]) for x in range(0x80, 0xA0)]:
1728            self.assertRaises(UnicodeDecodeError,
1729                              (b'\xE0'+cb+b'\x80').decode, 'utf-8')
1730            self.assertRaises(UnicodeDecodeError,
1731                              (b'\xE0'+cb+b'\xBF').decode, 'utf-8')
1732        # surrogates
1733        for cb in [bytes([x]) for x in range(0xA0, 0xC0)]:
1734            self.assertRaises(UnicodeDecodeError,
1735                              (b'\xED'+cb+b'\x80').decode, 'utf-8')
1736            self.assertRaises(UnicodeDecodeError,
1737                              (b'\xED'+cb+b'\xBF').decode, 'utf-8')
1738        for cb in [bytes([x]) for x in range(0x80, 0x90)]:
1739            self.assertRaises(UnicodeDecodeError,
1740                              (b'\xF0'+cb+b'\x80\x80').decode, 'utf-8')
1741            self.assertRaises(UnicodeDecodeError,
1742                              (b'\xF0'+cb+b'\xBF\xBF').decode, 'utf-8')
1743        for cb in [bytes([x]) for x in range(0x90, 0xC0)]:
1744            self.assertRaises(UnicodeDecodeError,
1745                              (b'\xF4'+cb+b'\x80\x80').decode, 'utf-8')
1746            self.assertRaises(UnicodeDecodeError,
1747                              (b'\xF4'+cb+b'\xBF\xBF').decode, 'utf-8')
1748
1749    def test_issue8271(self):
1750        # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
1751        # only the start byte and the continuation byte(s) are now considered
1752        # invalid, instead of the number of bytes specified by the start byte.
1753        # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
1754        # table 3-8, Row 2) for more information about the algorithm used.
1755        FFFD = '\ufffd'
1756        sequences = [
1757            # invalid start bytes
1758            (b'\x80', FFFD), # continuation byte
1759            (b'\x80\x80', FFFD*2), # 2 continuation bytes
1760            (b'\xc0', FFFD),
1761            (b'\xc0\xc0', FFFD*2),
1762            (b'\xc1', FFFD),
1763            (b'\xc1\xc0', FFFD*2),
1764            (b'\xc0\xc1', FFFD*2),
1765            # with start byte of a 2-byte sequence
1766            (b'\xc2', FFFD), # only the start byte
1767            (b'\xc2\xc2', FFFD*2), # 2 start bytes
1768            (b'\xc2\xc2\xc2', FFFD*3), # 3 start bytes
1769            (b'\xc2\x41', FFFD+'A'), # invalid continuation byte
1770            # with start byte of a 3-byte sequence
1771            (b'\xe1', FFFD), # only the start byte
1772            (b'\xe1\xe1', FFFD*2), # 2 start bytes
1773            (b'\xe1\xe1\xe1', FFFD*3), # 3 start bytes
1774            (b'\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
1775            (b'\xe1\x80', FFFD), # only 1 continuation byte
1776            (b'\xe1\x41', FFFD+'A'), # invalid continuation byte
1777            (b'\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
1778            (b'\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
1779            (b'\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
1780            (b'\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
1781            (b'\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
1782            # with start byte of a 4-byte sequence
1783            (b'\xf1', FFFD), # only the start byte
1784            (b'\xf1\xf1', FFFD*2), # 2 start bytes
1785            (b'\xf1\xf1\xf1', FFFD*3), # 3 start bytes
1786            (b'\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
1787            (b'\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
1788            (b'\xf1\x80', FFFD), # only 1 continuation bytes
1789            (b'\xf1\x80\x80', FFFD), # only 2 continuation bytes
1790            (b'\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
1791            (b'\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 1 invalid
1792            (b'\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
1793            (b'\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cv and 1 valid
1794            (b'\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 invalid
1795            (b'\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 invalid
1796            (b'\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 1 valid cb and 1 invalid
1797            (b'\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
1798            (b'\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
1799            (b'\xf1\xf1\x80\x41', FFFD*2+'A'),
1800            (b'\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
1801            # with invalid start byte of a 4-byte sequence (rfc2279)
1802            (b'\xf5', FFFD), # only the start byte
1803            (b'\xf5\xf5', FFFD*2), # 2 start bytes
1804            (b'\xf5\x80', FFFD*2), # only 1 continuation byte
1805            (b'\xf5\x80\x80', FFFD*3), # only 2 continuation byte
1806            (b'\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
1807            (b'\xf5\x80\x41', FFFD*2+'A'), #  1 valid cb and 1 invalid
1808            (b'\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
1809            (b'\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
1810            # with invalid start byte of a 5-byte sequence (rfc2279)
1811            (b'\xf8', FFFD), # only the start byte
1812            (b'\xf8\xf8', FFFD*2), # 2 start bytes
1813            (b'\xf8\x80', FFFD*2), # only one continuation byte
1814            (b'\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
1815            (b'\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
1816            # with invalid start byte of a 6-byte sequence (rfc2279)
1817            (b'\xfc', FFFD), # only the start byte
1818            (b'\xfc\xfc', FFFD*2), # 2 start bytes
1819            (b'\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
1820            (b'\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes
1821            # invalid start byte
1822            (b'\xfe', FFFD),
1823            (b'\xfe\x80\x80', FFFD*3),
1824            # other sequences
1825            (b'\xf1\x80\x41\x42\x43', '\ufffd\x41\x42\x43'),
1826            (b'\xf1\x80\xff\x42\x43', '\ufffd\ufffd\x42\x43'),
1827            (b'\xf1\x80\xc2\x81\x43', '\ufffd\x81\x43'),
1828            (b'\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
1829             '\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
1830        ]
1831        for n, (seq, res) in enumerate(sequences):
1832            self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
1833            self.assertEqual(seq.decode('utf-8', 'replace'), res)
1834            self.assertEqual((seq+b'b').decode('utf-8', 'replace'), res+'b')
1835            self.assertEqual(seq.decode('utf-8', 'ignore'),
1836                             res.replace('\uFFFD', ''))
1837
1838    def assertCorrectUTF8Decoding(self, seq, res, err):
1839        """
1840        Check that an invalid UTF-8 sequence raises a UnicodeDecodeError when
1841        'strict' is used, returns res when 'replace' is used, and that doesn't
1842        return anything when 'ignore' is used.
1843        """
1844        with self.assertRaises(UnicodeDecodeError) as cm:
1845            seq.decode('utf-8')
1846        exc = cm.exception
1847
1848        self.assertIn(err, str(exc))
1849        self.assertEqual(seq.decode('utf-8', 'replace'), res)
1850        self.assertEqual((b'aaaa' + seq + b'bbbb').decode('utf-8', 'replace'),
1851                         'aaaa' + res + 'bbbb')
1852        res = res.replace('\ufffd', '')
1853        self.assertEqual(seq.decode('utf-8', 'ignore'), res)
1854        self.assertEqual((b'aaaa' + seq + b'bbbb').decode('utf-8', 'ignore'),
1855                          'aaaa' + res + 'bbbb')
1856
1857    def test_invalid_start_byte(self):
1858        """
1859        Test that an 'invalid start byte' error is raised when the first byte
1860        is not in the ASCII range or is not a valid start byte of a 2-, 3-, or
1861        4-bytes sequence. The invalid start byte is replaced with a single
1862        U+FFFD when errors='replace'.
1863        E.g. <80> is a continuation byte and can appear only after a start byte.
1864        """
1865        FFFD = '\ufffd'
1866        for byte in b'\x80\xA0\x9F\xBF\xC0\xC1\xF5\xFF':
1867            self.assertCorrectUTF8Decoding(bytes([byte]), '\ufffd',
1868                                           'invalid start byte')
1869
1870    def test_unexpected_end_of_data(self):
1871        """
1872        Test that an 'unexpected end of data' error is raised when the string
1873        ends after a start byte of a 2-, 3-, or 4-bytes sequence without having
1874        enough continuation bytes.  The incomplete sequence is replaced with a
1875        single U+FFFD when errors='replace'.
1876        E.g. in the sequence <F3 80 80>, F3 is the start byte of a 4-bytes
1877        sequence, but it's followed by only 2 valid continuation bytes and the
1878        last continuation bytes is missing.
1879        Note: the continuation bytes must be all valid, if one of them is
1880        invalid another error will be raised.
1881        """
1882        sequences = [
1883            'C2', 'DF',
1884            'E0 A0', 'E0 BF', 'E1 80', 'E1 BF', 'EC 80', 'EC BF',
1885            'ED 80', 'ED 9F', 'EE 80', 'EE BF', 'EF 80', 'EF BF',
1886            'F0 90', 'F0 BF', 'F0 90 80', 'F0 90 BF', 'F0 BF 80', 'F0 BF BF',
1887            'F1 80', 'F1 BF', 'F1 80 80', 'F1 80 BF', 'F1 BF 80', 'F1 BF BF',
1888            'F3 80', 'F3 BF', 'F3 80 80', 'F3 80 BF', 'F3 BF 80', 'F3 BF BF',
1889            'F4 80', 'F4 8F', 'F4 80 80', 'F4 80 BF', 'F4 8F 80', 'F4 8F BF'
1890        ]
1891        FFFD = '\ufffd'
1892        for seq in sequences:
1893            self.assertCorrectUTF8Decoding(bytes.fromhex(seq), '\ufffd',
1894                                           'unexpected end of data')
1895
1896    def test_invalid_cb_for_2bytes_seq(self):
1897        """
1898        Test that an 'invalid continuation byte' error is raised when the
1899        continuation byte of a 2-bytes sequence is invalid.  The start byte
1900        is replaced by a single U+FFFD and the second byte is handled
1901        separately when errors='replace'.
1902        E.g. in the sequence <C2 41>, C2 is the start byte of a 2-bytes
1903        sequence, but 41 is not a valid continuation byte because it's the
1904        ASCII letter 'A'.
1905        """
1906        FFFD = '\ufffd'
1907        FFFDx2 = FFFD * 2
1908        sequences = [
1909            ('C2 00', FFFD+'\x00'), ('C2 7F', FFFD+'\x7f'),
1910            ('C2 C0', FFFDx2), ('C2 FF', FFFDx2),
1911            ('DF 00', FFFD+'\x00'), ('DF 7F', FFFD+'\x7f'),
1912            ('DF C0', FFFDx2), ('DF FF', FFFDx2),
1913        ]
1914        for seq, res in sequences:
1915            self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res,
1916                                           'invalid continuation byte')
1917
1918    def test_invalid_cb_for_3bytes_seq(self):
1919        """
1920        Test that an 'invalid continuation byte' error is raised when the
1921        continuation byte(s) of a 3-bytes sequence are invalid.  When
1922        errors='replace', if the first continuation byte is valid, the first
1923        two bytes (start byte + 1st cb) are replaced by a single U+FFFD and the
1924        third byte is handled separately, otherwise only the start byte is
1925        replaced with a U+FFFD and the other continuation bytes are handled
1926        separately.
1927        E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
1928        sequence, 80 is a valid continuation byte, but 41 is not a valid cb
1929        because it's the ASCII letter 'A'.
1930        Note: when the start byte is E0 or ED, the valid ranges for the first
1931        continuation byte are limited to A0..BF and 80..9F respectively.
1932        Python 2 used to consider all the bytes in range 80..BF valid when the
1933        start byte was ED.  This is fixed in Python 3.
1934        """
1935        FFFD = '\ufffd'
1936        FFFDx2 = FFFD * 2
1937        sequences = [
1938            ('E0 00', FFFD+'\x00'), ('E0 7F', FFFD+'\x7f'), ('E0 80', FFFDx2),
1939            ('E0 9F', FFFDx2), ('E0 C0', FFFDx2), ('E0 FF', FFFDx2),
1940            ('E0 A0 00', FFFD+'\x00'), ('E0 A0 7F', FFFD+'\x7f'),
1941            ('E0 A0 C0', FFFDx2), ('E0 A0 FF', FFFDx2),
1942            ('E0 BF 00', FFFD+'\x00'), ('E0 BF 7F', FFFD+'\x7f'),
1943            ('E0 BF C0', FFFDx2), ('E0 BF FF', FFFDx2), ('E1 00', FFFD+'\x00'),
1944            ('E1 7F', FFFD+'\x7f'), ('E1 C0', FFFDx2), ('E1 FF', FFFDx2),
1945            ('E1 80 00', FFFD+'\x00'), ('E1 80 7F', FFFD+'\x7f'),
1946            ('E1 80 C0', FFFDx2), ('E1 80 FF', FFFDx2),
1947            ('E1 BF 00', FFFD+'\x00'), ('E1 BF 7F', FFFD+'\x7f'),
1948            ('E1 BF C0', FFFDx2), ('E1 BF FF', FFFDx2), ('EC 00', FFFD+'\x00'),
1949            ('EC 7F', FFFD+'\x7f'), ('EC C0', FFFDx2), ('EC FF', FFFDx2),
1950            ('EC 80 00', FFFD+'\x00'), ('EC 80 7F', FFFD+'\x7f'),
1951            ('EC 80 C0', FFFDx2), ('EC 80 FF', FFFDx2),
1952            ('EC BF 00', FFFD+'\x00'), ('EC BF 7F', FFFD+'\x7f'),
1953            ('EC BF C0', FFFDx2), ('EC BF FF', FFFDx2), ('ED 00', FFFD+'\x00'),
1954            ('ED 7F', FFFD+'\x7f'),
1955            ('ED A0', FFFDx2), ('ED BF', FFFDx2), # see note ^
1956            ('ED C0', FFFDx2), ('ED FF', FFFDx2), ('ED 80 00', FFFD+'\x00'),
1957            ('ED 80 7F', FFFD+'\x7f'), ('ED 80 C0', FFFDx2),
1958            ('ED 80 FF', FFFDx2), ('ED 9F 00', FFFD+'\x00'),
1959            ('ED 9F 7F', FFFD+'\x7f'), ('ED 9F C0', FFFDx2),
1960            ('ED 9F FF', FFFDx2), ('EE 00', FFFD+'\x00'),
1961            ('EE 7F', FFFD+'\x7f'), ('EE C0', FFFDx2), ('EE FF', FFFDx2),
1962            ('EE 80 00', FFFD+'\x00'), ('EE 80 7F', FFFD+'\x7f'),
1963            ('EE 80 C0', FFFDx2), ('EE 80 FF', FFFDx2),
1964            ('EE BF 00', FFFD+'\x00'), ('EE BF 7F', FFFD+'\x7f'),
1965            ('EE BF C0', FFFDx2), ('EE BF FF', FFFDx2), ('EF 00', FFFD+'\x00'),
1966            ('EF 7F', FFFD+'\x7f'), ('EF C0', FFFDx2), ('EF FF', FFFDx2),
1967            ('EF 80 00', FFFD+'\x00'), ('EF 80 7F', FFFD+'\x7f'),
1968            ('EF 80 C0', FFFDx2), ('EF 80 FF', FFFDx2),
1969            ('EF BF 00', FFFD+'\x00'), ('EF BF 7F', FFFD+'\x7f'),
1970            ('EF BF C0', FFFDx2), ('EF BF FF', FFFDx2),
1971        ]
1972        for seq, res in sequences:
1973            self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res,
1974                                           'invalid continuation byte')
1975
1976    def test_invalid_cb_for_4bytes_seq(self):
1977        """
1978        Test that an 'invalid continuation byte' error is raised when the
1979        continuation byte(s) of a 4-bytes sequence are invalid.  When
1980        errors='replace',the start byte and all the following valid
1981        continuation bytes are replaced with a single U+FFFD, and all the bytes
1982        starting from the first invalid continuation bytes (included) are
1983        handled separately.
1984        E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
1985        sequence, 80 is a valid continuation byte, but 41 is not a valid cb
1986        because it's the ASCII letter 'A'.
1987        Note: when the start byte is E0 or ED, the valid ranges for the first
1988        continuation byte are limited to A0..BF and 80..9F respectively.
1989        However, when the start byte is ED, Python 2 considers all the bytes
1990        in range 80..BF valid.  This is fixed in Python 3.
1991        """
1992        FFFD = '\ufffd'
1993        FFFDx2 = FFFD * 2
1994        sequences = [
1995            ('F0 00', FFFD+'\x00'), ('F0 7F', FFFD+'\x7f'), ('F0 80', FFFDx2),
1996            ('F0 8F', FFFDx2), ('F0 C0', FFFDx2), ('F0 FF', FFFDx2),
1997            ('F0 90 00', FFFD+'\x00'), ('F0 90 7F', FFFD+'\x7f'),
1998            ('F0 90 C0', FFFDx2), ('F0 90 FF', FFFDx2),
1999            ('F0 BF 00', FFFD+'\x00'), ('F0 BF 7F', FFFD+'\x7f'),
2000            ('F0 BF C0', FFFDx2), ('F0 BF FF', FFFDx2),
2001            ('F0 90 80 00', FFFD+'\x00'), ('F0 90 80 7F', FFFD+'\x7f'),
2002            ('F0 90 80 C0', FFFDx2), ('F0 90 80 FF', FFFDx2),
2003            ('F0 90 BF 00', FFFD+'\x00'), ('F0 90 BF 7F', FFFD+'\x7f'),
2004            ('F0 90 BF C0', FFFDx2), ('F0 90 BF FF', FFFDx2),
2005            ('F0 BF 80 00', FFFD+'\x00'), ('F0 BF 80 7F', FFFD+'\x7f'),
2006            ('F0 BF 80 C0', FFFDx2), ('F0 BF 80 FF', FFFDx2),
2007            ('F0 BF BF 00', FFFD+'\x00'), ('F0 BF BF 7F', FFFD+'\x7f'),
2008            ('F0 BF BF C0', FFFDx2), ('F0 BF BF FF', FFFDx2),
2009            ('F1 00', FFFD+'\x00'), ('F1 7F', FFFD+'\x7f'), ('F1 C0', FFFDx2),
2010            ('F1 FF', FFFDx2), ('F1 80 00', FFFD+'\x00'),
2011            ('F1 80 7F', FFFD+'\x7f'), ('F1 80 C0', FFFDx2),
2012            ('F1 80 FF', FFFDx2), ('F1 BF 00', FFFD+'\x00'),
2013            ('F1 BF 7F', FFFD+'\x7f'), ('F1 BF C0', FFFDx2),
2014            ('F1 BF FF', FFFDx2), ('F1 80 80 00', FFFD+'\x00'),
2015            ('F1 80 80 7F', FFFD+'\x7f'), ('F1 80 80 C0', FFFDx2),
2016            ('F1 80 80 FF', FFFDx2), ('F1 80 BF 00', FFFD+'\x00'),
2017            ('F1 80 BF 7F', FFFD+'\x7f'), ('F1 80 BF C0', FFFDx2),
2018            ('F1 80 BF FF', FFFDx2), ('F1 BF 80 00', FFFD+'\x00'),
2019            ('F1 BF 80 7F', FFFD+'\x7f'), ('F1 BF 80 C0', FFFDx2),
2020            ('F1 BF 80 FF', FFFDx2), ('F1 BF BF 00', FFFD+'\x00'),
2021            ('F1 BF BF 7F', FFFD+'\x7f'), ('F1 BF BF C0', FFFDx2),
2022            ('F1 BF BF FF', FFFDx2), ('F3 00', FFFD+'\x00'),
2023            ('F3 7F', FFFD+'\x7f'), ('F3 C0', FFFDx2), ('F3 FF', FFFDx2),
2024            ('F3 80 00', FFFD+'\x00'), ('F3 80 7F', FFFD+'\x7f'),
2025            ('F3 80 C0', FFFDx2), ('F3 80 FF', FFFDx2),
2026            ('F3 BF 00', FFFD+'\x00'), ('F3 BF 7F', FFFD+'\x7f'),
2027            ('F3 BF C0', FFFDx2), ('F3 BF FF', FFFDx2),
2028            ('F3 80 80 00', FFFD+'\x00'), ('F3 80 80 7F', FFFD+'\x7f'),
2029            ('F3 80 80 C0', FFFDx2), ('F3 80 80 FF', FFFDx2),
2030            ('F3 80 BF 00', FFFD+'\x00'), ('F3 80 BF 7F', FFFD+'\x7f'),
2031            ('F3 80 BF C0', FFFDx2), ('F3 80 BF FF', FFFDx2),
2032            ('F3 BF 80 00', FFFD+'\x00'), ('F3 BF 80 7F', FFFD+'\x7f'),
2033            ('F3 BF 80 C0', FFFDx2), ('F3 BF 80 FF', FFFDx2),
2034            ('F3 BF BF 00', FFFD+'\x00'), ('F3 BF BF 7F', FFFD+'\x7f'),
2035            ('F3 BF BF C0', FFFDx2), ('F3 BF BF FF', FFFDx2),
2036            ('F4 00', FFFD+'\x00'), ('F4 7F', FFFD+'\x7f'), ('F4 90', FFFDx2),
2037            ('F4 BF', FFFDx2), ('F4 C0', FFFDx2), ('F4 FF', FFFDx2),
2038            ('F4 80 00', FFFD+'\x00'), ('F4 80 7F', FFFD+'\x7f'),
2039            ('F4 80 C0', FFFDx2), ('F4 80 FF', FFFDx2),
2040            ('F4 8F 00', FFFD+'\x00'), ('F4 8F 7F', FFFD+'\x7f'),
2041            ('F4 8F C0', FFFDx2), ('F4 8F FF', FFFDx2),
2042            ('F4 80 80 00', FFFD+'\x00'), ('F4 80 80 7F', FFFD+'\x7f'),
2043            ('F4 80 80 C0', FFFDx2), ('F4 80 80 FF', FFFDx2),
2044            ('F4 80 BF 00', FFFD+'\x00'), ('F4 80 BF 7F', FFFD+'\x7f'),
2045            ('F4 80 BF C0', FFFDx2), ('F4 80 BF FF', FFFDx2),
2046            ('F4 8F 80 00', FFFD+'\x00'), ('F4 8F 80 7F', FFFD+'\x7f'),
2047            ('F4 8F 80 C0', FFFDx2), ('F4 8F 80 FF', FFFDx2),
2048            ('F4 8F BF 00', FFFD+'\x00'), ('F4 8F BF 7F', FFFD+'\x7f'),
2049            ('F4 8F BF C0', FFFDx2), ('F4 8F BF FF', FFFDx2)
2050        ]
2051        for seq, res in sequences:
2052            self.assertCorrectUTF8Decoding(bytes.fromhex(seq), res,
2053                                           'invalid continuation byte')
2054
2055    def test_codecs_idna(self):
2056        # Test whether trailing dot is preserved
2057        self.assertEqual("www.python.org.".encode("idna"), b"www.python.org.")
2058
2059    def test_codecs_errors(self):
2060        # Error handling (encoding)
2061        self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii')
2062        self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii','strict')
2063        self.assertEqual('Andr\202 x'.encode('ascii','ignore'), b"Andr x")
2064        self.assertEqual('Andr\202 x'.encode('ascii','replace'), b"Andr? x")
2065        self.assertEqual('Andr\202 x'.encode('ascii', 'replace'),
2066                         'Andr\202 x'.encode('ascii', errors='replace'))
2067        self.assertEqual('Andr\202 x'.encode('ascii', 'ignore'),
2068                         'Andr\202 x'.encode(encoding='ascii', errors='ignore'))
2069
2070        # Error handling (decoding)
2071        self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii')
2072        self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii', 'strict')
2073        self.assertEqual(str(b'Andr\202 x', 'ascii', 'ignore'), "Andr x")
2074        self.assertEqual(str(b'Andr\202 x', 'ascii', 'replace'), 'Andr\uFFFD x')
2075        self.assertEqual(str(b'\202 x', 'ascii', 'replace'), '\uFFFD x')
2076
2077        # Error handling (unknown character names)
2078        self.assertEqual(b"\\N{foo}xx".decode("unicode-escape", "ignore"), "xx")
2079
2080        # Error handling (truncated escape sequence)
2081        self.assertRaises(UnicodeError, b"\\".decode, "unicode-escape")
2082
2083        self.assertRaises(TypeError, b"hello".decode, "test.unicode1")
2084        self.assertRaises(TypeError, str, b"hello", "test.unicode2")
2085        self.assertRaises(TypeError, "hello".encode, "test.unicode1")
2086        self.assertRaises(TypeError, "hello".encode, "test.unicode2")
2087
2088        # Error handling (wrong arguments)
2089        self.assertRaises(TypeError, "hello".encode, 42, 42, 42)
2090
2091        # Error handling (lone surrogate in
2092        # _PyUnicode_TransformDecimalAndSpaceToASCII())
2093        self.assertRaises(ValueError, int, "\ud800")
2094        self.assertRaises(ValueError, int, "\udf00")
2095        self.assertRaises(ValueError, float, "\ud800")
2096        self.assertRaises(ValueError, float, "\udf00")
2097        self.assertRaises(ValueError, complex, "\ud800")
2098        self.assertRaises(ValueError, complex, "\udf00")
2099
2100    def test_codecs(self):
2101        # Encoding
2102        self.assertEqual('hello'.encode('ascii'), b'hello')
2103        self.assertEqual('hello'.encode('utf-7'), b'hello')
2104        self.assertEqual('hello'.encode('utf-8'), b'hello')
2105        self.assertEqual('hello'.encode('utf-8'), b'hello')
2106        self.assertEqual('hello'.encode('utf-16-le'), b'h\000e\000l\000l\000o\000')
2107        self.assertEqual('hello'.encode('utf-16-be'), b'\000h\000e\000l\000l\000o')
2108        self.assertEqual('hello'.encode('latin-1'), b'hello')
2109
2110        # Default encoding is utf-8
2111        self.assertEqual('\u2603'.encode(), b'\xe2\x98\x83')
2112
2113        # Roundtrip safety for BMP (just the first 1024 chars)
2114        for c in range(1024):
2115            u = chr(c)
2116            for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
2117                             'utf-16-be', 'raw_unicode_escape',
2118                             'unicode_escape'):
2119                self.assertEqual(str(u.encode(encoding),encoding), u)
2120
2121        # Roundtrip safety for BMP (just the first 256 chars)
2122        for c in range(256):
2123            u = chr(c)
2124            for encoding in ('latin-1',):
2125                self.assertEqual(str(u.encode(encoding),encoding), u)
2126
2127        # Roundtrip safety for BMP (just the first 128 chars)
2128        for c in range(128):
2129            u = chr(c)
2130            for encoding in ('ascii',):
2131                self.assertEqual(str(u.encode(encoding),encoding), u)
2132
2133        # Roundtrip safety for non-BMP (just a few chars)
2134        with warnings.catch_warnings():
2135            u = '\U00010001\U00020002\U00030003\U00040004\U00050005'
2136            for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
2137                             'raw_unicode_escape', 'unicode_escape'):
2138                self.assertEqual(str(u.encode(encoding),encoding), u)
2139
2140        # UTF-8 must be roundtrip safe for all code points
2141        # (except surrogates, which are forbidden).
2142        u = ''.join(map(chr, list(range(0, 0xd800)) +
2143                             list(range(0xe000, 0x110000))))
2144        for encoding in ('utf-8',):
2145            self.assertEqual(str(u.encode(encoding),encoding), u)
2146
2147    def test_codecs_charmap(self):
2148        # 0-127
2149        s = bytes(range(128))
2150        for encoding in (
2151            'cp037', 'cp1026', 'cp273',
2152            'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
2153            'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
2154            'cp863', 'cp865', 'cp866', 'cp1125',
2155            'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
2156            'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
2157            'iso8859_7', 'iso8859_9',
2158            'koi8_r', 'koi8_t', 'koi8_u', 'kz1048', 'latin_1',
2159            'mac_cyrillic', 'mac_latin2',
2160
2161            'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
2162            'cp1256', 'cp1257', 'cp1258',
2163            'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
2164
2165            'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
2166            'cp1006', 'iso8859_8',
2167
2168            ### These have undefined mappings:
2169            #'cp424',
2170
2171            ### These fail the round-trip:
2172            #'cp875'
2173
2174            ):
2175            self.assertEqual(str(s, encoding).encode(encoding), s)
2176
2177        # 128-255
2178        s = bytes(range(128, 256))
2179        for encoding in (
2180            'cp037', 'cp1026', 'cp273',
2181            'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
2182            'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
2183            'cp863', 'cp865', 'cp866', 'cp1125',
2184            'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
2185            'iso8859_2', 'iso8859_4', 'iso8859_5',
2186            'iso8859_9', 'koi8_r', 'koi8_u', 'latin_1',
2187            'mac_cyrillic', 'mac_latin2',
2188
2189            ### These have undefined mappings:
2190            #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
2191            #'cp1256', 'cp1257', 'cp1258',
2192            #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
2193            #'iso8859_3', 'iso8859_6', 'iso8859_7', 'koi8_t', 'kz1048',
2194            #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
2195
2196            ### These fail the round-trip:
2197            #'cp1006', 'cp875', 'iso8859_8',
2198
2199            ):
2200            self.assertEqual(str(s, encoding).encode(encoding), s)
2201
2202    def test_concatenation(self):
2203        self.assertEqual(("abc" "def"), "abcdef")
2204        self.assertEqual(("abc" "def"), "abcdef")
2205        self.assertEqual(("abc" "def"), "abcdef")
2206        self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
2207        self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
2208
2209    def test_printing(self):
2210        class BitBucket:
2211            def write(self, text):
2212                pass
2213
2214        out = BitBucket()
2215        print('abc', file=out)
2216        print('abc', 'def', file=out)
2217        print('abc', 'def', file=out)
2218        print('abc', 'def', file=out)
2219        print('abc\n', file=out)
2220        print('abc\n', end=' ', file=out)
2221        print('abc\n', end=' ', file=out)
2222        print('def\n', file=out)
2223        print('def\n', file=out)
2224
2225    def test_ucs4(self):
2226        x = '\U00100000'
2227        y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
2228        self.assertEqual(x, y)
2229
2230        y = br'\U00100000'
2231        x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2232        self.assertEqual(x, y)
2233        y = br'\U00010000'
2234        x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
2235        self.assertEqual(x, y)
2236
2237        try:
2238            br'\U11111111'.decode("raw-unicode-escape")
2239        except UnicodeDecodeError as e:
2240            self.assertEqual(e.start, 0)
2241            self.assertEqual(e.end, 10)
2242        else:
2243            self.fail("Should have raised UnicodeDecodeError")
2244
2245    def test_conversion(self):
2246        # Make sure __str__() works properly
2247        class ObjectToStr:
2248            def __str__(self):
2249                return "foo"
2250
2251        class StrSubclassToStr(str):
2252            def __str__(self):
2253                return "foo"
2254
2255        class StrSubclassToStrSubclass(str):
2256            def __new__(cls, content=""):
2257                return str.__new__(cls, 2*content)
2258            def __str__(self):
2259                return self
2260
2261        self.assertEqual(str(ObjectToStr()), "foo")
2262        self.assertEqual(str(StrSubclassToStr("bar")), "foo")
2263        s = str(StrSubclassToStrSubclass("foo"))
2264        self.assertEqual(s, "foofoo")
2265        self.assertIs(type(s), StrSubclassToStrSubclass)
2266        s = StrSubclass(StrSubclassToStrSubclass("foo"))
2267        self.assertEqual(s, "foofoo")
2268        self.assertIs(type(s), StrSubclass)
2269
2270    def test_unicode_repr(self):
2271        class s1:
2272            def __repr__(self):
2273                return '\\n'
2274
2275        class s2:
2276            def __repr__(self):
2277                return '\\n'
2278
2279        self.assertEqual(repr(s1()), '\\n')
2280        self.assertEqual(repr(s2()), '\\n')
2281
2282    def test_printable_repr(self):
2283        self.assertEqual(repr('\U00010000'), "'%c'" % (0x10000,)) # printable
2284        self.assertEqual(repr('\U00014000'), "'\\U00014000'")     # nonprintable
2285
2286    # This test only affects 32-bit platforms because expandtabs can only take
2287    # an int as the max value, not a 64-bit C long.  If expandtabs is changed
2288    # to take a 64-bit long, this test should apply to all platforms.
2289    @unittest.skipIf(sys.maxsize > (1 << 32) or struct.calcsize('P') != 4,
2290                     'only applies to 32-bit platforms')
2291    def test_expandtabs_overflows_gracefully(self):
2292        self.assertRaises(OverflowError, 't\tt\t'.expandtabs, sys.maxsize)
2293
2294    @support.cpython_only
2295    def test_expandtabs_optimization(self):
2296        s = 'abc'
2297        self.assertIs(s.expandtabs(), s)
2298
2299    def test_raiseMemError(self):
2300        if struct.calcsize('P') == 8:
2301            # 64 bits pointers
2302            ascii_struct_size = 48
2303            compact_struct_size = 72
2304        else:
2305            # 32 bits pointers
2306            ascii_struct_size = 24
2307            compact_struct_size = 36
2308
2309        for char in ('a', '\xe9', '\u20ac', '\U0010ffff'):
2310            code = ord(char)
2311            if code < 0x100:
2312                char_size = 1  # sizeof(Py_UCS1)
2313                struct_size = ascii_struct_size
2314            elif code < 0x10000:
2315                char_size = 2  # sizeof(Py_UCS2)
2316                struct_size = compact_struct_size
2317            else:
2318                char_size = 4  # sizeof(Py_UCS4)
2319                struct_size = compact_struct_size
2320            # Note: sys.maxsize is half of the actual max allocation because of
2321            # the signedness of Py_ssize_t. Strings of maxlen-1 should in principle
2322            # be allocatable, given enough memory.
2323            maxlen = ((sys.maxsize - struct_size) // char_size)
2324            alloc = lambda: char * maxlen
2325            self.assertRaises(MemoryError, alloc)
2326            self.assertRaises(MemoryError, alloc)
2327
2328    def test_format_subclass(self):
2329        class S(str):
2330            def __str__(self):
2331                return '__str__ overridden'
2332        s = S('xxx')
2333        self.assertEqual("%s" % s, '__str__ overridden')
2334        self.assertEqual("{}".format(s), '__str__ overridden')
2335
2336    def test_subclass_add(self):
2337        class S(str):
2338            def __add__(self, o):
2339                return "3"
2340        self.assertEqual(S("4") + S("5"), "3")
2341        class S(str):
2342            def __iadd__(self, o):
2343                return "3"
2344        s = S("1")
2345        s += "4"
2346        self.assertEqual(s, "3")
2347
2348    def test_getnewargs(self):
2349        text = 'abc'
2350        args = text.__getnewargs__()
2351        self.assertIsNot(args[0], text)
2352        self.assertEqual(args[0], text)
2353        self.assertEqual(len(args), 1)
2354
    @support.cpython_only
    def test_resize(self):
        # Check that _testcapi.getargs_u() still returns the current contents
        # of a string after that string has been grown with "+=".
        # NOTE(review): the statement order matters here; the string has to
        # stay a fresh object up to the "+=" (see the comments below).
        from _testcapi import getargs_u
        for length in range(1, 100, 7):
            # generate a fresh string (refcount=1)
            text = 'a' * length + 'b'

            # fill wstr internal field
            abc = getargs_u(text)
            self.assertEqual(abc, text)

            # resize text: wstr field must be cleared and then recomputed
            text += 'c'
            abcdef = getargs_u(text)
            # The second conversion must reflect the appended character and
            # not the value obtained before the resize.
            self.assertNotEqual(abc, abcdef)
            self.assertEqual(abcdef, text)
2371
2372    def test_compare(self):
2373        # Issue #17615
2374        N = 10
2375        ascii = 'a' * N
2376        ascii2 = 'z' * N
2377        latin = '\x80' * N
2378        latin2 = '\xff' * N
2379        bmp = '\u0100' * N
2380        bmp2 = '\uffff' * N
2381        astral = '\U00100000' * N
2382        astral2 = '\U0010ffff' * N
2383        strings = (
2384            ascii, ascii2,
2385            latin, latin2,
2386            bmp, bmp2,
2387            astral, astral2)
2388        for text1, text2 in itertools.combinations(strings, 2):
2389            equal = (text1 is text2)
2390            self.assertEqual(text1 == text2, equal)
2391            self.assertEqual(text1 != text2, not equal)
2392
2393            if equal:
2394                self.assertTrue(text1 <= text2)
2395                self.assertTrue(text1 >= text2)
2396
2397                # text1 is text2: duplicate strings to skip the "str1 == str2"
2398                # optimization in unicode_compare_eq() and really compare
2399                # character per character
2400                copy1 = duplicate_string(text1)
2401                copy2 = duplicate_string(text2)
2402                self.assertIsNot(copy1, copy2)
2403
2404                self.assertTrue(copy1 == copy2)
2405                self.assertFalse(copy1 != copy2)
2406
2407                self.assertTrue(copy1 <= copy2)
2408                self.assertTrue(copy2 >= copy2)
2409
2410        self.assertTrue(ascii < ascii2)
2411        self.assertTrue(ascii < latin)
2412        self.assertTrue(ascii < bmp)
2413        self.assertTrue(ascii < astral)
2414        self.assertFalse(ascii >= ascii2)
2415        self.assertFalse(ascii >= latin)
2416        self.assertFalse(ascii >= bmp)
2417        self.assertFalse(ascii >= astral)
2418
2419        self.assertFalse(latin < ascii)
2420        self.assertTrue(latin < latin2)
2421        self.assertTrue(latin < bmp)
2422        self.assertTrue(latin < astral)
2423        self.assertTrue(latin >= ascii)
2424        self.assertFalse(latin >= latin2)
2425        self.assertFalse(latin >= bmp)
2426        self.assertFalse(latin >= astral)
2427
2428        self.assertFalse(bmp < ascii)
2429        self.assertFalse(bmp < latin)
2430        self.assertTrue(bmp < bmp2)
2431        self.assertTrue(bmp < astral)
2432        self.assertTrue(bmp >= ascii)
2433        self.assertTrue(bmp >= latin)
2434        self.assertFalse(bmp >= bmp2)
2435        self.assertFalse(bmp >= astral)
2436
2437        self.assertFalse(astral < ascii)
2438        self.assertFalse(astral < latin)
2439        self.assertFalse(astral < bmp2)
2440        self.assertTrue(astral < astral2)
2441        self.assertTrue(astral >= ascii)
2442        self.assertTrue(astral >= latin)
2443        self.assertTrue(astral >= bmp2)
2444        self.assertFalse(astral >= astral2)
2445
2446    def test_free_after_iterating(self):
2447        support.check_free_after_iterating(self, iter, str)
2448        support.check_free_after_iterating(self, reversed, str)
2449
2450
2451class CAPITest(unittest.TestCase):
2452
2453    # Test PyUnicode_FromFormat()
2454    def test_from_format(self):
2455        support.import_module('ctypes')
2456        from ctypes import (
2457            pythonapi, py_object, sizeof,
2458            c_int, c_long, c_longlong, c_ssize_t,
2459            c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p)
2460        name = "PyUnicode_FromFormat"
2461        _PyUnicode_FromFormat = getattr(pythonapi, name)
2462        _PyUnicode_FromFormat.restype = py_object
2463
2464        def PyUnicode_FromFormat(format, *args):
2465            cargs = tuple(
2466                py_object(arg) if isinstance(arg, str) else arg
2467                for arg in args)
2468            return _PyUnicode_FromFormat(format, *cargs)
2469
2470        def check_format(expected, format, *args):
2471            text = PyUnicode_FromFormat(format, *args)
2472            self.assertEqual(expected, text)
2473
2474        # ascii format, non-ascii argument
2475        check_format('ascii\x7f=unicode\xe9',
2476                     b'ascii\x7f=%U', 'unicode\xe9')
2477
2478        # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
2479        # raises an error
2480        self.assertRaisesRegex(ValueError,
2481            r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
2482            'string, got a non-ASCII byte: 0xe9$',
2483            PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')
2484
2485        # test "%c"
2486        check_format('\uabcd',
2487                     b'%c', c_int(0xabcd))
2488        check_format('\U0010ffff',
2489                     b'%c', c_int(0x10ffff))
2490        with self.assertRaises(OverflowError):
2491            PyUnicode_FromFormat(b'%c', c_int(0x110000))
2492        # Issue #18183
2493        check_format('\U00010000\U00100000',
2494                     b'%c%c', c_int(0x10000), c_int(0x100000))
2495
2496        # test "%"
2497        check_format('%',
2498                     b'%')
2499        check_format('%',
2500                     b'%%')
2501        check_format('%s',
2502                     b'%%s')
2503        check_format('[%]',
2504                     b'[%%]')
2505        check_format('%abc',
2506                     b'%%%s', b'abc')
2507
2508        # truncated string
2509        check_format('abc',
2510                     b'%.3s', b'abcdef')
2511        check_format('abc[\ufffd',
2512                     b'%.5s', 'abc[\u20ac]'.encode('utf8'))
2513        check_format("'\\u20acABC'",
2514                     b'%A', '\u20acABC')
2515        check_format("'\\u20",
2516                     b'%.5A', '\u20acABCDEF')
2517        check_format("'\u20acABC'",
2518                     b'%R', '\u20acABC')
2519        check_format("'\u20acA",
2520                     b'%.3R', '\u20acABCDEF')
2521        check_format('\u20acAB',
2522                     b'%.3S', '\u20acABCDEF')
2523        check_format('\u20acAB',
2524                     b'%.3U', '\u20acABCDEF')
2525        check_format('\u20acAB',
2526                     b'%.3V', '\u20acABCDEF', None)
2527        check_format('abc[\ufffd',
2528                     b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))
2529
2530        # following tests comes from #7330
2531        # test width modifier and precision modifier with %S
2532        check_format("repr=  abc",
2533                     b'repr=%5S', 'abc')
2534        check_format("repr=ab",
2535                     b'repr=%.2S', 'abc')
2536        check_format("repr=   ab",
2537                     b'repr=%5.2S', 'abc')
2538
2539        # test width modifier and precision modifier with %R
2540        check_format("repr=   'abc'",
2541                     b'repr=%8R', 'abc')
2542        check_format("repr='ab",
2543                     b'repr=%.3R', 'abc')
2544        check_format("repr=  'ab",
2545                     b'repr=%5.3R', 'abc')
2546
2547        # test width modifier and precision modifier with %A
2548        check_format("repr=   'abc'",
2549                     b'repr=%8A', 'abc')
2550        check_format("repr='ab",
2551                     b'repr=%.3A', 'abc')
2552        check_format("repr=  'ab",
2553                     b'repr=%5.3A', 'abc')
2554
2555        # test width modifier and precision modifier with %s
2556        check_format("repr=  abc",
2557                     b'repr=%5s', b'abc')
2558        check_format("repr=ab",
2559                     b'repr=%.2s', b'abc')
2560        check_format("repr=   ab",
2561                     b'repr=%5.2s', b'abc')
2562
2563        # test width modifier and precision modifier with %U
2564        check_format("repr=  abc",
2565                     b'repr=%5U', 'abc')
2566        check_format("repr=ab",
2567                     b'repr=%.2U', 'abc')
2568        check_format("repr=   ab",
2569                     b'repr=%5.2U', 'abc')
2570
2571        # test width modifier and precision modifier with %V
2572        check_format("repr=  abc",
2573                     b'repr=%5V', 'abc', b'123')
2574        check_format("repr=ab",
2575                     b'repr=%.2V', 'abc', b'123')
2576        check_format("repr=   ab",
2577                     b'repr=%5.2V', 'abc', b'123')
2578        check_format("repr=  123",
2579                     b'repr=%5V', None, b'123')
2580        check_format("repr=12",
2581                     b'repr=%.2V', None, b'123')
2582        check_format("repr=   12",
2583                     b'repr=%5.2V', None, b'123')
2584
2585        # test integer formats (%i, %d, %u)
2586        check_format('010',
2587                     b'%03i', c_int(10))
2588        check_format('0010',
2589                     b'%0.4i', c_int(10))
2590        check_format('-123',
2591                     b'%i', c_int(-123))
2592        check_format('-123',
2593                     b'%li', c_long(-123))
2594        check_format('-123',
2595                     b'%lli', c_longlong(-123))
2596        check_format('-123',
2597                     b'%zi', c_ssize_t(-123))
2598
2599        check_format('-123',
2600                     b'%d', c_int(-123))
2601        check_format('-123',
2602                     b'%ld', c_long(-123))
2603        check_format('-123',
2604                     b'%lld', c_longlong(-123))
2605        check_format('-123',
2606                     b'%zd', c_ssize_t(-123))
2607
2608        check_format('123',
2609                     b'%u', c_uint(123))
2610        check_format('123',
2611                     b'%lu', c_ulong(123))
2612        check_format('123',
2613                     b'%llu', c_ulonglong(123))
2614        check_format('123',
2615                     b'%zu', c_size_t(123))
2616
2617        # test long output
2618        min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1))
2619        max_longlong = -min_longlong - 1
2620        check_format(str(min_longlong),
2621                     b'%lld', c_longlong(min_longlong))
2622        check_format(str(max_longlong),
2623                     b'%lld', c_longlong(max_longlong))
2624        max_ulonglong = 2 ** (8 * sizeof(c_ulonglong)) - 1
2625        check_format(str(max_ulonglong),
2626                     b'%llu', c_ulonglong(max_ulonglong))
2627        PyUnicode_FromFormat(b'%p', c_void_p(-1))
2628
2629        # test padding (width and/or precision)
2630        check_format('123'.rjust(10, '0'),
2631                     b'%010i', c_int(123))
2632        check_format('123'.rjust(100),
2633                     b'%100i', c_int(123))
2634        check_format('123'.rjust(100, '0'),
2635                     b'%.100i', c_int(123))
2636        check_format('123'.rjust(80, '0').rjust(100),
2637                     b'%100.80i', c_int(123))
2638
2639        check_format('123'.rjust(10, '0'),
2640                     b'%010u', c_uint(123))
2641        check_format('123'.rjust(100),
2642                     b'%100u', c_uint(123))
2643        check_format('123'.rjust(100, '0'),
2644                     b'%.100u', c_uint(123))
2645        check_format('123'.rjust(80, '0').rjust(100),
2646                     b'%100.80u', c_uint(123))
2647
2648        check_format('123'.rjust(10, '0'),
2649                     b'%010x', c_int(0x123))
2650        check_format('123'.rjust(100),
2651                     b'%100x', c_int(0x123))
2652        check_format('123'.rjust(100, '0'),
2653                     b'%.100x', c_int(0x123))
2654        check_format('123'.rjust(80, '0').rjust(100),
2655                     b'%100.80x', c_int(0x123))
2656
2657        # test %A
2658        check_format(r"%A:'abc\xe9\uabcd\U0010ffff'",
2659                     b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')
2660
2661        # test %V
2662        check_format('repr=abc',
2663                     b'repr=%V', 'abc', b'xyz')
2664
2665        # Test string decode from parameter of %s using utf-8.
2666        # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of
2667        # '\u4eba\u6c11'
2668        check_format('repr=\u4eba\u6c11',
2669                     b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')
2670
2671        #Test replace error handler.
2672        check_format('repr=abc\ufffd',
2673                     b'repr=%V', None, b'abc\xff')
2674
2675        # not supported: copy the raw format string. these tests are just here
2676        # to check for crashes and should not be considered as specifications
2677        check_format('%s',
2678                     b'%1%s', b'abc')
2679        check_format('%1abc',
2680                     b'%1abc')
2681        check_format('%+i',
2682                     b'%+i', c_int(10))
2683        check_format('%.%s',
2684                     b'%.%s', b'abc')
2685
2686        # Issue #33817: empty strings
2687        check_format('',
2688                     b'')
2689        check_format('',
2690                     b'%s', b'')
2691
2692    # Test PyUnicode_AsWideChar()
2693    @support.cpython_only
2694    def test_aswidechar(self):
2695        from _testcapi import unicode_aswidechar
2696        support.import_module('ctypes')
2697        from ctypes import c_wchar, sizeof
2698
2699        wchar, size = unicode_aswidechar('abcdef', 2)
2700        self.assertEqual(size, 2)
2701        self.assertEqual(wchar, 'ab')
2702
2703        wchar, size = unicode_aswidechar('abc', 3)
2704        self.assertEqual(size, 3)
2705        self.assertEqual(wchar, 'abc')
2706
2707        wchar, size = unicode_aswidechar('abc', 4)
2708        self.assertEqual(size, 3)
2709        self.assertEqual(wchar, 'abc\0')
2710
2711        wchar, size = unicode_aswidechar('abc', 10)
2712        self.assertEqual(size, 3)
2713        self.assertEqual(wchar, 'abc\0')
2714
2715        wchar, size = unicode_aswidechar('abc\0def', 20)
2716        self.assertEqual(size, 7)
2717        self.assertEqual(wchar, 'abc\0def\0')
2718
2719        nonbmp = chr(0x10ffff)
2720        if sizeof(c_wchar) == 2:
2721            buflen = 3
2722            nchar = 2
2723        else: # sizeof(c_wchar) == 4
2724            buflen = 2
2725            nchar = 1
2726        wchar, size = unicode_aswidechar(nonbmp, buflen)
2727        self.assertEqual(size, nchar)
2728        self.assertEqual(wchar, nonbmp + '\0')
2729
2730    # Test PyUnicode_AsWideCharString()
2731    @support.cpython_only
2732    def test_aswidecharstring(self):
2733        from _testcapi import unicode_aswidecharstring
2734        support.import_module('ctypes')
2735        from ctypes import c_wchar, sizeof
2736
2737        wchar, size = unicode_aswidecharstring('abc')
2738        self.assertEqual(size, 3)
2739        self.assertEqual(wchar, 'abc\0')
2740
2741        wchar, size = unicode_aswidecharstring('abc\0def')
2742        self.assertEqual(size, 7)
2743        self.assertEqual(wchar, 'abc\0def\0')
2744
2745        nonbmp = chr(0x10ffff)
2746        if sizeof(c_wchar) == 2:
2747            nchar = 2
2748        else: # sizeof(c_wchar) == 4
2749            nchar = 1
2750        wchar, size = unicode_aswidecharstring(nonbmp)
2751        self.assertEqual(size, nchar)
2752        self.assertEqual(wchar, nonbmp + '\0')
2753
2754    # Test PyUnicode_AsUCS4()
2755    @support.cpython_only
2756    def test_asucs4(self):
2757        from _testcapi import unicode_asucs4
2758        for s in ['abc', '\xa1\xa2', '\u4f60\u597d', 'a\U0001f600',
2759                  'a\ud800b\udfffc', '\ud834\udd1e']:
2760            l = len(s)
2761            self.assertEqual(unicode_asucs4(s, l, 1), s+'\0')
2762            self.assertEqual(unicode_asucs4(s, l, 0), s+'\uffff')
2763            self.assertEqual(unicode_asucs4(s, l+1, 1), s+'\0\uffff')
2764            self.assertEqual(unicode_asucs4(s, l+1, 0), s+'\0\uffff')
2765            self.assertRaises(SystemError, unicode_asucs4, s, l-1, 1)
2766            self.assertRaises(SystemError, unicode_asucs4, s, l-2, 0)
2767            s = '\0'.join([s, s])
2768            self.assertEqual(unicode_asucs4(s, len(s), 1), s+'\0')
2769            self.assertEqual(unicode_asucs4(s, len(s), 0), s+'\uffff')
2770
2771    # Test PyUnicode_FindChar()
2772    @support.cpython_only
2773    def test_findchar(self):
2774        from _testcapi import unicode_findchar
2775
2776        for str in "\xa1", "\u8000\u8080", "\ud800\udc02", "\U0001f100\U0001f1f1":
2777            for i, ch in enumerate(str):
2778                self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), 1), i)
2779                self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), -1), i)
2780
2781        str = "!>_<!"
2782        self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), 1), -1)
2783        self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), -1), -1)
2784        # start < end
2785        self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, 1), 4)
2786        self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, -1), 4)
2787        # start >= end
2788        self.assertEqual(unicode_findchar(str, ord('!'), 0, 0, 1), -1)
2789        self.assertEqual(unicode_findchar(str, ord('!'), len(str), 0, 1), -1)
2790        # negative
2791        self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, 1), 0)
2792        self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, -1), 0)
2793
2794    # Test PyUnicode_CopyCharacters()
2795    @support.cpython_only
2796    def test_copycharacters(self):
2797        from _testcapi import unicode_copycharacters
2798
2799        strings = [
2800            'abcde', '\xa1\xa2\xa3\xa4\xa5',
2801            '\u4f60\u597d\u4e16\u754c\uff01',
2802            '\U0001f600\U0001f601\U0001f602\U0001f603\U0001f604'
2803        ]
2804
2805        for idx, from_ in enumerate(strings):
2806            # wide -> narrow: exceed maxchar limitation
2807            for to in strings[:idx]:
2808                self.assertRaises(
2809                    SystemError,
2810                    unicode_copycharacters, to, 0, from_, 0, 5
2811                )
2812            # same kind
2813            for from_start in range(5):
2814                self.assertEqual(
2815                    unicode_copycharacters(from_, 0, from_, from_start, 5),
2816                    (from_[from_start:from_start+5].ljust(5, '\0'),
2817                     5-from_start)
2818                )
2819            for to_start in range(5):
2820                self.assertEqual(
2821                    unicode_copycharacters(from_, to_start, from_, to_start, 5),
2822                    (from_[to_start:to_start+5].rjust(5, '\0'),
2823                     5-to_start)
2824                )
2825            # narrow -> wide
2826            # Tests omitted since this creates invalid strings.
2827
2828        s = strings[0]
2829        self.assertRaises(IndexError, unicode_copycharacters, s, 6, s, 0, 5)
2830        self.assertRaises(IndexError, unicode_copycharacters, s, -1, s, 0, 5)
2831        self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, 6, 5)
2832        self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, -1, 5)
2833        self.assertRaises(SystemError, unicode_copycharacters, s, 1, s, 0, 5)
2834        self.assertRaises(SystemError, unicode_copycharacters, s, 0, s, 0, -1)
2835        self.assertRaises(SystemError, unicode_copycharacters, s, 0, b'', 0, 0)
2836
2837    @support.cpython_only
2838    def test_encode_decimal(self):
2839        from _testcapi import unicode_encodedecimal
2840        self.assertEqual(unicode_encodedecimal('123'),
2841                         b'123')
2842        self.assertEqual(unicode_encodedecimal('\u0663.\u0661\u0664'),
2843                         b'3.14')
2844        self.assertEqual(unicode_encodedecimal("\N{EM SPACE}3.14\N{EN SPACE}"),
2845                         b' 3.14 ')
2846        self.assertRaises(UnicodeEncodeError,
2847                          unicode_encodedecimal, "123\u20ac", "strict")
2848        self.assertRaisesRegex(
2849            ValueError,
2850            "^'decimal' codec can't encode character",
2851            unicode_encodedecimal, "123\u20ac", "replace")
2852
2853    @support.cpython_only
2854    def test_transform_decimal(self):
2855        from _testcapi import unicode_transformdecimaltoascii as transform_decimal
2856        self.assertEqual(transform_decimal('123'),
2857                         '123')
2858        self.assertEqual(transform_decimal('\u0663.\u0661\u0664'),
2859                         '3.14')
2860        self.assertEqual(transform_decimal("\N{EM SPACE}3.14\N{EN SPACE}"),
2861                         "\N{EM SPACE}3.14\N{EN SPACE}")
2862        self.assertEqual(transform_decimal('123\u20ac'),
2863                         '123\u20ac')
2864
2865    @support.cpython_only
2866    def test_pep393_utf8_caching_bug(self):
2867        # Issue #25709: Problem with string concatenation and utf-8 cache
2868        from _testcapi import getargs_s_hash
2869        for k in 0x24, 0xa4, 0x20ac, 0x1f40d:
2870            s = ''
2871            for i in range(5):
2872                # Due to CPython specific optimization the 's' string can be
2873                # resized in-place.
2874                s += chr(k)
2875                # Parsing with the "s#" format code calls indirectly
2876                # PyUnicode_AsUTF8AndSize() which creates the UTF-8
2877                # encoded string cached in the Unicode object.
2878                self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
2879                # Check that the second call returns the same result
2880                self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
2881
class StringModuleTest(unittest.TestCase):
    """Tests for the formatter helpers exposed by the private _string module."""

    def test_formatter_parser(self):
        # Each parsed item is a 4-tuple:
        # (literal text, field name, format spec, conversion).
        def parse(format_string):
            return list(_string.formatter_parser(format_string))

        formatter = parse("prefix {2!s}xxx{0:^+10.3f}{obj.attr!s} {z[0]!s:10}")
        self.assertEqual(formatter, [
            ('prefix ', '2', '', 's'),
            ('xxx', '0', '^+10.3f', None),
            ('', 'obj.attr', '', 's'),
            (' ', 'z[0]', '10', 's'),
        ])

        # An auto-numbered field has an empty field name; trailing literal
        # text is reported with None for the three field entries.
        formatter = parse("prefix {} suffix")
        self.assertEqual(formatter, [
            ('prefix ', '', '', None),
            (' suffix', None, None, None),
        ])

        formatter = parse("str")
        self.assertEqual(formatter, [
            ('str', None, None, None),
        ])

        formatter = parse("")
        self.assertEqual(formatter, [])

        formatter = parse("{0}")
        self.assertEqual(formatter, [
            ('', '0', '', None),
        ])

        # Only str input is accepted.
        self.assertRaises(TypeError, _string.formatter_parser, 1)

    def test_formatter_field_name_split(self):
        # The result is (first, rest); rest is materialised into a list of
        # (is_attribute, name) pairs so it can be compared directly.
        def split(name):
            items = list(_string.formatter_field_name_split(name))
            items[1] = list(items[1])
            return items
        self.assertEqual(split("obj"), ["obj", []])
        self.assertEqual(split("obj.arg"), ["obj", [(True, 'arg')]])
        self.assertEqual(split("obj[key]"), ["obj", [(False, 'key')]])
        self.assertEqual(split("obj.arg[key1][key2]"), [
            "obj",
            [(True, 'arg'),
             (False, 'key1'),
             (False, 'key2'),
            ]])
        # Only str input is accepted.
        self.assertRaises(TypeError, _string.formatter_field_name_split, 1)


if __name__ == "__main__":
    # Run this module's tests when it is executed as a script.
    unittest.main()