• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Tests the attempted automatic coercion of the C locale to a UTF-8 locale
2
3import locale
4import os
5import subprocess
6import sys
7import sysconfig
8import unittest
9from collections import namedtuple
10
11from test import support
12from test.support.script_helper import run_python_until_end
13
14
# Locale names that are expected to behave like a request for the "C" locale
EXPECTED_C_LOCALE_EQUIVALENTS = ["C", "invalid.ascii"]

# Encodings expected by default in the C locale, for the standard streams
# and for the filesystem respectively
EXPECTED_C_LOCALE_STREAM_ENCODING = "ascii"
EXPECTED_C_LOCALE_FS_ENCODING = "ascii"

# Whether coercion is expected when no locale is specified at all
EXPECT_COERCION_IN_DEFAULT_LOCALE = True

TARGET_LOCALES = ["C.UTF-8", "C.utf8", "UTF-8"]

# Platform dependent adjustments to the expectations above
if sys.platform in ("android", "vxworks"):
    # Android and VxWorks default to using UTF-8 for all system interfaces
    EXPECTED_C_LOCALE_STREAM_ENCODING = EXPECTED_C_LOCALE_FS_ENCODING = "utf-8"
elif sys.platform.startswith("linux"):
    # Linux distros typically alias the POSIX locale directly to the C
    # locale.
    # TODO: Once https://bugs.python.org/issue30672 is addressed, we'll be
    #       able to check this case unconditionally
    EXPECTED_C_LOCALE_EQUIVALENTS += ["POSIX"]
elif sys.platform.startswith("aix"):
    # AIX uses iso8859-1 in the C locale, other *nix platforms use ASCII
    EXPECTED_C_LOCALE_STREAM_ENCODING = EXPECTED_C_LOCALE_FS_ENCODING = "iso8859-1"
elif sys.platform == "darwin":
    # Only the FS encoding differs on macOS: it is UTF-8
    EXPECTED_C_LOCALE_FS_ENCODING = "utf-8"
elif sys.platform == "cygwin":
    # Cygwin defaults to using C.UTF-8
    # TODO: Work out a robust dynamic test for this that doesn't rely on
    #       CPython's own locale handling machinery
    EXPECT_COERCION_IN_DEFAULT_LOCALE = False
55
56# Note that the above expectations are still wrong in some cases, such as:
57# * Windows when PYTHONLEGACYWINDOWSFSENCODING is set
58# * Any platform other than AIX that uses latin-1 in the C locale
59# * Any Linux distro where POSIX isn't a simple alias for the C locale
60# * Any Linux distro where the default locale is something other than "C"
61#
62# Options for dealing with this:
63# * Don't set the PY_COERCE_C_LOCALE preprocessor definition on
64#   such platforms (e.g. it isn't set on Windows)
65# * Fix the test expectations to match the actual platform behaviour
66
# The candidate order here must match the target locale order in
# Python/pylifecycle.c, otherwise the warning messages won't line up with
# what the tests expect
_C_UTF8_LOCALES = ("C.UTF-8", "C.utf8", "UTF-8")

# Locale alias lists can't be checked reliably across platforms, so the
# only way to find out which of these locales work is to try each one with
# locale.setlocale(). That is done in a subprocess from setUpModule() below
# so the locale of the test runner is left alone.
#
# Where the relevant locale module attributes exist, and the platform isn't
# one where the call is expected to always succeed, we also check that
# `locale.nl_langinfo(locale.CODESET)` works: if it fails, the interpreter
# will skip locale coercion for that particular target locale
_check_nl_langinfo_CODESET = (
    sys.platform not in ("darwin", "linux")
    and hasattr(locale, "nl_langinfo")
    and hasattr(locale, "CODESET")
)
85
def _set_locale_in_subprocess(locale_name):
    """Report whether a child interpreter can set LC_CTYPE to *locale_name*

    The attempt is made in a subprocess, with PYTHONCOERCECLOCALE set to an
    empty string in the child's environment. When
    ``_check_nl_langinfo_CODESET`` is true, the child additionally exits
    with a failure status if ``locale.nl_langinfo(locale.CODESET)`` returns
    a false value.

    Returns True when the child exits with status 0, False otherwise.
    """
    statements = [
        "import locale",
        "print(locale.setlocale(locale.LC_CTYPE, '{}'))",
    ]
    if _check_nl_langinfo_CODESET:
        # If there's no valid CODESET, we expect coercion to be skipped
        statements += [
            "import sys",
            "sys.exit(not locale.nl_langinfo(locale.CODESET))",
        ]
    script = "; ".join(statements).format(locale_name)
    child, _ = run_python_until_end("-c", script, PYTHONCOERCECLOCALE='')
    return child.rc == 0
94
95
96
_fields = "fsencoding stdin_info stdout_info stderr_info lang lc_ctype lc_all"
_EncodingDetails = namedtuple("EncodingDetails", _fields)


class EncodingDetails(_EncodingDetails):
    # XXX (ncoghlan): Using JSON for child state reporting may be less fragile
    # The child prints one line per field, in the same order as _fields
    CHILD_PROCESS_SCRIPT = (
        "import sys, os;"
        "print(sys.getfilesystemencoding());"
        "print(sys.stdin.encoding + ':' + sys.stdin.errors);"
        "print(sys.stdout.encoding + ':' + sys.stdout.errors);"
        "print(sys.stderr.encoding + ':' + sys.stderr.errors);"
        "print(os.environ.get('LANG', 'not set'));"
        "print(os.environ.get('LC_CTYPE', 'not set'));"
        "print(os.environ.get('LC_ALL', 'not set'))"
    )

    @classmethod
    def get_expected_details(cls, coercion_expected, fs_encoding, stream_encoding, stream_errors, env_vars):
        """Build the details a child process is expected to report

        Parameters:
            coercion_expected: if true, LC_CTYPE is expected to have been
              replaced with CLI_COERCION_TARGET; otherwise it is looked up
              in env_vars like LANG and LC_ALL
            fs_encoding: expected sys.getfilesystemencoding() result
            stream_encoding: expected encoding of the standard streams
            stream_errors: expected error handler for stdin and stdout
              (None means "surrogateescape")
            env_vars: mapping of the environment variables passed to the
              child; missing entries are reported as "not set"

        Returns a plain dict keyed by the EncodingDetails field names.
        """
        if stream_errors is None:
            # stdin and stdout should use surrogateescape either because the
            # coercion triggered, or because the C locale was detected
            stream_errors = "surrogateescape"

        template = stream_encoding + ":{}"
        stdin_info = stdout_info = template.format(stream_errors)
        # stderr should always use backslashreplace
        stderr_info = template.format("backslashreplace")

        if coercion_expected:
            lc_ctype = CLI_COERCION_TARGET
        else:
            lc_ctype = env_vars.get("LC_CTYPE", "not set")

        details = cls(
            fsencoding=fs_encoding,
            stdin_info=stdin_info,
            stdout_info=stdout_info,
            stderr_info=stderr_info,
            lang=env_vars.get("LANG", "not set"),
            lc_ctype=lc_ctype,
            lc_all=env_vars.get("LC_ALL", "not set"),
        )
        return dict(details._asdict())

    @classmethod
    def get_child_details(cls, env_vars):
        """Retrieves fsencoding and standard stream details from a child process

        The child runs CHILD_PROCESS_SCRIPT with "-X utf8=0" and with env_vars
        passed on as keyword arguments to run_python_until_end(). If it exits
        with a non-zero status, result.fail() is called with the command.

        Returns (encoding_details, stderr_lines):

        - encoding_details: dict keyed by the EncodingDetails field names,
          built from the lines the child wrote to stdout
        - stderr_lines: result of calling splitlines() on the stderr output
          after stripping trailing whitespace
        """
        result, py_cmd = run_python_until_end(
            "-X", "utf8=0", "-c", cls.CHILD_PROCESS_SCRIPT, **env_vars)
        if result.rc != 0:
            result.fail(py_cmd)
        # All subprocess outputs in this test case should be pure ASCII
        reported = cls(*result.out.decode("ascii").splitlines())
        stderr_lines = result.err.decode("ascii").rstrip().splitlines()
        return dict(reported._asdict()), stderr_lines
158
159
# Text of the shared library warning emitted at runtime
LEGACY_LOCALE_WARNING = (
    "Python runtime initialized with LC_CTYPE=C "
    "(a locale with default ASCII encoding), "
    "which may cause Unicode compatibility problems. "
    "Using C.UTF-8, C.utf8, or UTF-8 (if available) "
    "as alternative Unicode-compatible locales is recommended."
)

# Text of the CLI locale coercion warning emitted at runtime; the
# placeholder is filled in with the name of the locale coerced to
CLI_COERCION_WARNING_FMT = (
    "Python detected LC_CTYPE=C: LC_CTYPE coerced to {} "
    "(set another locale or PYTHONCOERCECLOCALE=0 "
    "to disable this locale coercion behavior)."
)
173
174
# Filled in by setUpModule(); None means "not initialized yet"
AVAILABLE_TARGETS = None
CLI_COERCION_TARGET = None
CLI_COERCION_WARNING = None


def setUpModule():
    """Probe the target locales and record the coercion expectations

    Populates AVAILABLE_TARGETS with every entry of _C_UTF8_LOCALES that
    _set_locale_in_subprocess() accepts. When at least one is available,
    CLI_COERCION_TARGET is set to the first of them and CLI_COERCION_WARNING
    to the matching warning text. Does nothing if AVAILABLE_TARGETS has
    already been initialized.
    """
    global AVAILABLE_TARGETS, CLI_COERCION_TARGET, CLI_COERCION_WARNING

    if AVAILABLE_TARGETS is not None:
        # A previous call already did the work
        return

    # Find the target locales available in the current system
    AVAILABLE_TARGETS = []
    for candidate in _C_UTF8_LOCALES:
        if not _set_locale_in_subprocess(candidate):
            continue
        AVAILABLE_TARGETS.append(candidate)

    if AVAILABLE_TARGETS:
        # Coercion is expected to use the first available target locale
        CLI_COERCION_TARGET = AVAILABLE_TARGETS[0]
        CLI_COERCION_WARNING = CLI_COERCION_WARNING_FMT.format(CLI_COERCION_TARGET)

    if support.verbose:
        settings = (
            ("AVAILABLE_TARGETS", AVAILABLE_TARGETS),
            ("EXPECTED_C_LOCALE_EQUIVALENTS", EXPECTED_C_LOCALE_EQUIVALENTS),
            ("EXPECTED_C_LOCALE_STREAM_ENCODING", EXPECTED_C_LOCALE_STREAM_ENCODING),
            ("EXPECTED_C_LOCALE_FS_ENCODING", EXPECTED_C_LOCALE_FS_ENCODING),
            ("EXPECT_COERCION_IN_DEFAULT_LOCALE", EXPECT_COERCION_IN_DEFAULT_LOCALE),
            ("_C_UTF8_LOCALES", _C_UTF8_LOCALES),
            ("_check_nl_langinfo_CODESET", _check_nl_langinfo_CODESET),
        )
        for name, value in settings:
            print(f"{name} = {value!r}")
207
208
class _LocaleHandlingTestCase(unittest.TestCase):
    # Shared machinery for checking the expected locale handling behaviour

    def _check_child_encoding_details(self,
                                      env_vars,
                                      expected_fs_encoding,
                                      expected_stream_encoding,
                                      expected_stream_errors,
                                      expected_warnings,
                                      coercion_expected):
        """Check the C locale handling for the given process environment

        Parameters:
            env_vars: environment variables to set in the child process
            expected_fs_encoding: expected sys.getfilesystemencoding() result
            expected_stream_encoding: expected encoding for standard streams
            expected_stream_errors: expected error handler for stdin and
              stdout, passed through to EncodingDetails.get_expected_details()
            expected_warnings: list of stderr lines to expect (None means
              that no stderr output is expected)
            coercion_expected: whether LC_CTYPE is expected to be coerced
        """
        actual_details, stderr_lines = EncodingDetails.get_child_details(env_vars)
        wanted_details = EncodingDetails.get_expected_details(
            coercion_expected,
            expected_fs_encoding,
            expected_stream_encoding,
            expected_stream_errors,
            env_vars,
        )
        self.assertEqual(actual_details, wanted_details)
        wanted_stderr = [] if expected_warnings is None else expected_warnings
        self.assertEqual(stderr_lines, wanted_stderr)
239
240
class LocaleConfigurationTests(_LocaleHandlingTestCase):
    # Test explicit external configuration via the process environment

    @classmethod
    def setUpClass(cls):
        # AVAILABLE_TARGETS is only populated once setUpModule() has run,
        # so this can't be handled via the @unittest.skipUnless decorator
        if AVAILABLE_TARGETS:
            return
        raise unittest.SkipTest("No C-with-UTF-8 locale available")

    def _check_explicit_target_locales(self, ioencoding, expected_stream_errors):
        """Check every available target locale when it is set explicitly

        Each locale in AVAILABLE_TARGETS is set through LANG and through
        LC_CTYPE in turn, and the child is expected to report "utf-8" for
        both the filesystem and stream encodings, with no warnings and no
        coercion.

        Parameters:
            ioencoding: value of PYTHONIOENCODING in the child's environment
            expected_stream_errors: expected error handler for stdin and
              stdout, as accepted by _check_child_encoding_details()
        """
        self.maxDiff = None

        env_template = {
            "LANG": "",
            "LC_CTYPE": "",
            "LC_ALL": "",
            "PYTHONCOERCECLOCALE": "",
            "PYTHONIOENCODING": ioencoding,
        }
        for env_var in ("LANG", "LC_CTYPE"):
            for locale_to_set in AVAILABLE_TARGETS:
                # XXX (ncoghlan): LANG=UTF-8 doesn't appear to work as
                #                 expected, so skip that combination for now
                # See https://bugs.python.org/issue30672 for discussion
                if (env_var, locale_to_set) == ("LANG", "UTF-8"):
                    continue

                with self.subTest(env_var=env_var,
                                  configured_locale=locale_to_set):
                    child_env = {**env_template, env_var: locale_to_set}
                    self._check_child_encoding_details(
                        child_env,
                        "utf-8",
                        "utf-8",
                        expected_stream_errors=expected_stream_errors,
                        expected_warnings=None,
                        coercion_expected=False)

    def test_external_target_locale_configuration(self):
        # Explicitly setting a target locale should give the same behaviour as
        # is seen when implicitly coercing to that target locale
        self._check_explicit_target_locales(ioencoding="",
                                            expected_stream_errors=None)

    def test_with_ioencoding(self):
        # Same configurations as above, but with PYTHONIOENCODING=UTF-8 also
        # set: stdin and stdout are then expected to use the "strict" handler
        self._check_explicit_target_locales(ioencoding="UTF-8",
                                            expected_stream_errors="strict")
319
@support.cpython_only
@unittest.skipUnless(sysconfig.get_config_var("PY_COERCE_C_LOCALE"),
                     "C locale coercion disabled at build time")
class LocaleCoercionTests(_LocaleHandlingTestCase):
    # Test implicit reconfiguration of the environment during CLI startup

    def _check_c_locale_coercion(self,
                                 fs_encoding, stream_encoding,
                                 coerce_c_locale,
                                 expected_warnings=None,
                                 coercion_expected=True,
                                 **extra_vars):
        """Check the C locale handling for various configurations

        The child is first run with every locale variable empty (the default
        locale), then once per combination of LANG/LC_CTYPE and each entry
        of EXPECTED_C_LOCALE_EQUIVALENTS.

        Parameters:
            fs_encoding: expected sys.getfilesystemencoding() result
            stream_encoding: expected encoding for standard streams
            coerce_c_locale: setting to use for PYTHONCOERCECLOCALE
              None: don't set the variable at all
              str: the value set in the child's environment
            expected_warnings: expected warning lines on stderr
            coercion_expected: whether LC_CTYPE is expected to be coerced
            extra_vars: additional environment variables to set in subprocess
        """
        self.maxDiff = None

        if not AVAILABLE_TARGETS:
            # Locale coercion is disabled when there aren't any target
            # locales, so fall back to the plain C locale expectations
            fs_encoding = EXPECTED_C_LOCALE_FS_ENCODING
            stream_encoding = EXPECTED_C_LOCALE_STREAM_ENCODING
            coercion_expected = False
            if expected_warnings:
                expected_warnings = [LEGACY_LOCALE_WARNING]

        child_env = {
            "LANG": "",
            "LC_CTYPE": "",
            "LC_ALL": "",
            "PYTHONCOERCECLOCALE": "",
            "PYTHONIOENCODING": "",
            **extra_vars,
        }
        if coerce_c_locale is not None:
            child_env["PYTHONCOERCECLOCALE"] = coerce_c_locale

        # Check behaviour for the default locale
        with self.subTest(default_locale=True,
                          PYTHONCOERCECLOCALE=coerce_c_locale):
            if EXPECT_COERCION_IN_DEFAULT_LOCALE:
                default_warnings = expected_warnings
                default_coercion = coercion_expected
            else:
                default_warnings = None
                default_coercion = False
            # On Android CLI_COERCION_WARNING is not printed when all the
            # locale environment variables are undefined or empty. When
            # this code path is run with environ['LC_ALL'] == 'C', then
            # LEGACY_LOCALE_WARNING is printed.
            if (support.is_android and
                    default_warnings == [CLI_COERCION_WARNING]):
                default_warnings = None
            self._check_child_encoding_details(child_env,
                                               fs_encoding,
                                               stream_encoding,
                                               None,
                                               default_warnings,
                                               default_coercion)

        # Check behaviour for explicitly configured locales
        for c_locale_name in EXPECTED_C_LOCALE_EQUIVALENTS:
            for env_var in ("LANG", "LC_CTYPE"):
                with self.subTest(env_var=env_var,
                                  nominal_locale=c_locale_name,
                                  PYTHONCOERCECLOCALE=coerce_c_locale,
                                  PYTHONIOENCODING=""):
                    configured_env = {**child_env, env_var: c_locale_name}
                    # Check behaviour on successful coercion
                    self._check_child_encoding_details(configured_env,
                                                       fs_encoding,
                                                       stream_encoding,
                                                       None,
                                                       expected_warnings,
                                                       coercion_expected)

    def test_PYTHONCOERCECLOCALE_not_set(self):
        # This should coerce to the first available target locale by default
        self._check_c_locale_coercion("utf-8", "utf-8", coerce_c_locale=None)

    def test_PYTHONCOERCECLOCALE_not_zero(self):
        # *Any* string other than "0" is considered "set" for our purposes
        # and hence should result in the locale coercion being enabled
        for setting in ("", "1", "true", "false"):
            self._check_c_locale_coercion("utf-8", "utf-8",
                                          coerce_c_locale=setting)

    def test_PYTHONCOERCECLOCALE_set_to_warn(self):
        # PYTHONCOERCECLOCALE=warn enables runtime warnings for legacy locales
        self._check_c_locale_coercion("utf-8", "utf-8",
                                      coerce_c_locale="warn",
                                      expected_warnings=[CLI_COERCION_WARNING])

    def test_PYTHONCOERCECLOCALE_set_to_zero(self):
        c_locale_encodings = (EXPECTED_C_LOCALE_FS_ENCODING,
                              EXPECTED_C_LOCALE_STREAM_ENCODING)
        # The setting "0" should result in the locale coercion being disabled
        self._check_c_locale_coercion(*c_locale_encodings,
                                      coerce_c_locale="0",
                                      coercion_expected=False)
        # Setting LC_ALL=C shouldn't make any difference to the behaviour
        self._check_c_locale_coercion(*c_locale_encodings,
                                      coerce_c_locale="0",
                                      LC_ALL="C",
                                      coercion_expected=False)

    def test_LC_ALL_set_to_C(self):
        c_locale_encodings = (EXPECTED_C_LOCALE_FS_ENCODING,
                              EXPECTED_C_LOCALE_STREAM_ENCODING)
        # Setting LC_ALL should render the locale coercion ineffective
        self._check_c_locale_coercion(*c_locale_encodings,
                                      coerce_c_locale=None,
                                      LC_ALL="C",
                                      coercion_expected=False)
        # And result in a warning about a lack of locale compatibility
        self._check_c_locale_coercion(*c_locale_encodings,
                                      coerce_c_locale="warn",
                                      LC_ALL="C",
                                      expected_warnings=[LEGACY_LOCALE_WARNING],
                                      coercion_expected=False)

    def test_PYTHONCOERCECLOCALE_set_to_one(self):
        # skip the test if the LC_CTYPE locale is C or coerced
        saved_locale = locale.setlocale(locale.LC_CTYPE, None)
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved_locale)
        try:
            loc = locale.setlocale(locale.LC_CTYPE, "")
        except locale.Error as exc:
            self.skipTest(str(exc))
        if loc == "C":
            self.skipTest("test requires LC_CTYPE locale different than C")
        if loc in TARGET_LOCALES:
            self.skipTest(f"coerced LC_CTYPE locale: {loc}")

        # bpo-35336: PYTHONCOERCECLOCALE=1 must not coerce the LC_CTYPE locale
        # if it's not equal to "C"
        child_env = {**os.environ, "PYTHONCOERCECLOCALE": "1"}
        query = 'import locale; print(locale.setlocale(locale.LC_CTYPE, None))'
        proc = subprocess.run([sys.executable, '-c', query],
                              stdout=subprocess.PIPE,
                              env=child_env,
                              text=True)
        self.assertEqual(proc.stdout.rstrip(), loc)
471
472
def tearDownModule():
    """Call support.reap_children() once the module's tests have finished."""
    support.reap_children()


# Allow the file to be run directly as a script
if __name__ == "__main__":
    unittest.main()
479