• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Tests the attempted automatic coercion of the C locale to a UTF-8 locale
2
3import locale
4import os
5import subprocess
6import sys
7import sysconfig
8import unittest
9from collections import namedtuple
10
11from test import support
12from test.support.script_helper import run_python_until_end
13
14
# Set the list of ways we expect to be able to ask for the "C" locale
EXPECTED_C_LOCALE_EQUIVALENTS = ["C", "invalid.ascii"]

# Set our expectation for the default encoding used in the C locale
# for the filesystem encoding and the standard streams
EXPECTED_C_LOCALE_STREAM_ENCODING = "ascii"
EXPECTED_C_LOCALE_FS_ENCODING = "ascii"

# Set our expectation for the default locale used when none is specified
EXPECT_COERCION_IN_DEFAULT_LOCALE = True

# Candidate coercion targets.
# In order to get the warning messages to match up as expected, the candidate
# order here must match the target locale order in Python/pylifecycle.c
TARGET_LOCALES = ["C.UTF-8", "C.utf8", "UTF-8"]

# Immutable snapshot of the same candidates. It is derived from
# TARGET_LOCALES rather than spelled out a second time so that the two
# sequences cannot drift apart.
_C_UTF8_LOCALES = tuple(TARGET_LOCALES)

# Apply some platform dependent overrides
if sys.platform.startswith("linux"):
    if support.is_android:
        # Android defaults to using UTF-8 for all system interfaces
        EXPECTED_C_LOCALE_STREAM_ENCODING = "utf-8"
        EXPECTED_C_LOCALE_FS_ENCODING = "utf-8"
    else:
        # Linux distros typically alias the POSIX locale directly to the C
        # locale.
        # TODO: Once https://bugs.python.org/issue30672 is addressed, we'll be
        #       able to check this case unconditionally
        EXPECTED_C_LOCALE_EQUIVALENTS.append("POSIX")
elif sys.platform.startswith("aix"):
    # AIX uses iso8859-1 in the C locale, other *nix platforms use ASCII
    EXPECTED_C_LOCALE_STREAM_ENCODING = "iso8859-1"
    EXPECTED_C_LOCALE_FS_ENCODING = "iso8859-1"
elif sys.platform == "darwin":
    # FS encoding is UTF-8 on macOS
    EXPECTED_C_LOCALE_FS_ENCODING = "utf-8"
elif sys.platform == "cygwin":
    # Cygwin defaults to using C.UTF-8
    # TODO: Work out a robust dynamic test for this that doesn't rely on
    #       CPython's own locale handling machinery
    EXPECT_COERCION_IN_DEFAULT_LOCALE = False

# Note that the above expectations are still wrong in some cases, such as:
# * Windows when PYTHONLEGACYWINDOWSFSENCODING is set
# * Any platform other than AIX that uses latin-1 in the C locale
# * Any Linux distro where POSIX isn't a simple alias for the C locale
# * Any Linux distro where the default locale is something other than "C"
#
# Options for dealing with this:
# * Don't set the PY_COERCE_C_LOCALE preprocessor definition on
#   such platforms (e.g. it isn't set on Windows)
# * Fix the test expectations to match the actual platform behaviour
67
# Locale alias lists can't be queried reliably across platforms, so the
# only way to learn which candidate locales actually work is to attempt
# locale.setlocale() with each one. That probing happens in a subprocess,
# driven from setUpModule() below, so the test runner's own locale is left
# untouched.
#
# When the locale module exposes both nl_langinfo and CODESET, and the
# platform is not one where the lookup is expected to always succeed, the
# probe additionally verifies that `locale.nl_langinfo(locale.CODESET)`
# works: if it fails, the interpreter will skip locale coercion for that
# particular target locale.
_check_nl_langinfo_CODESET = (
    sys.platform not in ("darwin", "linux")
    and hasattr(locale, "nl_langinfo")
    and hasattr(locale, "CODESET")
)
82
def _set_locale_in_subprocess(locale_name):
    """Report whether a child interpreter can set LC_CTYPE to *locale_name*.

    A one-line script calling ``locale.setlocale(locale.LC_CTYPE, ...)`` is
    run through ``run_python_until_end`` with ``PYTHONCOERCECLOCALE=''``.
    When ``_check_nl_langinfo_CODESET`` is true, the script also exits with a
    non-zero status if ``locale.nl_langinfo(locale.CODESET)`` returns a
    falsy value.

    Returns True when the child exits with status 0, False otherwise.
    """
    # {!r} embeds the name as a properly quoted Python string literal, so a
    # name containing quotes or backslashes cannot break (or alter) the
    # generated command.
    cmd_fmt = "import locale; print(locale.setlocale(locale.LC_CTYPE, {!r}))"
    if _check_nl_langinfo_CODESET:
        # If there's no valid CODESET, we expect coercion to be skipped
        cmd_fmt += "; import sys; sys.exit(not locale.nl_langinfo(locale.CODESET))"
    cmd = cmd_fmt.format(locale_name)
    # Only the exit status matters; the command line that was run is unused.
    result, _ = run_python_until_end("-c", cmd, PYTHONCOERCECLOCALE='')
    return result.rc == 0
91
92
93
_fields = "fsencoding stdin_info stdout_info stderr_info lang lc_ctype lc_all"
_EncodingDetails = namedtuple("EncodingDetails", _fields)


class EncodingDetails(_EncodingDetails):
    """Encoding and locale-variable state of a child interpreter.

    Used both for the values a child process reports about itself and for
    the values a test expects it to report.
    """

    # XXX (ncoghlan): Using JSON for child state reporting may be less fragile
    # The script prints one line per namedtuple field, in field order.
    CHILD_PROCESS_SCRIPT = ";".join([
        "import sys, os",
        "print(sys.getfilesystemencoding())",
        "print(sys.stdin.encoding + ':' + sys.stdin.errors)",
        "print(sys.stdout.encoding + ':' + sys.stdout.errors)",
        "print(sys.stderr.encoding + ':' + sys.stderr.errors)",
        "print(os.environ.get('LANG', 'not set'))",
        "print(os.environ.get('LC_CTYPE', 'not set'))",
        "print(os.environ.get('LC_ALL', 'not set'))",
    ])

    @classmethod
    def get_expected_details(cls, coercion_expected, fs_encoding, stream_encoding, env_vars):
        """Return the details a child process is expected to report.

        Parameters:
            coercion_expected: if true, LC_CTYPE is expected to hold the
              module-level CLI_COERCION_TARGET instead of the value in
              env_vars
            fs_encoding: expected sys.getfilesystemencoding() result
            stream_encoding: expected encoding for the standard streams
            env_vars: mapping of environment variables passed to the child;
              LANG, LC_CTYPE and LC_ALL are read, with "not set" as the
              fallback for missing keys

        Returns a plain dict keyed by the EncodingDetails field names.
        """
        _stream = stream_encoding + ":{}"
        # stdin and stdout should use surrogateescape either because the
        # coercion triggered, or because the C locale was detected
        stream_info = 2*[_stream.format("surrogateescape")]
        # stderr should always use backslashreplace
        stream_info.append(_stream.format("backslashreplace"))
        expected_lang = env_vars.get("LANG", "not set")
        if coercion_expected:
            expected_lc_ctype = CLI_COERCION_TARGET
        else:
            expected_lc_ctype = env_vars.get("LC_CTYPE", "not set")
        expected_lc_all = env_vars.get("LC_ALL", "not set")
        env_info = expected_lang, expected_lc_ctype, expected_lc_all
        return dict(cls(fs_encoding, *stream_info, *env_info)._asdict())

    @classmethod
    def get_child_details(cls, env_vars):
        """Retrieves fsencoding and standard stream details from a child process

        Returns (encoding_details, stderr_lines):

        - encoding_details: dict keyed by the EncodingDetails field names,
          built from the lines the child printed on stdout
        - stderr_lines: result of calling splitlines() on the stderr output
          (after stripping trailing whitespace)

        The child runs CHILD_PROCESS_SCRIPT with ``-X utf8=0`` (UTF-8 mode
        explicitly disabled) and with env_vars passed through to
        run_python_until_end as keyword arguments. A non-zero exit status is
        reported via ``result.fail()``.
        """
        result, py_cmd = run_python_until_end(
            "-X", "utf8=0", "-c", cls.CHILD_PROCESS_SCRIPT,
            **env_vars
        )
        if result.rc != 0:
            result.fail(py_cmd)
        # All subprocess outputs in this test case should be pure ASCII
        stdout_lines = result.out.decode("ascii").splitlines()
        child_encoding_details = dict(cls(*stdout_lines)._asdict())
        stderr_lines = result.err.decode("ascii").rstrip().splitlines()
        return child_encoding_details, stderr_lines
151
152
# Text of the shared library warning emitted at runtime
LEGACY_LOCALE_WARNING = (
    "Python runtime initialized with LC_CTYPE=C "
    "(a locale with default ASCII encoding), "
    "which may cause Unicode compatibility problems. "
    "Using C.UTF-8, C.utf8, or UTF-8 (if available) "
    "as alternative Unicode-compatible locales is recommended."
)

# Template for the CLI locale coercion warning emitted at runtime; the
# placeholder receives the name of the locale that was coerced to
CLI_COERCION_WARNING_FMT = (
    "Python detected LC_CTYPE=C: LC_CTYPE coerced to {} "
    "(set another locale or PYTHONCOERCECLOCALE=0 "
    "to disable this locale coercion behavior)."
)


# Module state filled in by setUpModule(); None means "not initialised yet"
AVAILABLE_TARGETS = None
CLI_COERCION_TARGET = None
CLI_COERCION_WARNING = None
171
def setUpModule():
    """Work out which coercion target locales this system supports.

    Fills in the module globals AVAILABLE_TARGETS, CLI_COERCION_TARGET and
    CLI_COERCION_WARNING. Calling it again after AVAILABLE_TARGETS has been
    populated is a no-op.
    """
    global AVAILABLE_TARGETS, CLI_COERCION_TARGET, CLI_COERCION_WARNING

    if AVAILABLE_TARGETS is not None:
        # Already initialised by an earlier call
        return

    # Probe each candidate in order; the list is filled as the probes run
    AVAILABLE_TARGETS = []
    AVAILABLE_TARGETS.extend(
        candidate
        for candidate in _C_UTF8_LOCALES
        if _set_locale_in_subprocess(candidate)
    )

    if AVAILABLE_TARGETS:
        # Coercion is expected to pick the first candidate that works
        CLI_COERCION_TARGET = AVAILABLE_TARGETS[0]
        CLI_COERCION_WARNING = CLI_COERCION_WARNING_FMT.format(CLI_COERCION_TARGET)

    if support.verbose:
        report_lines = (
            f"AVAILABLE_TARGETS = {AVAILABLE_TARGETS!r}",
            f"EXPECTED_C_LOCALE_EQUIVALENTS = {EXPECTED_C_LOCALE_EQUIVALENTS!r}",
            f"EXPECTED_C_LOCALE_STREAM_ENCODING = {EXPECTED_C_LOCALE_STREAM_ENCODING!r}",
            f"EXPECTED_C_LOCALE_FS_ENCODING = {EXPECTED_C_LOCALE_FS_ENCODING!r}",
            f"EXPECT_COERCION_IN_DEFAULT_LOCALE = {EXPECT_COERCION_IN_DEFAULT_LOCALE!r}",
            f"_C_UTF8_LOCALES = {_C_UTF8_LOCALES!r}",
            f"_check_nl_langinfo_CODESET = {_check_nl_langinfo_CODESET!r}",
        )
        for report_line in report_lines:
            print(report_line)
200
201
class _LocaleHandlingTestCase(unittest.TestCase):
    # Shared helper base class for the locale handling checks below

    def _check_child_encoding_details(self,
                                      env_vars,
                                      expected_fs_encoding,
                                      expected_stream_encoding,
                                      expected_warnings,
                                      coercion_expected):
        """Run a child interpreter and compare its report with expectations

        Parameters:
            env_vars: environment variables for the child process
            expected_fs_encoding: expected sys.getfilesystemencoding() result
            expected_stream_encoding: expected encoding for standard streams
            expected_warnings: list of stderr lines to expect; None means
              that stderr is expected to be empty
            coercion_expected: whether LC_CTYPE is expected to be coerced
        """
        actual_details, actual_stderr = EncodingDetails.get_child_details(env_vars)
        wanted_details = EncodingDetails.get_expected_details(
            coercion_expected,
            expected_fs_encoding,
            expected_stream_encoding,
            env_vars,
        )
        self.assertEqual(actual_details, wanted_details)
        wanted_stderr = [] if expected_warnings is None else expected_warnings
        self.assertEqual(actual_stderr, wanted_stderr)
230
231
class LocaleConfigurationTests(_LocaleHandlingTestCase):
    # Checks for locales configured explicitly through the environment

    @classmethod
    def setUpClass(cls):
        # AVAILABLE_TARGETS is only populated once setUpModule() has run,
        # which is too late for a @unittest.skipUnless decorator
        if not AVAILABLE_TARGETS:
            raise unittest.SkipTest("No C-with-UTF-8 locale available")

    def test_external_target_locale_configuration(self):
        # A target locale requested explicitly should behave exactly like
        # one reached through implicit coercion
        self.maxDiff = None

        utf8 = "utf-8"
        clean_env = {
            "LANG": "",
            "LC_CTYPE": "",
            "LC_ALL": "",
            "PYTHONCOERCECLOCALE": "",
        }
        for env_var in ("LANG", "LC_CTYPE"):
            for locale_to_set in AVAILABLE_TARGETS:
                # XXX (ncoghlan): LANG=UTF-8 doesn't appear to work as
                #                 expected, so skip that combination for now
                # See https://bugs.python.org/issue30672 for discussion
                if (env_var, locale_to_set) == ("LANG", "UTF-8"):
                    continue

                with self.subTest(env_var=env_var,
                                  configured_locale=locale_to_set):
                    child_env = {**clean_env, env_var: locale_to_set}
                    self._check_child_encoding_details(
                        child_env,
                        utf8,
                        utf8,
                        expected_warnings=None,
                        coercion_expected=False,
                    )
274
275
276
@support.cpython_only
@unittest.skipUnless(sysconfig.get_config_var("PY_COERCE_C_LOCALE"),
                     "C locale coercion disabled at build time")
class LocaleCoercionTests(_LocaleHandlingTestCase):
    # Checks for the implicit environment changes made during CLI startup

    def _check_c_locale_coercion(self,
                                 fs_encoding, stream_encoding,
                                 coerce_c_locale,
                                 expected_warnings=None,
                                 coercion_expected=True,
                                 **extra_vars):
        """Exercise C locale handling across several child environments

        The child is checked once with all locale variables empty, and then
        once per combination of EXPECTED_C_LOCALE_EQUIVALENTS entry and
        LANG/LC_CTYPE variable.

        Parameters:
            fs_encoding: expected sys.getfilesystemencoding() result
            stream_encoding: expected encoding for standard streams
            coerce_c_locale: setting to use for PYTHONCOERCECLOCALE
              None: leave the variable at its empty default
              str: the value set in the child's environment
            expected_warnings: expected warning lines on stderr
            coercion_expected: whether LC_CTYPE is expected to be coerced
            extra_vars: additional environment variables to set in subprocess
        """
        self.maxDiff = None

        if not AVAILABLE_TARGETS:
            # With no usable target locale, coercion cannot happen at all
            fs_encoding = EXPECTED_C_LOCALE_FS_ENCODING
            stream_encoding = EXPECTED_C_LOCALE_STREAM_ENCODING
            coercion_expected = False
            if expected_warnings:
                expected_warnings = [LEGACY_LOCALE_WARNING]

        child_env = {
            "LANG": "",
            "LC_CTYPE": "",
            "LC_ALL": "",
            "PYTHONCOERCECLOCALE": "",
            **extra_vars,
        }
        if coerce_c_locale is not None:
            child_env["PYTHONCOERCECLOCALE"] = coerce_c_locale

        # First pass: the default locale (no locale variable overridden)
        with self.subTest(default_locale=True,
                          PYTHONCOERCECLOCALE=coerce_c_locale):
            if EXPECT_COERCION_IN_DEFAULT_LOCALE:
                default_warnings = expected_warnings
                default_coercion = coercion_expected
            else:
                default_warnings = None
                default_coercion = False
            # On Android CLI_COERCION_WARNING is not printed when all the
            # locale environment variables are undefined or empty. When
            # this code path is run with environ['LC_ALL'] == 'C', then
            # LEGACY_LOCALE_WARNING is printed.
            android_silences_warning = (
                support.is_android
                and default_warnings == [CLI_COERCION_WARNING]
            )
            if android_silences_warning:
                default_warnings = None
            self._check_child_encoding_details(child_env,
                                               fs_encoding,
                                               stream_encoding,
                                               default_warnings,
                                               default_coercion)

        # Second pass: each C locale equivalent, set via LANG and LC_CTYPE
        for locale_to_set in EXPECTED_C_LOCALE_EQUIVALENTS:
            for env_var in ("LANG", "LC_CTYPE"):
                with self.subTest(env_var=env_var,
                                  nominal_locale=locale_to_set,
                                  PYTHONCOERCECLOCALE=coerce_c_locale):
                    explicit_env = {**child_env, env_var: locale_to_set}
                    self._check_child_encoding_details(explicit_env,
                                                       fs_encoding,
                                                       stream_encoding,
                                                       expected_warnings,
                                                       coercion_expected)

    def test_PYTHONCOERCECLOCALE_not_set(self):
        # By default, coercion should pick the first available target locale
        self._check_c_locale_coercion("utf-8", "utf-8", coerce_c_locale=None)

    def test_PYTHONCOERCECLOCALE_not_zero(self):
        # Every string except "0" counts as "set" here, so each of these
        # values should leave locale coercion enabled
        for setting in ("", "1", "true", "false"):
            self._check_c_locale_coercion("utf-8", "utf-8", coerce_c_locale=setting)

    def test_PYTHONCOERCECLOCALE_set_to_warn(self):
        # PYTHONCOERCECLOCALE=warn turns on runtime warnings for legacy locales
        self._check_c_locale_coercion("utf-8", "utf-8",
                                      coerce_c_locale="warn",
                                      expected_warnings=[CLI_COERCION_WARNING])

    def test_PYTHONCOERCECLOCALE_set_to_zero(self):
        # "0" should disable locale coercion, and additionally setting
        # LC_ALL=C shouldn't make any difference to that behaviour
        for extra_env in ({}, {"LC_ALL": "C"}):
            self._check_c_locale_coercion(EXPECTED_C_LOCALE_FS_ENCODING,
                                          EXPECTED_C_LOCALE_STREAM_ENCODING,
                                          coerce_c_locale="0",
                                          coercion_expected=False,
                                          **extra_env)

    def test_LC_ALL_set_to_C(self):
        # Setting LC_ALL should render the locale coercion ineffective, and
        # with "warn" it should also produce the legacy locale warning
        scenarios = (
            (None, None),
            ("warn", [LEGACY_LOCALE_WARNING]),
        )
        for coerce_setting, warnings_wanted in scenarios:
            self._check_c_locale_coercion(EXPECTED_C_LOCALE_FS_ENCODING,
                                          EXPECTED_C_LOCALE_STREAM_ENCODING,
                                          coerce_c_locale=coerce_setting,
                                          LC_ALL="C",
                                          expected_warnings=warnings_wanted,
                                          coercion_expected=False)

    def test_PYTHONCOERCECLOCALE_set_to_one(self):
        # Remember the current LC_CTYPE locale and restore it afterwards
        old_loc = locale.setlocale(locale.LC_CTYPE, None)
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, old_loc)
        # Skip when the environment's LC_CTYPE locale is C or a coerced one
        loc = locale.setlocale(locale.LC_CTYPE, "")
        if loc == "C":
            self.skipTest("test requires LC_CTYPE locale different than C")
        if loc in TARGET_LOCALES:
            self.skipTest("coerced LC_CTYPE locale: %s" % loc)

        # bpo-35336: PYTHONCOERCECLOCALE=1 must not coerce the LC_CTYPE locale
        # if it's not equal to "C"
        code = 'import locale; print(locale.setlocale(locale.LC_CTYPE, None))'
        child_env = {**os.environ, "PYTHONCOERCECLOCALE": '1'}
        completed = subprocess.run([sys.executable, '-c', code],
                                   stdout=subprocess.PIPE,
                                   env=child_env,
                                   text=True)
        self.assertEqual(completed.stdout.rstrip(), loc)
421
422
def test_main():
    """Run both test classes, then call support.reap_children()."""
    test_classes = (LocaleConfigurationTests, LocaleCoercionTests)
    support.run_unittest(*test_classes)
    support.reap_children()


if __name__ == "__main__":
    test_main()
432