# Tests the attempted automatic coercion of the C locale to a UTF-8 locale

import locale
import os
import subprocess
import sys
import sysconfig
import unittest
from collections import namedtuple

from test import support
from test.support.script_helper import run_python_until_end


# Set the list of ways we expect to be able to ask for the "C" locale
EXPECTED_C_LOCALE_EQUIVALENTS = ["C", "invalid.ascii"]

# Set our expectation for the default encoding used in the C locale
# for the filesystem encoding and the standard streams
EXPECTED_C_LOCALE_STREAM_ENCODING = "ascii"
EXPECTED_C_LOCALE_FS_ENCODING = "ascii"

# Set our expectation for the default locale used when none is specified
EXPECT_COERCION_IN_DEFAULT_LOCALE = True

TARGET_LOCALES = ["C.UTF-8", "C.utf8", "UTF-8"]

# Apply some platform dependent overrides
if sys.platform == "android":
    # Android defaults to using UTF-8 for all system interfaces
    EXPECTED_C_LOCALE_STREAM_ENCODING = "utf-8"
    EXPECTED_C_LOCALE_FS_ENCODING = "utf-8"
elif sys.platform.startswith("linux"):
    # Linux distros typically alias the POSIX locale directly to the C
    # locale.
    # TODO: Once https://bugs.python.org/issue30672 is addressed, we'll be
    #       able to check this case unconditionally
    EXPECTED_C_LOCALE_EQUIVALENTS.append("POSIX")
elif sys.platform.startswith("aix"):
    # AIX uses iso8859-1 in the C locale, other *nix platforms use ASCII
    EXPECTED_C_LOCALE_STREAM_ENCODING = "iso8859-1"
    EXPECTED_C_LOCALE_FS_ENCODING = "iso8859-1"
elif sys.platform == "darwin":
    # FS encoding is UTF-8 on macOS
    EXPECTED_C_LOCALE_FS_ENCODING = "utf-8"
elif sys.platform == "cygwin":
    # Cygwin defaults to using C.UTF-8
    # TODO: Work out a robust dynamic test for this that doesn't rely on
    #       CPython's own locale handling machinery
    EXPECT_COERCION_IN_DEFAULT_LOCALE = False
elif sys.platform == "vxworks":
    # VxWorks defaults to using UTF-8 for all system interfaces
    EXPECTED_C_LOCALE_STREAM_ENCODING = "utf-8"
    EXPECTED_C_LOCALE_FS_ENCODING = "utf-8"

# Note that the above expectations are still wrong in some cases, such as:
# * Windows when PYTHONLEGACYWINDOWSFSENCODING is set
# * Any platform other than AIX that uses latin-1 in the C locale
# * Any Linux distro where POSIX isn't a simple alias for the C locale
# * Any Linux distro where the default locale is something other than "C"
#
# Options for dealing with this:
# * Don't set the PY_COERCE_C_LOCALE preprocessor definition on
#   such platforms (e.g. it isn't set on Windows)
# * Fix the test expectations to match the actual platform behaviour

# In order to get the warning messages to match up as expected, the candidate
# order here must match the target locale order in Python/pylifecycle.c.
# Derived from TARGET_LOCALES so the two sequences cannot drift apart.
_C_UTF8_LOCALES = tuple(TARGET_LOCALES)

# There's no reliable cross-platform way of checking locale alias
# lists, so the only way of knowing which of these locales will work
# is to try them with locale.setlocale(). We do that in a subprocess
# in setUpModule() below to avoid altering the locale of the test runner.
#
# If the relevant locale module attributes exist, and we're not on a platform
# where we expect it to always succeed, we also check that
# `locale.nl_langinfo(locale.CODESET)` works, as if it fails, the interpreter
# will skip locale coercion for that particular target locale
_check_nl_langinfo_CODESET = bool(
    sys.platform not in ("darwin", "linux") and
    hasattr(locale, "nl_langinfo") and
    hasattr(locale, "CODESET")
)


def _set_locale_in_subprocess(locale_name):
    """Return True if a child interpreter can set LC_CTYPE to *locale_name*.

    When _check_nl_langinfo_CODESET is true, the child additionally exits
    with a non-zero status if nl_langinfo(CODESET) returns an empty value.
    """
    cmd_fmt = "import locale; print(locale.setlocale(locale.LC_CTYPE, '{}'))"
    if _check_nl_langinfo_CODESET:
        # If there's no valid CODESET, we expect coercion to be skipped
        cmd_fmt += "; import sys; sys.exit(not locale.nl_langinfo(locale.CODESET))"
    cmd = cmd_fmt.format(locale_name)
    result, _ = run_python_until_end("-c", cmd, PYTHONCOERCECLOCALE='')
    return result.rc == 0


_fields = "fsencoding stdin_info stdout_info stderr_info lang lc_ctype lc_all"
_EncodingDetails = namedtuple("EncodingDetails", _fields)


class EncodingDetails(_EncodingDetails):
    # Record of the encoding/locale state reported by a child interpreter.
    # XXX (ncoghlan): Using JSON for child state reporting may be less fragile
    CHILD_PROCESS_SCRIPT = ";".join([
        "import sys, os",
        "print(sys.getfilesystemencoding())",
        "print(sys.stdin.encoding + ':' + sys.stdin.errors)",
        "print(sys.stdout.encoding + ':' + sys.stdout.errors)",
        "print(sys.stderr.encoding + ':' + sys.stderr.errors)",
        "print(os.environ.get('LANG', 'not set'))",
        "print(os.environ.get('LC_CTYPE', 'not set'))",
        "print(os.environ.get('LC_ALL', 'not set'))",
    ])

    @classmethod
    def get_expected_details(cls, coercion_expected, fs_encoding,
                             stream_encoding, stream_errors, env_vars):
        """Return the expected child process details as a dict.

        Parameters:
          coercion_expected: if true, LC_CTYPE is expected to have been
            replaced by CLI_COERCION_TARGET in the child's environment
          fs_encoding: expected sys.getfilesystemencoding() result
          stream_encoding: expected encoding of the standard streams
          stream_errors: expected error handler for stdin and stdout, or
            None to expect "surrogateescape"
          env_vars: environment variables passed to the child process
        """
        _stream = stream_encoding + ":{}"
        if stream_errors is None:
            # stdin and stdout should use surrogateescape either because the
            # coercion triggered, or because the C locale was detected
            stream_errors = "surrogateescape"

        stream_info = [_stream.format(stream_errors)] * 2

        # stderr should always use backslashreplace
        stream_info.append(_stream.format("backslashreplace"))
        expected_lang = env_vars.get("LANG", "not set")
        if coercion_expected:
            expected_lc_ctype = CLI_COERCION_TARGET
        else:
            expected_lc_ctype = env_vars.get("LC_CTYPE", "not set")
        expected_lc_all = env_vars.get("LC_ALL", "not set")
        env_info = expected_lang, expected_lc_ctype, expected_lc_all
        return dict(cls(fs_encoding, *stream_info, *env_info)._asdict())

    @classmethod
    def get_child_details(cls, env_vars):
        """Retrieves fsencoding and standard stream details from a child process

        Returns (encoding_details, stderr_lines):

        - encoding_details: dict mapping each EncodingDetails field name to
          the value printed by the child
        - stderr_lines: result of calling splitlines() on the stderr output

        The child runs CHILD_PROCESS_SCRIPT with ``-X utf8=0`` and with
        *env_vars* passed through as environment variables.
        """
        result, py_cmd = run_python_until_end(
            "-X", "utf8=0", "-c", cls.CHILD_PROCESS_SCRIPT,
            **env_vars
        )
        if result.rc != 0:
            result.fail(py_cmd)
        # All subprocess outputs in this test case should be pure ASCII
        stdout_lines = result.out.decode("ascii").splitlines()
        child_encoding_details = dict(cls(*stdout_lines)._asdict())
        stderr_lines = result.err.decode("ascii").rstrip().splitlines()
        return child_encoding_details, stderr_lines


# Details of the shared library warning emitted at runtime
LEGACY_LOCALE_WARNING = (
    "Python runtime initialized with LC_CTYPE=C (a locale with default ASCII "
    "encoding), which may cause Unicode compatibility problems. Using C.UTF-8, "
    "C.utf8, or UTF-8 (if available) as alternative Unicode-compatible "
    "locales is recommended."
)

# Details of the CLI locale coercion warning emitted at runtime
CLI_COERCION_WARNING_FMT = (
    "Python detected LC_CTYPE=C: LC_CTYPE coerced to {} (set another locale "
    "or PYTHONCOERCECLOCALE=0 to disable this locale coercion behavior)."
)


AVAILABLE_TARGETS = None
CLI_COERCION_TARGET = None
CLI_COERCION_WARNING = None


def setUpModule():
    """Probe which target locales work and record the expected coercion."""
    global AVAILABLE_TARGETS
    global CLI_COERCION_TARGET
    global CLI_COERCION_WARNING

    if AVAILABLE_TARGETS is not None:
        # initialization already done
        return
    AVAILABLE_TARGETS = []

    # Find the target locales available in the current system
    for target_locale in _C_UTF8_LOCALES:
        if _set_locale_in_subprocess(target_locale):
            AVAILABLE_TARGETS.append(target_locale)

    if AVAILABLE_TARGETS:
        # Coercion is expected to use the first available target locale
        CLI_COERCION_TARGET = AVAILABLE_TARGETS[0]
        CLI_COERCION_WARNING = CLI_COERCION_WARNING_FMT.format(CLI_COERCION_TARGET)

    if support.verbose:
        print(f"AVAILABLE_TARGETS = {AVAILABLE_TARGETS!r}")
        print(f"EXPECTED_C_LOCALE_EQUIVALENTS = {EXPECTED_C_LOCALE_EQUIVALENTS!r}")
        print(f"EXPECTED_C_LOCALE_STREAM_ENCODING = {EXPECTED_C_LOCALE_STREAM_ENCODING!r}")
        print(f"EXPECTED_C_LOCALE_FS_ENCODING = {EXPECTED_C_LOCALE_FS_ENCODING!r}")
        print(f"EXPECT_COERCION_IN_DEFAULT_LOCALE = {EXPECT_COERCION_IN_DEFAULT_LOCALE!r}")
        print(f"_C_UTF8_LOCALES = {_C_UTF8_LOCALES!r}")
        print(f"_check_nl_langinfo_CODESET = {_check_nl_langinfo_CODESET!r}")


class _LocaleHandlingTestCase(unittest.TestCase):
    # Base class to check expected locale handling behaviour

    def _check_child_encoding_details(self,
                                      env_vars,
                                      expected_fs_encoding,
                                      expected_stream_encoding,
                                      expected_stream_errors,
                                      expected_warnings,
                                      coercion_expected):
        """Check the C locale handling for the given process environment

        Parameters:
          env_vars: environment variables to set in the child process
          expected_fs_encoding: expected sys.getfilesystemencoding() result
          expected_stream_encoding: expected encoding for standard streams
          expected_stream_errors: expected error handler for stdin/stdout
            (None means "surrogateescape")
          expected_warnings: list of stderr lines to expect (None or empty
            means no stderr output is expected)
          coercion_expected: whether LC_CTYPE is expected to be coerced
        """
        result = EncodingDetails.get_child_details(env_vars)
        encoding_details, stderr_lines = result
        expected_details = EncodingDetails.get_expected_details(
            coercion_expected,
            expected_fs_encoding,
            expected_stream_encoding,
            expected_stream_errors,
            env_vars
        )
        self.assertEqual(encoding_details, expected_details)
        if expected_warnings is None:
            expected_warnings = []
        self.assertEqual(stderr_lines, expected_warnings)


class LocaleConfigurationTests(_LocaleHandlingTestCase):
    # Test explicit external configuration via the process environment

    @classmethod
    def setUpClass(cls):
        # This relies on setUpModule() having been run, so it can't be
        # handled via the @unittest.skipUnless decorator
        if not AVAILABLE_TARGETS:
            raise unittest.SkipTest("No C-with-UTF-8 locale available")

    def test_external_target_locale_configuration(self):

        # Explicitly setting a target locale should give the same behaviour as
        # is seen when implicitly coercing to that target locale
        self.maxDiff = None

        expected_fs_encoding = "utf-8"
        expected_stream_encoding = "utf-8"

        base_var_dict = {
            "LANG": "",
            "LC_CTYPE": "",
            "LC_ALL": "",
            "PYTHONCOERCECLOCALE": "",
            "PYTHONIOENCODING": "",
        }
        for env_var in ("LANG", "LC_CTYPE"):
            for locale_to_set in AVAILABLE_TARGETS:
                # XXX (ncoghlan): LANG=UTF-8 doesn't appear to work as
                #                 expected, so skip that combination for now
                # See https://bugs.python.org/issue30672 for discussion
                if env_var == "LANG" and locale_to_set == "UTF-8":
                    continue

                with self.subTest(env_var=env_var,
                                  configured_locale=locale_to_set):
                    var_dict = base_var_dict.copy()
                    var_dict[env_var] = locale_to_set
                    self._check_child_encoding_details(var_dict,
                                                       expected_fs_encoding,
                                                       expected_stream_encoding,
                                                       expected_stream_errors=None,
                                                       expected_warnings=None,
                                                       coercion_expected=False)

    def test_with_ioencoding(self):
        # With PYTHONIOENCODING=UTF-8 set alongside an explicitly configured
        # target locale, stdin and stdout are expected to report the "strict"
        # error handler instead of "surrogateescape"
        self.maxDiff = None

        expected_fs_encoding = "utf-8"
        expected_stream_encoding = "utf-8"

        base_var_dict = {
            "LANG": "",
            "LC_CTYPE": "",
            "LC_ALL": "",
            "PYTHONCOERCECLOCALE": "",
            "PYTHONIOENCODING": "UTF-8",
        }
        for env_var in ("LANG", "LC_CTYPE"):
            for locale_to_set in AVAILABLE_TARGETS:
                # XXX (ncoghlan): LANG=UTF-8 doesn't appear to work as
                #                 expected, so skip that combination for now
                # See https://bugs.python.org/issue30672 for discussion
                if env_var == "LANG" and locale_to_set == "UTF-8":
                    continue

                with self.subTest(env_var=env_var,
                                  configured_locale=locale_to_set):
                    var_dict = base_var_dict.copy()
                    var_dict[env_var] = locale_to_set
                    self._check_child_encoding_details(var_dict,
                                                       expected_fs_encoding,
                                                       expected_stream_encoding,
                                                       expected_stream_errors="strict",
                                                       expected_warnings=None,
                                                       coercion_expected=False)


@support.cpython_only
@unittest.skipUnless(sysconfig.get_config_var("PY_COERCE_C_LOCALE"),
                     "C locale coercion disabled at build time")
class LocaleCoercionTests(_LocaleHandlingTestCase):
    # Test implicit reconfiguration of the environment during CLI startup

    def _check_c_locale_coercion(self,
                                 fs_encoding, stream_encoding,
                                 coerce_c_locale,
                                 expected_warnings=None,
                                 coercion_expected=True,
                                 **extra_vars):
        """Check the C locale handling for various configurations

        Parameters:
          fs_encoding: expected sys.getfilesystemencoding() result
          stream_encoding: expected encoding for standard streams
          coerce_c_locale: setting to use for PYTHONCOERCECLOCALE
            None: don't set the variable at all
            str: the value set in the child's environment
          expected_warnings: expected warning lines on stderr
          coercion_expected: whether LC_CTYPE is expected to be coerced
          extra_vars: additional environment variables to set in subprocess
        """
        self.maxDiff = None

        if not AVAILABLE_TARGETS:
            # Locale coercion is disabled when there aren't any target locales
            fs_encoding = EXPECTED_C_LOCALE_FS_ENCODING
            stream_encoding = EXPECTED_C_LOCALE_STREAM_ENCODING
            coercion_expected = False
            if expected_warnings:
                expected_warnings = [LEGACY_LOCALE_WARNING]

        base_var_dict = {
            "LANG": "",
            "LC_CTYPE": "",
            "LC_ALL": "",
            "PYTHONCOERCECLOCALE": "",
            "PYTHONIOENCODING": "",
        }
        base_var_dict.update(extra_vars)
        if coerce_c_locale is not None:
            base_var_dict["PYTHONCOERCECLOCALE"] = coerce_c_locale

        # Check behaviour for the default locale
        with self.subTest(default_locale=True,
                          PYTHONCOERCECLOCALE=coerce_c_locale):
            if EXPECT_COERCION_IN_DEFAULT_LOCALE:
                _expected_warnings = expected_warnings
                _coercion_expected = coercion_expected
            else:
                _expected_warnings = None
                _coercion_expected = False
            # On Android CLI_COERCION_WARNING is not printed when all the
            # locale environment variables are undefined or empty. When
            # this code path is run with environ['LC_ALL'] == 'C', then
            # LEGACY_LOCALE_WARNING is printed.
            if (support.is_android and
                    _expected_warnings == [CLI_COERCION_WARNING]):
                _expected_warnings = None
            self._check_child_encoding_details(base_var_dict,
                                               fs_encoding,
                                               stream_encoding,
                                               None,
                                               _expected_warnings,
                                               _coercion_expected)

        # Check behaviour for explicitly configured locales
        for locale_to_set in EXPECTED_C_LOCALE_EQUIVALENTS:
            for env_var in ("LANG", "LC_CTYPE"):
                with self.subTest(env_var=env_var,
                                  nominal_locale=locale_to_set,
                                  PYTHONCOERCECLOCALE=coerce_c_locale,
                                  PYTHONIOENCODING=""):
                    var_dict = base_var_dict.copy()
                    var_dict[env_var] = locale_to_set
                    # Check behaviour on successful coercion
                    self._check_child_encoding_details(var_dict,
                                                       fs_encoding,
                                                       stream_encoding,
                                                       None,
                                                       expected_warnings,
                                                       coercion_expected)

    def test_PYTHONCOERCECLOCALE_not_set(self):
        # This should coerce to the first available target locale by default
        self._check_c_locale_coercion("utf-8", "utf-8", coerce_c_locale=None)

    def test_PYTHONCOERCECLOCALE_not_zero(self):
        # *Any* string other than "0" is considered "set" for our purposes
        # and hence should result in the locale coercion being enabled
        for setting in ("", "1", "true", "false"):
            self._check_c_locale_coercion("utf-8", "utf-8", coerce_c_locale=setting)

    def test_PYTHONCOERCECLOCALE_set_to_warn(self):
        # PYTHONCOERCECLOCALE=warn enables runtime warnings for legacy locales
        self._check_c_locale_coercion("utf-8", "utf-8",
                                      coerce_c_locale="warn",
                                      expected_warnings=[CLI_COERCION_WARNING])

    def test_PYTHONCOERCECLOCALE_set_to_zero(self):
        # The setting "0" should result in the locale coercion being disabled
        self._check_c_locale_coercion(EXPECTED_C_LOCALE_FS_ENCODING,
                                      EXPECTED_C_LOCALE_STREAM_ENCODING,
                                      coerce_c_locale="0",
                                      coercion_expected=False)
        # Setting LC_ALL=C shouldn't make any difference to the behaviour
        self._check_c_locale_coercion(EXPECTED_C_LOCALE_FS_ENCODING,
                                      EXPECTED_C_LOCALE_STREAM_ENCODING,
                                      coerce_c_locale="0",
                                      LC_ALL="C",
                                      coercion_expected=False)

    def test_LC_ALL_set_to_C(self):
        # Setting LC_ALL should render the locale coercion ineffective
        self._check_c_locale_coercion(EXPECTED_C_LOCALE_FS_ENCODING,
                                      EXPECTED_C_LOCALE_STREAM_ENCODING,
                                      coerce_c_locale=None,
                                      LC_ALL="C",
                                      coercion_expected=False)
        # And result in a warning about a lack of locale compatibility
        self._check_c_locale_coercion(EXPECTED_C_LOCALE_FS_ENCODING,
                                      EXPECTED_C_LOCALE_STREAM_ENCODING,
                                      coerce_c_locale="warn",
                                      LC_ALL="C",
                                      expected_warnings=[LEGACY_LOCALE_WARNING],
                                      coercion_expected=False)

    def test_PYTHONCOERCECLOCALE_set_to_one(self):
        # skip the test if the LC_CTYPE locale is C or coerced
        old_loc = locale.setlocale(locale.LC_CTYPE, None)
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, old_loc)
        try:
            loc = locale.setlocale(locale.LC_CTYPE, "")
        except locale.Error as e:
            self.skipTest(str(e))
        if loc == "C":
            self.skipTest("test requires LC_CTYPE locale different than C")
        if loc in TARGET_LOCALES:
            self.skipTest("coerced LC_CTYPE locale: %s" % loc)

        # bpo-35336: PYTHONCOERCECLOCALE=1 must not coerce the LC_CTYPE locale
        # if it's not equal to "C"
        code = 'import locale; print(locale.setlocale(locale.LC_CTYPE, None))'
        env = dict(os.environ, PYTHONCOERCECLOCALE='1')
        cmd = subprocess.run([sys.executable, '-c', code],
                             stdout=subprocess.PIPE,
                             env=env,
                             text=True)
        self.assertEqual(cmd.stdout.rstrip(), loc)


def tearDownModule():
    support.reap_children()


if __name__ == "__main__":
    unittest.main()