# Tests the attempted automatic coercion of the C locale to a UTF-8 locale

import locale
import os
import shutil
import subprocess
import sys
import sysconfig
import unittest
from collections import namedtuple

import test.support
from test.support.script_helper import (
    run_python_until_end,
    interpreter_requires_environment,
)

# Set the list of ways we expect to be able to ask for the "C" locale
EXPECTED_C_LOCALE_EQUIVALENTS = ["C", "invalid.ascii"]

# Set our expectation for the default encoding used in the C locale
# for the filesystem encoding and the standard streams
EXPECTED_C_LOCALE_STREAM_ENCODING = "ascii"
EXPECTED_C_LOCALE_FS_ENCODING = "ascii"

# Set our expectation for the default locale used when none is specified
EXPECT_COERCION_IN_DEFAULT_LOCALE = True

# Locale names that count as an already-coerced LC_CTYPE setting
TARGET_LOCALES = ["C.UTF-8", "C.utf8", "UTF-8"]

# Apply some platform dependent overrides
if sys.platform.startswith("linux"):
    if test.support.is_android:
        # Android defaults to using UTF-8 for all system interfaces
        EXPECTED_C_LOCALE_STREAM_ENCODING = "utf-8"
        EXPECTED_C_LOCALE_FS_ENCODING = "utf-8"
    else:
        # Linux distros typically alias the POSIX locale directly to the C
        # locale.
        # TODO: Once https://bugs.python.org/issue30672 is addressed, we'll be
        #       able to check this case unconditionally
        EXPECTED_C_LOCALE_EQUIVALENTS.append("POSIX")
elif sys.platform.startswith("aix"):
    # AIX uses iso8859-1 in the C locale, other *nix platforms use ASCII
    EXPECTED_C_LOCALE_STREAM_ENCODING = "iso8859-1"
    EXPECTED_C_LOCALE_FS_ENCODING = "iso8859-1"
elif sys.platform == "darwin":
    # FS encoding is UTF-8 on macOS
    EXPECTED_C_LOCALE_FS_ENCODING = "utf-8"
elif sys.platform == "cygwin":
    # Cygwin defaults to using C.UTF-8
    # TODO: Work out a robust dynamic test for this that doesn't rely on
    #       CPython's own locale handling machinery
    EXPECT_COERCION_IN_DEFAULT_LOCALE = False

# Note that the above expectations are still wrong in some cases, such as:
# * Windows when PYTHONLEGACYWINDOWSFSENCODING is set
# * Any platform other than AIX that uses latin-1 in the C locale
# * Any Linux distro where POSIX isn't a simple alias for the C locale
# * Any Linux distro where the default locale is something other than "C"
#
# Options for dealing with this:
# * Don't set the PY_COERCE_C_LOCALE preprocessor definition on
#   such platforms (e.g. it isn't set on Windows)
# * Fix the test expectations to match the actual platform behaviour

# In order to get the warning messages to match up as expected, the candidate
# order here must match the target locale order in Python/pylifecycle.c
# (derived from TARGET_LOCALES so the two sequences cannot drift apart)
_C_UTF8_LOCALES = tuple(TARGET_LOCALES)

# There's no reliable cross-platform way of checking locale alias
# lists, so the only way of knowing which of these locales will work
# is to try them with locale.setlocale(). We do that in a subprocess
# in setUpModule() below to avoid altering the locale of the test runner.
#
# If the relevant locale module attributes exist, and we're not on a platform
# where we expect it to always succeed, we also check that
# `locale.nl_langinfo(locale.CODESET)` works, as if it fails, the interpreter
# will skip locale coercion for that particular target locale
_check_nl_langinfo_CODESET = bool(
    sys.platform not in ("darwin", "linux") and
    hasattr(locale, "nl_langinfo") and
    hasattr(locale, "CODESET")
)


def _set_locale_in_subprocess(locale_name):
    """Return True if a child interpreter can set LC_CTYPE to *locale_name*.

    The check runs in a subprocess (with PYTHONCOERCECLOCALE set to an empty
    string) so the locale of the test runner itself is never altered. When
    ``_check_nl_langinfo_CODESET`` is true, the child additionally exits with
    a non-zero status if ``locale.nl_langinfo(locale.CODESET)`` is empty.
    """
    cmd_fmt = "import locale; print(locale.setlocale(locale.LC_CTYPE, '{}'))"
    if _check_nl_langinfo_CODESET:
        # If there's no valid CODESET, we expect coercion to be skipped
        cmd_fmt += "; import sys; sys.exit(not locale.nl_langinfo(locale.CODESET))"
    cmd = cmd_fmt.format(locale_name)
    result, _ = run_python_until_end("-c", cmd, PYTHONCOERCECLOCALE='')
    return result.rc == 0


_fields = "fsencoding stdin_info stdout_info stderr_info lang lc_ctype lc_all"
_EncodingDetails = namedtuple("EncodingDetails", _fields)


class EncodingDetails(_EncodingDetails):
    """Encoding and locale environment details reported by a child process.

    The fields mirror, in order, the lines printed by CHILD_PROCESS_SCRIPT.
    """

    # XXX (ncoghlan): Using JSON for child state reporting may be less fragile
    CHILD_PROCESS_SCRIPT = ";".join([
        "import sys, os",
        "print(sys.getfilesystemencoding())",
        "print(sys.stdin.encoding + ':' + sys.stdin.errors)",
        "print(sys.stdout.encoding + ':' + sys.stdout.errors)",
        "print(sys.stderr.encoding + ':' + sys.stderr.errors)",
        "print(os.environ.get('LANG', 'not set'))",
        "print(os.environ.get('LC_CTYPE', 'not set'))",
        "print(os.environ.get('LC_ALL', 'not set'))",
    ])

    @classmethod
    def get_expected_details(cls, coercion_expected, fs_encoding,
                             stream_encoding, env_vars):
        """Return the expected child process details as a dict.

        Parameters:
            coercion_expected: if true, LC_CTYPE is expected to report the
                module-level CLI_COERCION_TARGET rather than the value in
                *env_vars*
            fs_encoding: expected sys.getfilesystemencoding() result
            stream_encoding: expected encoding of the standard streams
            env_vars: mapping of the environment variables passed to the child
        """
        _stream = stream_encoding + ":{}"
        # stdin and stdout should use surrogateescape either because the
        # coercion triggered, or because the C locale was detected
        stream_info = 2*[_stream.format("surrogateescape")]
        # stderr should always use backslashreplace
        stream_info.append(_stream.format("backslashreplace"))
        # Everything is lower-cased because the child's output is lower-cased
        # by _handle_output_variations() before comparison
        expected_lang = env_vars.get("LANG", "not set").lower()
        if coercion_expected:
            expected_lc_ctype = CLI_COERCION_TARGET.lower()
        else:
            expected_lc_ctype = env_vars.get("LC_CTYPE", "not set").lower()
        expected_lc_all = env_vars.get("LC_ALL", "not set").lower()
        env_info = expected_lang, expected_lc_ctype, expected_lc_all
        return dict(cls(fs_encoding, *stream_info, *env_info)._asdict())

    @staticmethod
    def _handle_output_variations(data):
        """Adjust the output to handle platform specific idiosyncrasies

        * Some platforms report ASCII as ANSI_X3.4-1968
        * Some platforms report ASCII as US-ASCII
        * Some platforms report UTF-8 instead of utf-8

        *data* is a bytes object; the normalised bytes are returned.
        """
        data = data.replace(b"ANSI_X3.4-1968", b"ascii")
        data = data.replace(b"US-ASCII", b"ascii")
        data = data.lower()
        return data

    @classmethod
    def get_child_details(cls, env_vars):
        """Retrieves fsencoding and standard stream details from a child process

        Returns (encoding_details, stderr_lines):

        - encoding_details: dict of the EncodingDetails fields reported by
          the child (after normalisation by _handle_output_variations())
        - stderr_lines: result of calling splitlines() on the stderr output

        The child is started with ``-X utf8=0`` and with *env_vars* passed
        through as its environment variables. A non-zero exit status is
        reported via the run result's fail() method.
        """
        result, py_cmd = run_python_until_end(
            "-X", "utf8=0", "-c", cls.CHILD_PROCESS_SCRIPT,
            **env_vars
        )
        if result.rc != 0:
            result.fail(py_cmd)
        # All subprocess outputs in this test case should be pure ASCII
        adjusted_output = cls._handle_output_variations(result.out)
        stdout_lines = adjusted_output.decode("ascii").splitlines()
        child_encoding_details = dict(cls(*stdout_lines)._asdict())
        stderr_lines = result.err.decode("ascii").rstrip().splitlines()
        return child_encoding_details, stderr_lines


# Details of the shared library warning emitted at runtime
LEGACY_LOCALE_WARNING = (
    "Python runtime initialized with LC_CTYPE=C (a locale with default ASCII "
    "encoding), which may cause Unicode compatibility problems. Using C.UTF-8, "
    "C.utf8, or UTF-8 (if available) as alternative Unicode-compatible "
    "locales is recommended."
)

# Details of the CLI locale coercion warning emitted at runtime
CLI_COERCION_WARNING_FMT = (
    "Python detected LC_CTYPE=C: LC_CTYPE coerced to {} (set another locale "
    "or PYTHONCOERCECLOCALE=0 to disable this locale coercion behavior)."
)


# Populated lazily by setUpModule()
AVAILABLE_TARGETS = None
CLI_COERCION_TARGET = None
CLI_COERCION_WARNING = None


def setUpModule():
    """Probe which target locales are usable and record the coercion target.

    Fills in AVAILABLE_TARGETS and, when at least one target locale can be
    set, CLI_COERCION_TARGET and CLI_COERCION_WARNING. Repeated calls are
    no-ops once AVAILABLE_TARGETS has been initialised.
    """
    global AVAILABLE_TARGETS
    global CLI_COERCION_TARGET
    global CLI_COERCION_WARNING

    if AVAILABLE_TARGETS is not None:
        # initialization already done
        return
    AVAILABLE_TARGETS = []

    # Find the target locales available in the current system
    for target_locale in _C_UTF8_LOCALES:
        if _set_locale_in_subprocess(target_locale):
            AVAILABLE_TARGETS.append(target_locale)

    if AVAILABLE_TARGETS:
        # Coercion is expected to use the first available target locale
        CLI_COERCION_TARGET = AVAILABLE_TARGETS[0]
        CLI_COERCION_WARNING = CLI_COERCION_WARNING_FMT.format(CLI_COERCION_TARGET)


class _LocaleHandlingTestCase(unittest.TestCase):
    # Base class to check expected locale handling behaviour

    def _check_child_encoding_details(self,
                                      env_vars,
                                      expected_fs_encoding,
                                      expected_stream_encoding,
                                      expected_warnings,
                                      coercion_expected):
        """Check the C locale handling for the given process environment

        Parameters:
            env_vars: environment variables to set in the child process
            expected_fs_encoding: expected sys.getfilesystemencoding() result
            expected_stream_encoding: expected encoding for standard streams
            expected_warnings: list of stderr lines to expect (None is
                treated as "no output expected")
            coercion_expected: whether LC_CTYPE is expected to have been
                coerced to CLI_COERCION_TARGET
        """
        result = EncodingDetails.get_child_details(env_vars)
        encoding_details, stderr_lines = result
        expected_details = EncodingDetails.get_expected_details(
            coercion_expected,
            expected_fs_encoding,
            expected_stream_encoding,
            env_vars
        )
        self.assertEqual(encoding_details, expected_details)
        if expected_warnings is None:
            expected_warnings = []
        self.assertEqual(stderr_lines, expected_warnings)


class LocaleConfigurationTests(_LocaleHandlingTestCase):
    # Test explicit external configuration via the process environment

    @classmethod
    def setUpClass(cls):
        # This relies on setUpModule() having been run, so it can't be
        # handled via the @unittest.skipUnless decorator
        if not AVAILABLE_TARGETS:
            raise unittest.SkipTest("No C-with-UTF-8 locale available")

    def test_external_target_locale_configuration(self):

        # Explicitly setting a target locale should give the same behaviour as
        # is seen when implicitly coercing to that target locale
        self.maxDiff = None

        expected_fs_encoding = "utf-8"
        expected_stream_encoding = "utf-8"

        base_var_dict = {
            "LANG": "",
            "LC_CTYPE": "",
            "LC_ALL": "",
            "PYTHONCOERCECLOCALE": "",
        }
        for env_var in ("LANG", "LC_CTYPE"):
            for locale_to_set in AVAILABLE_TARGETS:
                # XXX (ncoghlan): LANG=UTF-8 doesn't appear to work as
                #                 expected, so skip that combination for now
                # See https://bugs.python.org/issue30672 for discussion
                if env_var == "LANG" and locale_to_set == "UTF-8":
                    continue

                with self.subTest(env_var=env_var,
                                  configured_locale=locale_to_set):
                    var_dict = base_var_dict.copy()
                    var_dict[env_var] = locale_to_set
                    self._check_child_encoding_details(var_dict,
                                                       expected_fs_encoding,
                                                       expected_stream_encoding,
                                                       expected_warnings=None,
                                                       coercion_expected=False)


@test.support.cpython_only
@unittest.skipUnless(sysconfig.get_config_var("PY_COERCE_C_LOCALE"),
                     "C locale coercion disabled at build time")
class LocaleCoercionTests(_LocaleHandlingTestCase):
    # Test implicit reconfiguration of the environment during CLI startup

    def _check_c_locale_coercion(self,
                                 fs_encoding, stream_encoding,
                                 coerce_c_locale,
                                 expected_warnings=None,
                                 coercion_expected=True,
                                 **extra_vars):
        """Check the C locale handling for various configurations

        Parameters:
            fs_encoding: expected sys.getfilesystemencoding() result
            stream_encoding: expected encoding for standard streams
            coerce_c_locale: setting to use for PYTHONCOERCECLOCALE
              None: don't set the variable at all
              str: the value set in the child's environment
            expected_warnings: expected warning lines on stderr
            coercion_expected: whether LC_CTYPE is expected to be coerced
              (forced to False when no target locale is available)
            extra_vars: additional environment variables to set in subprocess
        """
        self.maxDiff = None

        if not AVAILABLE_TARGETS:
            # Locale coercion is disabled when there aren't any target locales
            fs_encoding = EXPECTED_C_LOCALE_FS_ENCODING
            stream_encoding = EXPECTED_C_LOCALE_STREAM_ENCODING
            coercion_expected = False
            if expected_warnings:
                expected_warnings = [LEGACY_LOCALE_WARNING]

        base_var_dict = {
            "LANG": "",
            "LC_CTYPE": "",
            "LC_ALL": "",
            "PYTHONCOERCECLOCALE": "",
        }
        base_var_dict.update(extra_vars)
        if coerce_c_locale is not None:
            base_var_dict["PYTHONCOERCECLOCALE"] = coerce_c_locale

        # Check behaviour for the default locale
        with self.subTest(default_locale=True,
                          PYTHONCOERCECLOCALE=coerce_c_locale):
            if EXPECT_COERCION_IN_DEFAULT_LOCALE:
                _expected_warnings = expected_warnings
                _coercion_expected = coercion_expected
            else:
                _expected_warnings = None
                _coercion_expected = False
            # On Android CLI_COERCION_WARNING is not printed when all the
            # locale environment variables are undefined or empty. When
            # this code path is run with environ['LC_ALL'] == 'C', then
            # LEGACY_LOCALE_WARNING is printed.
            if (test.support.is_android and
                    _expected_warnings == [CLI_COERCION_WARNING]):
                _expected_warnings = None
            self._check_child_encoding_details(base_var_dict,
                                               fs_encoding,
                                               stream_encoding,
                                               _expected_warnings,
                                               _coercion_expected)

        # Check behaviour for explicitly configured locales
        for locale_to_set in EXPECTED_C_LOCALE_EQUIVALENTS:
            for env_var in ("LANG", "LC_CTYPE"):
                with self.subTest(env_var=env_var,
                                  nominal_locale=locale_to_set,
                                  PYTHONCOERCECLOCALE=coerce_c_locale):
                    var_dict = base_var_dict.copy()
                    var_dict[env_var] = locale_to_set
                    # Check behaviour on successful coercion
                    self._check_child_encoding_details(var_dict,
                                                       fs_encoding,
                                                       stream_encoding,
                                                       expected_warnings,
                                                       coercion_expected)

    def test_PYTHONCOERCECLOCALE_not_set(self):
        # This should coerce to the first available target locale by default
        self._check_c_locale_coercion("utf-8", "utf-8", coerce_c_locale=None)

    def test_PYTHONCOERCECLOCALE_not_zero(self):
        # *Any* string other than "0" is considered "set" for our purposes
        # and hence should result in the locale coercion being enabled
        for setting in ("", "1", "true", "false"):
            self._check_c_locale_coercion("utf-8", "utf-8", coerce_c_locale=setting)

    def test_PYTHONCOERCECLOCALE_set_to_warn(self):
        # PYTHONCOERCECLOCALE=warn enables runtime warnings for legacy locales
        self._check_c_locale_coercion("utf-8", "utf-8",
                                      coerce_c_locale="warn",
                                      expected_warnings=[CLI_COERCION_WARNING])

    def test_PYTHONCOERCECLOCALE_set_to_zero(self):
        # The setting "0" should result in the locale coercion being disabled
        self._check_c_locale_coercion(EXPECTED_C_LOCALE_FS_ENCODING,
                                      EXPECTED_C_LOCALE_STREAM_ENCODING,
                                      coerce_c_locale="0",
                                      coercion_expected=False)
        # Setting LC_ALL=C shouldn't make any difference to the behaviour
        self._check_c_locale_coercion(EXPECTED_C_LOCALE_FS_ENCODING,
                                      EXPECTED_C_LOCALE_STREAM_ENCODING,
                                      coerce_c_locale="0",
                                      LC_ALL="C",
                                      coercion_expected=False)

    def test_LC_ALL_set_to_C(self):
        # Setting LC_ALL should render the locale coercion ineffective
        self._check_c_locale_coercion(EXPECTED_C_LOCALE_FS_ENCODING,
                                      EXPECTED_C_LOCALE_STREAM_ENCODING,
                                      coerce_c_locale=None,
                                      LC_ALL="C",
                                      coercion_expected=False)
        # And result in a warning about a lack of locale compatibility
        self._check_c_locale_coercion(EXPECTED_C_LOCALE_FS_ENCODING,
                                      EXPECTED_C_LOCALE_STREAM_ENCODING,
                                      coerce_c_locale="warn",
                                      LC_ALL="C",
                                      expected_warnings=[LEGACY_LOCALE_WARNING],
                                      coercion_expected=False)

    def test_PYTHONCOERCECLOCALE_set_to_one(self):
        # skip the test if the LC_CTYPE locale is C or coerced
        old_loc = locale.setlocale(locale.LC_CTYPE, None)
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, old_loc)
        loc = locale.setlocale(locale.LC_CTYPE, "")
        if loc == "C":
            self.skipTest("test requires LC_CTYPE locale different than C")
        if loc in TARGET_LOCALES:
            self.skipTest("coerced LC_CTYPE locale: %s" % loc)

        # bpo-35336: PYTHONCOERCECLOCALE=1 must not coerce the LC_CTYPE locale
        # if it's not equal to "C"
        code = 'import locale; print(locale.setlocale(locale.LC_CTYPE, None))'
        env = dict(os.environ, PYTHONCOERCECLOCALE='1')
        cmd = subprocess.run([sys.executable, '-c', code],
                             stdout=subprocess.PIPE,
                             env=env,
                             text=True)
        self.assertEqual(cmd.stdout.rstrip(), loc)


def test_main():
    """Run both test classes, then reap any leftover child processes."""
    test.support.run_unittest(
        LocaleConfigurationTests,
        LocaleCoercionTests
    )
    test.support.reap_children()


if __name__ == "__main__":
    test_main()