Python-checkins
Threads by month
- ----- 2025 -----
- January
- ----- 2024 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2023 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2022 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2021 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2020 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2019 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2018 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2017 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2016 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2015 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2014 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2013 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2012 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2011 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2010 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2009 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2008 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2007 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2006 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2005 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2004 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2003 -----
- December
- November
- October
- September
- August
December 2017
- 4 participants
- 345 discussions
Revert "bpo-32197: Try to fix a compiler error on OS X introduced in bpo-32030. (#4681)" (#4694)
by Victor Stinner 04 Dec '17
by Victor Stinner 04 Dec '17
04 Dec '17
https://github.com/python/cpython/commit/31a8393cf6a74c870c3484dd68500619f6…
commit: 31a8393cf6a74c870c3484dd68500619f6232c6d
branch: master
author: Victor Stinner <victor.stinner(a)gmail.com>
committer: GitHub <noreply(a)github.com>
date: 2017-12-04T13:39:15+01:00
summary:
Revert "bpo-32197: Try to fix a compiler error on OS X introduced in bpo-32030. (#4681)" (#4694)
* Revert "bpo-32197: Try to fix a compiler error on OS X introduced in bpo-32030. (#4681)"
This reverts commit 13badcbc60cdbfae1dba1683fd2fae9d70717143.
Re-apply commits:
* "bpo-32030: _PyPathConfig_Init() sets home and program_name (#4673)"
commit af5a895073c24637c094772b27526b94a12ec897.
* "bpo-32030: Fix config_get_program_name() on macOS (#4669)"
commit e23c06e2b03452c9aaf0dae52296c85e572f9bcd.
* "bpo-32030: Add Python/pathconfig.c (#4668)"
commit 0ea395ae964c9cd0f499e2ef0d0030c971201220.
* "bpo-32030: Don't call _PyPathConfig_Fini() in Py_FinalizeEx() (#4667)"
commit ebac19dad6263141d5db0a2c923efe049dba99d2.
* "bpo-32030: Fix Py_GetPath(): init program_name (#4665)"
commit 9ac3d8882712c9675c3d2f9f84af6b5729575cde.
* Fix compilation error on macOS
files:
A Python/pathconfig.c
M Doc/c-api/init.rst
M Include/internal/pystate.h
M Include/pylifecycle.h
M Include/pystate.h
M Makefile.pre.in
M Modules/getpath.c
M Modules/main.c
M PC/getpathp.c
M PCbuild/pythoncore.vcxproj
M PCbuild/pythoncore.vcxproj.filters
M Python/pylifecycle.c
diff --git a/Doc/c-api/init.rst b/Doc/c-api/init.rst
index a9927aba5e1..a3113a390fd 100644
--- a/Doc/c-api/init.rst
+++ b/Doc/c-api/init.rst
@@ -40,7 +40,6 @@ The following functions can be safely called before Python is initialized:
* :c:func:`Py_GetCompiler`
* :c:func:`Py_GetCopyright`
* :c:func:`Py_GetPlatform`
- * :c:func:`Py_GetProgramName`
* :c:func:`Py_GetVersion`
* Utilities:
@@ -59,8 +58,8 @@ The following functions can be safely called before Python is initialized:
The following functions **should not be called** before
:c:func:`Py_Initialize`: :c:func:`Py_EncodeLocale`, :c:func:`Py_GetPath`,
:c:func:`Py_GetPrefix`, :c:func:`Py_GetExecPrefix`,
- :c:func:`Py_GetProgramFullPath`, :c:func:`Py_GetPythonHome` and
- :c:func:`PyEval_InitThreads`.
+ :c:func:`Py_GetProgramFullPath`, :c:func:`Py_GetPythonHome`,
+ :c:func:`Py_GetProgramName` and :c:func:`PyEval_InitThreads`.
.. _global-conf-vars:
diff --git a/Include/internal/pystate.h b/Include/internal/pystate.h
index 50ad2fc83a3..b9334212047 100644
--- a/Include/internal/pystate.h
+++ b/Include/internal/pystate.h
@@ -48,9 +48,36 @@ typedef struct {
#endif
/* Set by Py_SetPath(), or computed by _PyPathConfig_Init() */
wchar_t *module_search_path;
+ /* Python program name */
+ wchar_t *program_name;
+ /* Set by Py_SetPythonHome() or PYTHONHOME environment variable */
+ wchar_t *home;
} _PyPathConfig;
-#define _PyPathConfig_INIT {.module_search_path = NULL}
+#ifdef MS_WINDOWS
+#define _PyPathConfig_INIT \
+ {.program_full_path = NULL, \
+ .prefix = NULL, \
+ .dll_path = NULL, \
+ .module_search_path = NULL, \
+ .program_name = NULL, \
+ .home = NULL}
+#else
+#define _PyPathConfig_INIT \
+ {.program_full_path = NULL, \
+ .prefix = NULL, \
+ .exec_prefix = NULL, \
+ .module_search_path = NULL, \
+ .program_name = NULL, \
+ .home = NULL}
+#endif
+
+PyAPI_DATA(_PyPathConfig) _Py_path_config;
+
+PyAPI_FUNC(_PyInitError) _PyPathConfig_Calculate(
+ _PyPathConfig *config,
+ const _PyMainInterpreterConfig *main_config);
+PyAPI_FUNC(void) _PyPathConfig_Clear(_PyPathConfig *config);
/* Full Python runtime state */
diff --git a/Include/pylifecycle.h b/Include/pylifecycle.h
index d32c98b6985..fa751692a66 100644
--- a/Include/pylifecycle.h
+++ b/Include/pylifecycle.h
@@ -105,11 +105,10 @@ PyAPI_FUNC(wchar_t *) Py_GetPath(void);
#ifdef Py_BUILD_CORE
PyAPI_FUNC(_PyInitError) _PyPathConfig_Init(
const _PyMainInterpreterConfig *main_config);
-PyAPI_FUNC(void) _PyPathConfig_Fini(void);
#endif
PyAPI_FUNC(void) Py_SetPath(const wchar_t *);
#ifdef MS_WINDOWS
-int _Py_CheckPython3();
+int _Py_CheckPython3(void);
#endif
/* In their own files */
diff --git a/Include/pystate.h b/Include/pystate.h
index 60d001c4926..1d8aab6d83f 100644
--- a/Include/pystate.h
+++ b/Include/pystate.h
@@ -72,7 +72,8 @@ typedef struct {
(_PyMainInterpreterConfig){\
.install_signal_handlers = -1, \
.module_search_path_env = NULL, \
- .home = NULL}
+ .home = NULL, \
+ .program_name = NULL}
typedef struct _is {
diff --git a/Makefile.pre.in b/Makefile.pre.in
index f425a89173a..14f6f8abc54 100644
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@@ -337,8 +337,9 @@ PYTHON_OBJS= \
Python/importdl.o \
Python/marshal.o \
Python/modsupport.o \
- Python/mystrtoul.o \
Python/mysnprintf.o \
+ Python/mystrtoul.o \
+ Python/pathconfig.o \
Python/peephole.o \
Python/pyarena.o \
Python/pyctype.o \
diff --git a/Modules/getpath.c b/Modules/getpath.c
index 9f5e8b3ff5c..fc2b5442ce2 100644
--- a/Modules/getpath.c
+++ b/Modules/getpath.c
@@ -117,10 +117,7 @@ extern "C" {
typedef struct {
wchar_t *path_env; /* PATH environment variable */
- wchar_t *home; /* PYTHONHOME environment variable */
- wchar_t *module_search_path_env; /* PYTHONPATH environment variable */
- wchar_t *program_name; /* Program name */
wchar_t *pythonpath; /* PYTHONPATH define */
wchar_t *prefix; /* PREFIX define */
wchar_t *exec_prefix; /* EXEC_PREFIX define */
@@ -135,7 +132,6 @@ typedef struct {
static const wchar_t delimiter[2] = {DELIM, '\0'};
static const wchar_t separator[2] = {SEP, '\0'};
-static _PyPathConfig _Py_path_config = _PyPathConfig_INIT;
/* Get file status. Encode the path to the locale encoding. */
@@ -360,14 +356,15 @@ find_env_config_value(FILE * env_file, const wchar_t * key, wchar_t * value)
bytes long.
*/
static int
-search_for_prefix(PyCalculatePath *calculate, wchar_t *prefix)
+search_for_prefix(const _PyMainInterpreterConfig *main_config,
+ PyCalculatePath *calculate, wchar_t *prefix)
{
size_t n;
wchar_t *vpath;
/* If PYTHONHOME is set, we believe it unconditionally */
- if (calculate->home) {
- wcsncpy(prefix, calculate->home, MAXPATHLEN);
+ if (main_config->home) {
+ wcsncpy(prefix, main_config->home, MAXPATHLEN);
prefix[MAXPATHLEN] = L'\0';
wchar_t *delim = wcschr(prefix, DELIM);
if (delim) {
@@ -426,9 +423,10 @@ search_for_prefix(PyCalculatePath *calculate, wchar_t *prefix)
static void
-calculate_prefix(PyCalculatePath *calculate, wchar_t *prefix)
+calculate_prefix(const _PyMainInterpreterConfig *main_config,
+ PyCalculatePath *calculate, wchar_t *prefix)
{
- calculate->prefix_found = search_for_prefix(calculate, prefix);
+ calculate->prefix_found = search_for_prefix(main_config, calculate, prefix);
if (!calculate->prefix_found) {
if (!Py_FrozenFlag) {
fprintf(stderr,
@@ -470,18 +468,19 @@ calculate_reduce_prefix(PyCalculatePath *calculate, wchar_t *prefix)
MAXPATHLEN bytes long.
*/
static int
-search_for_exec_prefix(PyCalculatePath *calculate, wchar_t *exec_prefix)
+search_for_exec_prefix(const _PyMainInterpreterConfig *main_config,
+ PyCalculatePath *calculate, wchar_t *exec_prefix)
{
size_t n;
/* If PYTHONHOME is set, we believe it unconditionally */
- if (calculate->home) {
- wchar_t *delim = wcschr(calculate->home, DELIM);
+ if (main_config->home) {
+ wchar_t *delim = wcschr(main_config->home, DELIM);
if (delim) {
wcsncpy(exec_prefix, delim+1, MAXPATHLEN);
}
else {
- wcsncpy(exec_prefix, calculate->home, MAXPATHLEN);
+ wcsncpy(exec_prefix, main_config->home, MAXPATHLEN);
}
exec_prefix[MAXPATHLEN] = L'\0';
joinpath(exec_prefix, calculate->lib_python);
@@ -552,9 +551,12 @@ search_for_exec_prefix(PyCalculatePath *calculate, wchar_t *exec_prefix)
static void
-calculate_exec_prefix(PyCalculatePath *calculate, wchar_t *exec_prefix)
+calculate_exec_prefix(const _PyMainInterpreterConfig *main_config,
+ PyCalculatePath *calculate, wchar_t *exec_prefix)
{
- calculate->exec_prefix_found = search_for_exec_prefix(calculate, exec_prefix);
+ calculate->exec_prefix_found = search_for_exec_prefix(main_config,
+ calculate,
+ exec_prefix);
if (!calculate->exec_prefix_found) {
if (!Py_FrozenFlag) {
fprintf(stderr,
@@ -585,7 +587,8 @@ calculate_reduce_exec_prefix(PyCalculatePath *calculate, wchar_t *exec_prefix)
static _PyInitError
-calculate_program_full_path(PyCalculatePath *calculate, _PyPathConfig *config)
+calculate_program_full_path(const _PyMainInterpreterConfig *main_config,
+ PyCalculatePath *calculate, _PyPathConfig *config)
{
wchar_t program_full_path[MAXPATHLEN+1];
memset(program_full_path, 0, sizeof(program_full_path));
@@ -604,8 +607,8 @@ calculate_program_full_path(PyCalculatePath *calculate, _PyPathConfig *config)
* other way to find a directory to start the search from. If
* $PATH isn't exported, you lose.
*/
- if (wcschr(calculate->program_name, SEP)) {
- wcsncpy(program_full_path, calculate->program_name, MAXPATHLEN);
+ if (wcschr(main_config->program_name, SEP)) {
+ wcsncpy(program_full_path, main_config->program_name, MAXPATHLEN);
}
#ifdef __APPLE__
/* On Mac OS X, if a script uses an interpreter of the form
@@ -621,11 +624,13 @@ calculate_program_full_path(PyCalculatePath *calculate, _PyPathConfig *config)
else if(0 == _NSGetExecutablePath(execpath, &nsexeclength) &&
execpath[0] == SEP)
{
- size_t r = mbstowcs(program_full_path, execpath, MAXPATHLEN+1);
- if (r == (size_t)-1 || r > MAXPATHLEN) {
- /* Could not convert execpath, or it's too long. */
- program_full_path[0] = '\0';
+ size_t len;
+ wchar_t *path = Py_DecodeLocale(execpath, &len);
+ if (path == NULL) {
+ return DECODE_LOCALE_ERR("executable path", len);
}
+ wcsncpy(program_full_path, path, MAXPATHLEN);
+ PyMem_RawFree(path);
}
#endif /* __APPLE__ */
else if (calculate->path_env) {
@@ -645,7 +650,7 @@ calculate_program_full_path(PyCalculatePath *calculate, _PyPathConfig *config)
wcsncpy(program_full_path, path, MAXPATHLEN);
}
- joinpath(program_full_path, calculate->program_name);
+ joinpath(program_full_path, main_config->program_name);
if (isxfile(program_full_path)) {
break;
}
@@ -810,14 +815,15 @@ calculate_zip_path(PyCalculatePath *calculate, const wchar_t *prefix)
static _PyInitError
-calculate_module_search_path(PyCalculatePath *calculate,
+calculate_module_search_path(const _PyMainInterpreterConfig *main_config,
+ PyCalculatePath *calculate,
const wchar_t *prefix, const wchar_t *exec_prefix,
_PyPathConfig *config)
{
/* Calculate size of return buffer */
size_t bufsz = 0;
- if (calculate->module_search_path_env != NULL) {
- bufsz += wcslen(calculate->module_search_path_env) + 1;
+ if (main_config->module_search_path_env != NULL) {
+ bufsz += wcslen(main_config->module_search_path_env) + 1;
}
wchar_t *defpath = calculate->pythonpath;
@@ -851,8 +857,8 @@ calculate_module_search_path(PyCalculatePath *calculate,
buf[0] = '\0';
/* Run-time value of $PYTHONPATH goes first */
- if (calculate->module_search_path_env) {
- wcscpy(buf, calculate->module_search_path_env);
+ if (main_config->module_search_path_env) {
+ wcscpy(buf, main_config->module_search_path_env);
wcscat(buf, delimiter);
}
@@ -903,10 +909,6 @@ static _PyInitError
calculate_init(PyCalculatePath *calculate,
const _PyMainInterpreterConfig *main_config)
{
- calculate->home = main_config->home;
- calculate->module_search_path_env = main_config->module_search_path_env;
- calculate->program_name = main_config->program_name;
-
size_t len;
char *path = getenv("PATH");
if (path) {
@@ -948,9 +950,12 @@ calculate_free(PyCalculatePath *calculate)
static _PyInitError
-calculate_path_impl(PyCalculatePath *calculate, _PyPathConfig *config)
+calculate_path_impl(const _PyMainInterpreterConfig *main_config,
+ PyCalculatePath *calculate, _PyPathConfig *config)
{
- _PyInitError err = calculate_program_full_path(calculate, config);
+ _PyInitError err;
+
+ err = calculate_program_full_path(main_config, calculate, config);
if (_Py_INIT_FAILED(err)) {
return err;
}
@@ -964,13 +969,13 @@ calculate_path_impl(PyCalculatePath *calculate, _PyPathConfig *config)
wchar_t prefix[MAXPATHLEN+1];
memset(prefix, 0, sizeof(prefix));
- calculate_prefix(calculate, prefix);
+ calculate_prefix(main_config, calculate, prefix);
calculate_zip_path(calculate, prefix);
wchar_t exec_prefix[MAXPATHLEN+1];
memset(exec_prefix, 0, sizeof(exec_prefix));
- calculate_exec_prefix(calculate, exec_prefix);
+ calculate_exec_prefix(main_config, calculate, exec_prefix);
if ((!calculate->prefix_found || !calculate->exec_prefix_found) &&
!Py_FrozenFlag)
@@ -979,8 +984,8 @@ calculate_path_impl(PyCalculatePath *calculate, _PyPathConfig *config)
"Consider setting $PYTHONHOME to <prefix>[:<exec_prefix>]\n");
}
- err = calculate_module_search_path(calculate, prefix, exec_prefix,
- config);
+ err = calculate_module_search_path(main_config, calculate,
+ prefix, exec_prefix, config);
if (_Py_INIT_FAILED(err)) {
return err;
}
@@ -1003,33 +1008,10 @@ calculate_path_impl(PyCalculatePath *calculate, _PyPathConfig *config)
}
-static void
-pathconfig_clear(_PyPathConfig *config)
-{
-#define CLEAR(ATTR) \
- do { \
- PyMem_RawFree(ATTR); \
- ATTR = NULL; \
- } while (0)
-
- CLEAR(config->prefix);
- CLEAR(config->exec_prefix);
- CLEAR(config->program_full_path);
- CLEAR(config->module_search_path);
-#undef CLEAR
-}
-
-
-/* Initialize paths for Py_GetPath(), Py_GetPrefix(), Py_GetExecPrefix()
- and Py_GetProgramFullPath() */
_PyInitError
-_PyPathConfig_Init(const _PyMainInterpreterConfig *main_config)
+_PyPathConfig_Calculate(_PyPathConfig *config,
+ const _PyMainInterpreterConfig *main_config)
{
- if (_Py_path_config.module_search_path) {
- /* Already initialized */
- return _Py_INIT_OK();
- }
-
PyCalculatePath calculate;
memset(&calculate, 0, sizeof(calculate));
@@ -1038,16 +1020,11 @@ _PyPathConfig_Init(const _PyMainInterpreterConfig *main_config)
goto done;
}
- _PyPathConfig new_path_config;
- memset(&new_path_config, 0, sizeof(new_path_config));
-
- err = calculate_path_impl(&calculate, &new_path_config);
+ err = calculate_path_impl(main_config, &calculate, config);
if (_Py_INIT_FAILED(err)) {
- pathconfig_clear(&new_path_config);
goto done;
}
- _Py_path_config = new_path_config;
err = _Py_INIT_OK();
done:
@@ -1055,88 +1032,6 @@ _PyPathConfig_Init(const _PyMainInterpreterConfig *main_config)
return err;
}
-
-static void
-pathconfig_global_init(void)
-{
- if (_Py_path_config.module_search_path) {
- /* Already initialized */
- return;
- }
-
- _PyInitError err;
- _PyMainInterpreterConfig config = _PyMainInterpreterConfig_INIT;
-
- err = _PyMainInterpreterConfig_ReadEnv(&config);
- if (!_Py_INIT_FAILED(err)) {
- err = _PyPathConfig_Init(&config);
- }
- _PyMainInterpreterConfig_Clear(&config);
-
- if (_Py_INIT_FAILED(err)) {
- _Py_FatalInitError(err);
- }
-}
-
-
-void
-_PyPathConfig_Fini(void)
-{
- pathconfig_clear(&_Py_path_config);
-}
-
-
-/* External interface */
-void
-Py_SetPath(const wchar_t *path)
-{
- if (path == NULL) {
- pathconfig_clear(&_Py_path_config);
- return;
- }
-
- _PyPathConfig new_config;
- new_config.program_full_path = _PyMem_RawWcsdup(Py_GetProgramName());
- new_config.exec_prefix = _PyMem_RawWcsdup(L"");
- new_config.prefix = _PyMem_RawWcsdup(L"");
- new_config.module_search_path = _PyMem_RawWcsdup(path);
-
- pathconfig_clear(&_Py_path_config);
- _Py_path_config = new_config;
-}
-
-
-wchar_t *
-Py_GetPath(void)
-{
- pathconfig_global_init();
- return _Py_path_config.module_search_path;
-}
-
-
-wchar_t *
-Py_GetPrefix(void)
-{
- pathconfig_global_init();
- return _Py_path_config.prefix;
-}
-
-
-wchar_t *
-Py_GetExecPrefix(void)
-{
- pathconfig_global_init();
- return _Py_path_config.exec_prefix;
-}
-
-
-wchar_t *
-Py_GetProgramFullPath(void)
-{
- pathconfig_global_init();
- return _Py_path_config.program_full_path;
-}
-
#ifdef __cplusplus
}
#endif
diff --git a/Modules/main.c b/Modules/main.c
index e9d524a1463..4095259b88c 100644
--- a/Modules/main.c
+++ b/Modules/main.c
@@ -412,7 +412,6 @@ typedef struct {
/* non-zero if filename, command (-c) or module (-m) is set
on the command line */
int run_code;
- wchar_t *program_name;
/* Error message if a function failed */
_PyInitError err;
/* PYTHONWARNINGS env var */
@@ -429,7 +428,6 @@ typedef struct {
.config = _PyMainInterpreterConfig_INIT, \
.main_importer_path = NULL, \
.run_code = -1, \
- .program_name = NULL, \
.err = _Py_INIT_OK(), \
.env_warning_options = {0, NULL}}
@@ -455,7 +453,6 @@ pymain_free_impl(_PyMain *pymain)
pymain_optlist_clear(&pymain->env_warning_options);
Py_CLEAR(pymain->main_importer_path);
- PyMem_RawFree(pymain->program_name);
_PyMainInterpreterConfig_Clear(&pymain->config);
@@ -874,14 +871,21 @@ pymain_init_stdio(_PyMain *pymain)
/* Get the program name: use PYTHONEXECUTABLE and __PYVENV_LAUNCHER__
- environment variables on macOS if available, use argv[0] by default.
-
- Return 0 on success.
- Set pymain->err and return -1 on error. */
-static int
-pymain_get_program_name(_PyMain *pymain)
+ environment variables on macOS if available. */
+static _PyInitError
+config_get_program_name(_PyMainInterpreterConfig *config)
{
- assert(pymain->program_name == NULL);
+ assert(config->program_name == NULL);
+
+ /* If Py_SetProgramName() was called, use its value */
+ wchar_t *program_name = _Py_path_config.program_name;
+ if (program_name != NULL) {
+ config->program_name = _PyMem_RawWcsdup(program_name);
+ if (config->program_name == NULL) {
+ return _Py_INIT_NO_MEMORY();
+ }
+ }
+
#ifdef __APPLE__
char *p;
/* On MacOS X, when the Python interpreter is embedded in an
@@ -894,17 +898,13 @@ pymain_get_program_name(_PyMain *pymain)
See Lib/plat-mac/bundlebuiler.py for details about the bootstrap
script. */
if ((p = Py_GETENV("PYTHONEXECUTABLE")) && *p != '\0') {
- wchar_t* buffer;
- size_t len = strlen(p) + 1;
-
- buffer = PyMem_RawMalloc(len * sizeof(wchar_t));
- if (buffer == NULL) {
- pymain->err = _Py_INIT_NO_MEMORY();
- return -1;
+ size_t len;
+ wchar_t* program_name = Py_DecodeLocale(p, &len);
+ if (program_name == NULL) {
+ return DECODE_LOCALE_ERR("PYTHONEXECUTABLE environment "
+ "variable", (Py_ssize_t)len);
}
-
- mbstowcs(buffer, p, len);
- pymain->program_name = buffer;
+ config->program_name = program_name;
}
#ifdef WITH_NEXT_FRAMEWORK
else {
@@ -914,21 +914,30 @@ pymain_get_program_name(_PyMain *pymain)
* the argv0 of the stub executable
*/
size_t len;
- wchar_t* wbuf = Py_DecodeLocale(pyvenv_launcher, &len);
- if (wbuf == NULL) {
- SET_DECODE_ERROR("__PYVENV_LAUNCHER__", len);
- return -1;
+ wchar_t* program_name = Py_DecodeLocale(pyvenv_launcher, &len);
+ if (program_name == NULL) {
+ return DECODE_LOCALE_ERR("__PYVENV_LAUNCHER__ environment "
+ "variable", (Py_ssize_t)len);
}
- pymain->program_name = wbuf;
+ config->program_name = program_name;
}
}
#endif /* WITH_NEXT_FRAMEWORK */
#endif /* __APPLE__ */
- if (pymain->program_name == NULL) {
+ return _Py_INIT_OK();
+}
+
+
+/* If config_get_program_name() found no program name: use argv[0] by default.
+ Return 0 on success. Set pymain->err and return -1 on error. */
+static int
+pymain_get_program_name(_PyMain *pymain)
+{
+ if (pymain->config.program_name == NULL) {
/* Use argv[0] by default */
- pymain->program_name = pymain_wstrdup(pymain, pymain->argv[0]);
- if (pymain->program_name == NULL) {
+ pymain->config.program_name = pymain_wstrdup(pymain, pymain->argv[0]);
+ if (pymain->config.program_name == NULL) {
return -1;
}
}
@@ -950,13 +959,6 @@ pymain_init_main_interpreter(_PyMain *pymain)
{
_PyInitError err;
- /* TODO: Print any exceptions raised by these operations */
- err = _PyMainInterpreterConfig_Read(&pymain->config);
- if (_Py_INIT_FAILED(err)) {
- pymain->err = err;
- return -1;
- }
-
err = _Py_InitializeMainInterpreter(&pymain->config);
if (_Py_INIT_FAILED(err)) {
pymain->err = err;
@@ -1414,14 +1416,13 @@ config_init_pythonpath(_PyMainInterpreterConfig *config)
static _PyInitError
-config_init_pythonhome(_PyMainInterpreterConfig *config)
+config_init_home(_PyMainInterpreterConfig *config)
{
wchar_t *home;
- home = Py_GetPythonHome();
+ /* If Py_SetPythonHome() was called, use its value */
+ home = _Py_path_config.home;
if (home) {
- /* Py_SetPythonHome() has been called before Py_Main(),
- use its value */
config->home = _PyMem_RawWcsdup(home);
if (config->home == NULL) {
return _Py_INIT_NO_MEMORY();
@@ -1441,7 +1442,7 @@ config_init_pythonhome(_PyMainInterpreterConfig *config)
_PyInitError
_PyMainInterpreterConfig_ReadEnv(_PyMainInterpreterConfig *config)
{
- _PyInitError err = config_init_pythonhome(config);
+ _PyInitError err = config_init_home(config);
if (_Py_INIT_FAILED(err)) {
return err;
}
@@ -1451,11 +1452,9 @@ _PyMainInterpreterConfig_ReadEnv(_PyMainInterpreterConfig *config)
return err;
}
- /* FIXME: _PyMainInterpreterConfig_Read() has the same code. Remove it
- here? See also pymain_get_program_name() and pymain_parse_envvars(). */
- config->program_name = _PyMem_RawWcsdup(Py_GetProgramName());
- if (config->program_name == NULL) {
- return _Py_INIT_NO_MEMORY();
+ err = config_get_program_name(config);
+ if (_Py_INIT_FAILED(err)) {
+ return err;
}
return _Py_INIT_OK();
@@ -1481,25 +1480,17 @@ pymain_parse_envvars(_PyMain *pymain)
if (pymain_warnings_envvar(pymain) < 0) {
return -1;
}
- if (pymain_get_program_name(pymain) < 0) {
- return -1;
- }
- core_config->allocator = Py_GETENV("PYTHONMALLOC");
-
- /* FIXME: move pymain_get_program_name() code into
- _PyMainInterpreterConfig_ReadEnv().
- Problem: _PyMainInterpreterConfig_ReadEnv() doesn't have access
- to argv[0]. */
- Py_SetProgramName(pymain->program_name);
- /* Don't free program_name here: the argument to Py_SetProgramName
- must remain valid until Py_FinalizeEx is called. The string is freed
- by pymain_free(). */
_PyInitError err = _PyMainInterpreterConfig_ReadEnv(&pymain->config);
if (_Py_INIT_FAILED(pymain->err)) {
pymain->err = err;
return -1;
}
+ if (pymain_get_program_name(pymain) < 0) {
+ return -1;
+ }
+
+ core_config->allocator = Py_GETENV("PYTHONMALLOC");
/* -X options */
if (pymain_get_xoption(pymain, L"showrefcount")) {
@@ -1555,6 +1546,12 @@ pymain_parse_cmdline_envvars_impl(_PyMain *pymain)
return -1;
}
+ _PyInitError err = _PyMainInterpreterConfig_Read(&pymain->config);
+ if (_Py_INIT_FAILED(err)) {
+ pymain->err = err;
+ return -1;
+ }
+
return 0;
}
@@ -1671,6 +1668,14 @@ pymain_impl(_PyMain *pymain)
other special meaning */
pymain->status = 120;
}
+
+ /* _PyPathConfig_Clear() cannot be called in Py_FinalizeEx().
+ Py_Initialize() and Py_Finalize() can be called multiple times, but it
+ must not "forget" parameters set by Py_SetProgramName(), Py_SetPath() or
+ Py_SetPythonHome(), whereas _PyPathConfig_Clear() clear all these
+ parameters. */
+ _PyPathConfig_Clear(&_Py_path_config);
+
return 0;
}
diff --git a/PC/getpathp.c b/PC/getpathp.c
index ad04b6b6efc..08ed8ccc83f 100644
--- a/PC/getpathp.c
+++ b/PC/getpathp.c
@@ -118,7 +118,6 @@
#endif
typedef struct {
- wchar_t *module_search_path_env; /* PYTHONPATH environment variable */
wchar_t *path_env; /* PATH environment variable */
wchar_t *home; /* PYTHONHOME environment variable */
@@ -126,15 +125,11 @@ typedef struct {
wchar_t *machine_path; /* from HKEY_LOCAL_MACHINE */
wchar_t *user_path; /* from HKEY_CURRENT_USER */
- wchar_t *program_name; /* Program name */
wchar_t argv0_path[MAXPATHLEN+1];
wchar_t zip_path[MAXPATHLEN+1];
} PyCalculatePath;
-static _PyPathConfig _Py_path_config = _PyPathConfig_INIT;
-
-
/* determine if "ch" is a separator character */
static int
is_sep(wchar_t ch)
@@ -503,7 +498,8 @@ get_dll_path(PyCalculatePath *calculate, _PyPathConfig *config)
static _PyInitError
-get_program_full_path(PyCalculatePath *calculate, _PyPathConfig *config)
+get_program_full_path(const _PyMainInterpreterConfig *main_config,
+ PyCalculatePath *calculate, _PyPathConfig *config)
{
wchar_t program_full_path[MAXPATHLEN+1];
memset(program_full_path, 0, sizeof(program_full_path));
@@ -518,12 +514,13 @@ get_program_full_path(PyCalculatePath *calculate, _PyPathConfig *config)
* $PATH isn't exported, you lose.
*/
#ifdef ALTSEP
- if (wcschr(calculate->program_name, SEP) || wcschr(calculate->program_name, ALTSEP))
+ if (wcschr(main_config->program_name, SEP) ||
+ wcschr(main_config->program_name, ALTSEP))
#else
- if (wcschr(calculate->program_name, SEP))
+ if (wcschr(main_config->program_name, SEP))
#endif
{
- wcsncpy(program_full_path, calculate->program_name, MAXPATHLEN);
+ wcsncpy(program_full_path, main_config->program_name, MAXPATHLEN);
}
else if (calculate->path_env) {
wchar_t *path = calculate->path_env;
@@ -542,7 +539,7 @@ get_program_full_path(PyCalculatePath *calculate, _PyPathConfig *config)
}
/* join() is safe for MAXPATHLEN+1 size buffer */
- join(program_full_path, calculate->program_name);
+ join(program_full_path, main_config->program_name);
if (exists(program_full_path)) {
break;
}
@@ -713,9 +710,6 @@ calculate_init(PyCalculatePath *calculate,
const _PyMainInterpreterConfig *main_config)
{
calculate->home = main_config->home;
- calculate->module_search_path_env = main_config->module_search_path_env;
- calculate->program_name = main_config->program_name;
-
calculate->path_env = _wgetenv(L"PATH");
}
@@ -724,12 +718,16 @@ static int
get_pth_filename(wchar_t *spbuffer, _PyPathConfig *config)
{
if (config->dll_path[0]) {
- if (!change_ext(spbuffer, config->dll_path, L"._pth") && exists(spbuffer)) {
+ if (!change_ext(spbuffer, config->dll_path, L"._pth") &&
+ exists(spbuffer))
+ {
return 1;
}
}
if (config->program_full_path[0]) {
- if (!change_ext(spbuffer, config->program_full_path, L"._pth") && exists(spbuffer)) {
+ if (!change_ext(spbuffer, config->program_full_path, L"._pth") &&
+ exists(spbuffer))
+ {
return 1;
}
}
@@ -815,7 +813,9 @@ calculate_home_prefix(PyCalculatePath *calculate, wchar_t *prefix)
static _PyInitError
-calculate_module_search_path(PyCalculatePath *calculate, _PyPathConfig *config, wchar_t *prefix)
+calculate_module_search_path(const _PyMainInterpreterConfig *main_config,
+ PyCalculatePath *calculate, _PyPathConfig *config,
+ wchar_t *prefix)
{
int skiphome = calculate->home==NULL ? 0 : 1;
#ifdef Py_ENABLE_SHARED
@@ -824,8 +824,10 @@ calculate_module_search_path(PyCalculatePath *calculate, _PyPathConfig *config,
#endif
/* We only use the default relative PYTHONPATH if we haven't
anything better to use! */
- int skipdefault = (calculate->module_search_path_env!=NULL || calculate->home!=NULL || \
- calculate->machine_path!=NULL || calculate->user_path!=NULL);
+ int skipdefault = (main_config->module_search_path_env != NULL ||
+ calculate->home != NULL ||
+ calculate->machine_path != NULL ||
+ calculate->user_path != NULL);
/* We need to construct a path from the following parts.
(1) the PYTHONPATH environment variable, if set;
@@ -861,8 +863,8 @@ calculate_module_search_path(PyCalculatePath *calculate, _PyPathConfig *config,
bufsz += wcslen(calculate->machine_path) + 1;
}
bufsz += wcslen(calculate->zip_path) + 1;
- if (calculate->module_search_path_env != NULL) {
- bufsz += wcslen(calculate->module_search_path_env) + 1;
+ if (main_config->module_search_path_env != NULL) {
+ bufsz += wcslen(main_config->module_search_path_env) + 1;
}
wchar_t *buf, *start_buf;
@@ -870,9 +872,9 @@ calculate_module_search_path(PyCalculatePath *calculate, _PyPathConfig *config,
if (buf == NULL) {
/* We can't exit, so print a warning and limp along */
fprintf(stderr, "Can't malloc dynamic PYTHONPATH.\n");
- if (calculate->module_search_path_env) {
+ if (main_config->module_search_path_env) {
fprintf(stderr, "Using environment $PYTHONPATH.\n");
- config->module_search_path = calculate->module_search_path_env;
+ config->module_search_path = main_config->module_search_path_env;
}
else {
fprintf(stderr, "Using default static path.\n");
@@ -882,8 +884,9 @@ calculate_module_search_path(PyCalculatePath *calculate, _PyPathConfig *config,
}
start_buf = buf;
- if (calculate->module_search_path_env) {
- if (wcscpy_s(buf, bufsz - (buf - start_buf), calculate->module_search_path_env)) {
+ if (main_config->module_search_path_env) {
+ if (wcscpy_s(buf, bufsz - (buf - start_buf),
+ main_config->module_search_path_env)) {
return INIT_ERR_BUFFER_OVERFLOW();
}
buf = wcschr(buf, L'\0');
@@ -996,8 +999,8 @@ calculate_module_search_path(PyCalculatePath *calculate, _PyPathConfig *config,
static _PyInitError
-calculate_path_impl(PyCalculatePath *calculate, _PyPathConfig *config,
- const _PyMainInterpreterConfig *main_config)
+calculate_path_impl(const _PyMainInterpreterConfig *main_config,
+ PyCalculatePath *calculate, _PyPathConfig *config)
{
_PyInitError err;
@@ -1006,7 +1009,7 @@ calculate_path_impl(PyCalculatePath *calculate, _PyPathConfig *config,
return err;
}
- err = get_program_full_path(calculate, config);
+ err = get_program_full_path(main_config, calculate, config);
if (_Py_INIT_FAILED(err)) {
return err;
}
@@ -1032,7 +1035,7 @@ calculate_path_impl(PyCalculatePath *calculate, _PyPathConfig *config,
calculate_home_prefix(calculate, prefix);
- err = calculate_module_search_path(calculate, config, prefix);
+ err = calculate_module_search_path(main_config, calculate, config, prefix);
if (_Py_INIT_FAILED(err)) {
return err;
}
@@ -1055,145 +1058,28 @@ calculate_free(PyCalculatePath *calculate)
}
-static void
-pathconfig_clear(_PyPathConfig *config)
-{
-#define CLEAR(ATTR) \
- do { \
- PyMem_RawFree(ATTR); \
- ATTR = NULL; \
- } while (0)
-
- CLEAR(config->prefix);
- CLEAR(config->program_full_path);
- CLEAR(config->dll_path);
- CLEAR(config->module_search_path);
-#undef CLEAR
-}
-
-
-/* Initialize paths for Py_GetPath(), Py_GetPrefix(), Py_GetExecPrefix()
- and Py_GetProgramFullPath() */
_PyInitError
-_PyPathConfig_Init(const _PyMainInterpreterConfig *main_config)
+_PyPathConfig_Calculate(_PyPathConfig *config,
+ const _PyMainInterpreterConfig *main_config)
{
- if (_Py_path_config.module_search_path) {
- /* Already initialized */
- return _Py_INIT_OK();
- }
-
- _PyInitError err;
-
PyCalculatePath calculate;
memset(&calculate, 0, sizeof(calculate));
calculate_init(&calculate, main_config);
- _PyPathConfig new_path_config;
- memset(&new_path_config, 0, sizeof(new_path_config));
-
- err = calculate_path_impl(&calculate, &new_path_config, main_config);
+ _PyInitError err = calculate_path_impl(main_config, &calculate, config);
if (_Py_INIT_FAILED(err)) {
goto done;
}
- _Py_path_config = new_path_config;
err = _Py_INIT_OK();
done:
- if (_Py_INIT_FAILED(err)) {
- pathconfig_clear(&new_path_config);
- }
calculate_free(&calculate);
return err;
}
-static void
-pathconfig_global_init(void)
-{
- if (_Py_path_config.module_search_path) {
- /* Already initialized */
- return;
- }
-
- _PyInitError err;
- _PyMainInterpreterConfig config = _PyMainInterpreterConfig_INIT;
-
- err = _PyMainInterpreterConfig_ReadEnv(&config);
- if (!_Py_INIT_FAILED(err)) {
- err = _PyPathConfig_Init(&config);
- }
- _PyMainInterpreterConfig_Clear(&config);
-
- if (_Py_INIT_FAILED(err)) {
- _Py_FatalInitError(err);
- }
-}
-
-
-void
-_PyPathConfig_Fini(void)
-{
- pathconfig_clear(&_Py_path_config);
-}
-
-
-/* External interface */
-
-void
-Py_SetPath(const wchar_t *path)
-{
- if (_Py_path_config.module_search_path != NULL) {
- pathconfig_clear(&_Py_path_config);
- }
-
- if (path == NULL) {
- return;
- }
-
- _PyPathConfig new_config;
- new_config.program_full_path = _PyMem_RawWcsdup(Py_GetProgramName());
- new_config.prefix = _PyMem_RawWcsdup(L"");
- new_config.dll_path = _PyMem_RawWcsdup(L"");
- new_config.module_search_path = _PyMem_RawWcsdup(path);
-
- pathconfig_clear(&_Py_path_config);
- _Py_path_config = new_config;
-}
-
-
-wchar_t *
-Py_GetPath(void)
-{
- pathconfig_global_init();
- return _Py_path_config.module_search_path;
-}
-
-
-wchar_t *
-Py_GetPrefix(void)
-{
- pathconfig_global_init();
- return _Py_path_config.prefix;
-}
-
-
-wchar_t *
-Py_GetExecPrefix(void)
-{
- return Py_GetPrefix();
-}
-
-
-wchar_t *
-Py_GetProgramFullPath(void)
-{
- pathconfig_global_init();
- return _Py_path_config.program_full_path;
-}
-
-
/* Load python3.dll before loading any extension module that might refer
to it. That way, we can be sure that always the python3.dll corresponding
to this python DLL is loaded, not a python3.dll that might be on the path
@@ -1203,7 +1089,7 @@ Py_GetProgramFullPath(void)
static int python3_checked = 0;
static HANDLE hPython3;
int
-_Py_CheckPython3()
+_Py_CheckPython3(void)
{
wchar_t py3path[MAXPATHLEN+1];
wchar_t *s;
diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj
index 3793cbda882..b430e05b629 100644
--- a/PCbuild/pythoncore.vcxproj
+++ b/PCbuild/pythoncore.vcxproj
@@ -381,6 +381,7 @@
<ClCompile Include="..\Python\modsupport.c" />
<ClCompile Include="..\Python\mysnprintf.c" />
<ClCompile Include="..\Python\mystrtoul.c" />
+ <ClCompile Include="..\Python\pathconfig.c" />
<ClCompile Include="..\Python\peephole.c" />
<ClCompile Include="..\Python\pyarena.c" />
<ClCompile Include="..\Python\pyctype.c" />
diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters
index 1d33c6e2cc2..c9aa3da355e 100644
--- a/PCbuild/pythoncore.vcxproj.filters
+++ b/PCbuild/pythoncore.vcxproj.filters
@@ -896,6 +896,9 @@
<ClCompile Include="..\Python\mystrtoul.c">
<Filter>Python</Filter>
</ClCompile>
+ <ClCompile Include="..\Python\pathconfig.c">
+ <Filter>Python</Filter>
+ </ClCompile>
<ClCompile Include="..\Python\peephole.c">
<Filter>Python</Filter>
</ClCompile>
diff --git a/Python/pathconfig.c b/Python/pathconfig.c
new file mode 100644
index 00000000000..6a03f7dca1b
--- /dev/null
+++ b/Python/pathconfig.c
@@ -0,0 +1,266 @@
+/* Path configuration like module_search_path (sys.path) */
+
+#include "Python.h"
+#include "osdefs.h"
+#include "internal/pystate.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+_PyPathConfig _Py_path_config = _PyPathConfig_INIT;
+
+
+void
+_PyPathConfig_Clear(_PyPathConfig *config)
+{
+ /* _PyMem_SetDefaultAllocator() is needed to get a known memory allocator,
+ since Py_SetPath(), Py_SetPythonHome() and Py_SetProgramName() can be
+ called before Py_Initialize() which can changes the memory allocator. */
+ PyMemAllocatorEx old_alloc;
+ _PyMem_SetDefaultAllocator(PYMEM_DOMAIN_RAW, &old_alloc);
+
+#define CLEAR(ATTR) \
+ do { \
+ PyMem_RawFree(ATTR); \
+ ATTR = NULL; \
+ } while (0)
+
+ CLEAR(config->prefix);
+ CLEAR(config->program_full_path);
+#ifdef MS_WINDOWS
+ CLEAR(config->dll_path);
+#else
+ CLEAR(config->exec_prefix);
+#endif
+ CLEAR(config->module_search_path);
+ CLEAR(config->home);
+ CLEAR(config->program_name);
+#undef CLEAR
+
+ PyMem_SetAllocator(PYMEM_DOMAIN_RAW, &old_alloc);
+}
+
+
+/* Initialize paths for Py_GetPath(), Py_GetPrefix(), Py_GetExecPrefix()
+ and Py_GetProgramFullPath() */
+_PyInitError
+_PyPathConfig_Init(const _PyMainInterpreterConfig *main_config)
+{
+ if (_Py_path_config.module_search_path) {
+ /* Already initialized */
+ return _Py_INIT_OK();
+ }
+
+ _PyInitError err;
+ _PyPathConfig new_config = _PyPathConfig_INIT;
+
+ PyMemAllocatorEx old_alloc;
+ _PyMem_SetDefaultAllocator(PYMEM_DOMAIN_RAW, &old_alloc);
+
+ /* Calculate program_full_path, prefix, exec_prefix (Unix)
+ or dll_path (Windows), and module_search_path */
+ err = _PyPathConfig_Calculate(&new_config, main_config);
+ if (_Py_INIT_FAILED(err)) {
+ _PyPathConfig_Clear(&new_config);
+ goto done;
+ }
+
+ /* Copy home and program_name from main_config */
+ if (main_config->home != NULL) {
+ new_config.home = _PyMem_RawWcsdup(main_config->home);
+ if (new_config.home == NULL) {
+ err = _Py_INIT_NO_MEMORY();
+ goto done;
+ }
+ }
+ else {
+ new_config.home = NULL;
+ }
+
+ new_config.program_name = _PyMem_RawWcsdup(main_config->program_name);
+ if (new_config.program_name == NULL) {
+ err = _Py_INIT_NO_MEMORY();
+ goto done;
+ }
+
+ _PyPathConfig_Clear(&_Py_path_config);
+ _Py_path_config = new_config;
+
+ err = _Py_INIT_OK();
+
+done:
+ PyMem_SetAllocator(PYMEM_DOMAIN_RAW, &old_alloc);
+ return err;
+}
+
+
+static void
+pathconfig_global_init(void)
+{
+ if (_Py_path_config.module_search_path) {
+ /* Already initialized */
+ return;
+ }
+
+ _PyInitError err;
+ _PyMainInterpreterConfig config = _PyMainInterpreterConfig_INIT;
+
+ err = _PyMainInterpreterConfig_ReadEnv(&config);
+ if (_Py_INIT_FAILED(err)) {
+ goto error;
+ }
+
+ err = _PyMainInterpreterConfig_Read(&config);
+ if (_Py_INIT_FAILED(err)) {
+ goto error;
+ }
+
+ err = _PyPathConfig_Init(&config);
+ if (_Py_INIT_FAILED(err)) {
+ goto error;
+ }
+
+ _PyMainInterpreterConfig_Clear(&config);
+ return;
+
+error:
+ _PyMainInterpreterConfig_Clear(&config);
+ _Py_FatalInitError(err);
+}
+
+
+/* External interface */
+
+void
+Py_SetPath(const wchar_t *path)
+{
+ if (path == NULL) {
+ _PyPathConfig_Clear(&_Py_path_config);
+ return;
+ }
+
+ PyMemAllocatorEx old_alloc;
+ _PyMem_SetDefaultAllocator(PYMEM_DOMAIN_RAW, &old_alloc);
+
+ _PyPathConfig new_config;
+ new_config.program_full_path = _PyMem_RawWcsdup(Py_GetProgramName());
+ new_config.prefix = _PyMem_RawWcsdup(L"");
+#ifdef MS_WINDOWS
+ new_config.dll_path = _PyMem_RawWcsdup(L"");
+#else
+ new_config.exec_prefix = _PyMem_RawWcsdup(L"");
+#endif
+ new_config.module_search_path = _PyMem_RawWcsdup(path);
+
+ /* steal the home and program_name values (to leave them unchanged) */
+ new_config.home = _Py_path_config.home;
+ _Py_path_config.home = NULL;
+ new_config.program_name = _Py_path_config.program_name;
+ _Py_path_config.program_name = NULL;
+
+ _PyPathConfig_Clear(&_Py_path_config);
+ _Py_path_config = new_config;
+
+ PyMem_SetAllocator(PYMEM_DOMAIN_RAW, &old_alloc);
+}
+
+
+void
+Py_SetPythonHome(wchar_t *home)
+{
+ if (home == NULL) {
+ return;
+ }
+
+ PyMemAllocatorEx old_alloc;
+ _PyMem_SetDefaultAllocator(PYMEM_DOMAIN_RAW, &old_alloc);
+
+ PyMem_RawFree(_Py_path_config.home);
+ _Py_path_config.home = _PyMem_RawWcsdup(home);
+
+ PyMem_SetAllocator(PYMEM_DOMAIN_RAW, &old_alloc);
+
+ if (_Py_path_config.home == NULL) {
+ Py_FatalError("Py_SetPythonHome() failed: out of memory");
+ }
+}
+
+
+void
+Py_SetProgramName(wchar_t *program_name)
+{
+ if (program_name == NULL || program_name[0] == L'\0') {
+ return;
+ }
+
+ PyMemAllocatorEx old_alloc;
+ _PyMem_SetDefaultAllocator(PYMEM_DOMAIN_RAW, &old_alloc);
+
+ PyMem_RawFree(_Py_path_config.program_name);
+ _Py_path_config.program_name = _PyMem_RawWcsdup(program_name);
+
+ PyMem_SetAllocator(PYMEM_DOMAIN_RAW, &old_alloc);
+
+ if (_Py_path_config.program_name == NULL) {
+ Py_FatalError("Py_SetProgramName() failed: out of memory");
+ }
+}
+
+
+wchar_t *
+Py_GetPath(void)
+{
+ pathconfig_global_init();
+ return _Py_path_config.module_search_path;
+}
+
+
+wchar_t *
+Py_GetPrefix(void)
+{
+ pathconfig_global_init();
+ return _Py_path_config.prefix;
+}
+
+
+wchar_t *
+Py_GetExecPrefix(void)
+{
+#ifdef MS_WINDOWS
+ return Py_GetPrefix();
+#else
+ pathconfig_global_init();
+ return _Py_path_config.exec_prefix;
+#endif
+}
+
+
+wchar_t *
+Py_GetProgramFullPath(void)
+{
+ pathconfig_global_init();
+ return _Py_path_config.program_full_path;
+}
+
+
+wchar_t*
+Py_GetPythonHome(void)
+{
+ pathconfig_global_init();
+ return _Py_path_config.home;
+}
+
+
+wchar_t *
+Py_GetProgramName(void)
+{
+ pathconfig_global_init();
+ return _Py_path_config.program_name;
+}
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c
index f0a49f91fb8..523397f1269 100644
--- a/Python/pylifecycle.c
+++ b/Python/pylifecycle.c
@@ -804,7 +804,12 @@ _PyMainInterpreterConfig_Read(_PyMainInterpreterConfig *config)
}
if (config->program_name == NULL) {
- config->program_name = _PyMem_RawWcsdup(Py_GetProgramName());
+#ifdef MS_WINDOWS
+ const wchar_t *program_name = L"python";
+#else
+ const wchar_t *program_name = L"python3";
+#endif
+ config->program_name = _PyMem_RawWcsdup(program_name);
if (config->program_name == NULL) {
return _Py_INIT_NO_MEMORY();
}
@@ -1273,8 +1278,6 @@ Py_FinalizeEx(void)
call_ll_exitfuncs();
- _PyPathConfig_Fini();
-
_PyRuntime_Finalize();
return status;
}
@@ -1491,61 +1494,6 @@ Py_EndInterpreter(PyThreadState *tstate)
PyInterpreterState_Delete(interp);
}
-#ifdef MS_WINDOWS
-static wchar_t *progname = L"python";
-#else
-static wchar_t *progname = L"python3";
-#endif
-
-void
-Py_SetProgramName(wchar_t *pn)
-{
- if (pn && *pn)
- progname = pn;
-}
-
-wchar_t *
-Py_GetProgramName(void)
-{
- return progname;
-}
-
-static wchar_t *default_home = NULL;
-
-void
-Py_SetPythonHome(wchar_t *home)
-{
- default_home = home;
-}
-
-
-wchar_t*
-Py_GetPythonHome(void)
-{
- /* Use a static buffer to avoid heap memory allocation failure.
- Py_GetPythonHome() doesn't allow to report error, and the caller
- doesn't release memory. */
- static wchar_t buffer[MAXPATHLEN+1];
-
- if (default_home) {
- return default_home;
- }
-
- char *home = Py_GETENV("PYTHONHOME");
- if (!home) {
- return NULL;
- }
-
- size_t size = Py_ARRAY_LENGTH(buffer);
- size_t r = mbstowcs(buffer, home, size);
- if (r == (size_t)-1 || r >= size) {
- /* conversion failed or the static buffer is too small */
- return NULL;
- }
-
- return buffer;
-}
-
/* Add the __main__ module */
static _PyInitError
1
0
bpo-25054, bpo-1647489: Added support of splitting on zerowidth patterns. (#4471)
by Serhiy Storchaka 04 Dec '17
by Serhiy Storchaka 04 Dec '17
04 Dec '17
https://github.com/python/cpython/commit/70d56fb52582d9d3f7c00860d6e90570c6…
commit: 70d56fb52582d9d3f7c00860d6e90570c6259371
branch: master
author: Serhiy Storchaka <storchaka(a)gmail.com>
committer: GitHub <noreply(a)github.com>
date: 2017-12-04T14:29:05+02:00
summary:
bpo-25054, bpo-1647489: Added support of splitting on zerowidth patterns. (#4471)
Also fixed searching patterns that could match an empty string.
files:
A Misc/NEWS.d/next/Library/2017-11-20-01-01-01.bpo-25054.rOlRV6.rst
A Misc/NEWS.d/next/Library/2017-11-20-01-29-46.bpo-1647489.-ZNNkh.rst
M Doc/library/re.rst
M Doc/whatsnew/3.7.rst
M Lib/doctest.py
M Lib/test/test_re.py
M Modules/_sre.c
M Modules/sre.h
M Modules/sre_lib.h
diff --git a/Doc/library/re.rst b/Doc/library/re.rst
index 8e6eb30f836..dae1d7ea10a 100644
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@@ -708,37 +708,19 @@ form.
That way, separator components are always found at the same relative
indices within the result list.
- .. note::
-
- :func:`split` doesn't currently split a string on an empty pattern match.
- For example::
-
- >>> re.split('x*', 'axbc')
- ['a', 'bc']
+ The pattern can match empty strings. ::
- Even though ``'x*'`` also matches 0 'x' before 'a', between 'b' and 'c',
- and after 'c', currently these matches are ignored. The correct behavior
- (i.e. splitting on empty matches too and returning ``['', 'a', 'b', 'c',
- '']``) will be implemented in future versions of Python, but since this
- is a backward incompatible change, a :exc:`FutureWarning` will be raised
- in the meanwhile.
-
- Patterns that can only match empty strings currently never split the
- string. Since this doesn't match the expected behavior, a
- :exc:`ValueError` will be raised starting from Python 3.5::
-
- >>> re.split("^$", "foo\n\nbar\n", flags=re.M)
- Traceback (most recent call last):
- File "<stdin>", line 1, in <module>
- ...
- ValueError: split() requires a non-empty pattern match.
+ >>> re.split(r'\b', 'Words, words, words.')
+ ['', 'Words', ', ', 'words', ', ', 'words', '.']
+ >>> re.split(r'(\W*)', '...words...')
+ ['', '...', 'w', '', 'o', '', 'r', '', 'd', '', 's', '...', '']
.. versionchanged:: 3.1
Added the optional flags argument.
- .. versionchanged:: 3.5
- Splitting on a pattern that could match an empty string now raises
- a warning. Patterns that can only match empty strings are now rejected.
+ .. versionchanged:: 3.7
+ Added support of splitting on a pattern that could match an empty string.
+
.. function:: findall(pattern, string, flags=0)
@@ -746,8 +728,10 @@ form.
strings. The *string* is scanned left-to-right, and matches are returned in
the order found. If one or more groups are present in the pattern, return a
list of groups; this will be a list of tuples if the pattern has more than
- one group. Empty matches are included in the result unless they touch the
- beginning of another match.
+ one group. Empty matches are included in the result.
+
+ .. versionchanged:: 3.7
+ Non-empty matches can now start just after a previous empty match.
.. function:: finditer(pattern, string, flags=0)
@@ -755,8 +739,10 @@ form.
Return an :term:`iterator` yielding :ref:`match objects <match-objects>` over
all non-overlapping matches for the RE *pattern* in *string*. The *string*
is scanned left-to-right, and matches are returned in the order found. Empty
- matches are included in the result unless they touch the beginning of another
- match.
+ matches are included in the result.
+
+ .. versionchanged:: 3.7
+ Non-empty matches can now start just after a previous empty match.
.. function:: sub(pattern, repl, string, count=0, flags=0)
diff --git a/Doc/whatsnew/3.7.rst b/Doc/whatsnew/3.7.rst
index b6dad4eab6b..3d23aa773d7 100644
--- a/Doc/whatsnew/3.7.rst
+++ b/Doc/whatsnew/3.7.rst
@@ -364,6 +364,10 @@ The flags :const:`re.ASCII`, :const:`re.LOCALE` and :const:`re.UNICODE`
can be set within the scope of a group.
(Contributed by Serhiy Storchaka in :issue:`31690`.)
+:func:`re.split` now supports splitting on a pattern like ``r'\b'``,
+``'^$'`` or ``(?=-)`` that matches an empty string.
+(Contributed by Serhiy Storchaka in :issue:`25054`.)
+
string
------
@@ -768,6 +772,23 @@ Changes in the Python API
avoid a warning escape them with a backslash.
(Contributed by Serhiy Storchaka in :issue:`30349`.)
+* The result of splitting a string on a :mod:`regular expression <re>`
+ that could match an empty string has been changed. For example
+ splitting on ``r'\s*'`` will now split not only on whitespaces as it
+ did previously, but also between any pair of non-whitespace
+ characters. The previous behavior can be restored by changing the pattern
+ to ``r'\s+'``. A :exc:`FutureWarning` was emitted for such patterns since
+ Python 3.5.
+
+ For patterns that match both empty and non-empty strings, the result of
+ searching for all matches may also be changed in other cases. For example
+ in the string ``'a\n\n'``, the pattern ``r'(?m)^\s*?$'`` will not only
+ match empty strings at positions 2 and 3, but also the string ``'\n'`` at
+ positions 2--3. To match only blank lines, the pattern should be rewritten
+ as ``r'(?m)^[^\S\n]*$'``.
+
+ (Contributed by Serhiy Storchaka in :issue:`25054`.)
+
* :class:`tracemalloc.Traceback` frames are now sorted from oldest to most
recent to be more consistent with :mod:`traceback`.
(Contributed by Jesse Bakker in :issue:`32121`.)
diff --git a/Lib/doctest.py b/Lib/doctest.py
index 5e5bc21a038..c1d8a1db111 100644
--- a/Lib/doctest.py
+++ b/Lib/doctest.py
@@ -1611,7 +1611,7 @@ def check_output(self, want, got, optionflags):
'', want)
# If a line in got contains only spaces, then remove the
# spaces.
- got = re.sub(r'(?m)^\s*?$', '', got)
+ got = re.sub(r'(?m)^[^\S\n]+$', '', got)
if got == want:
return True
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index ee87446b792..2344d71abf2 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -331,21 +331,21 @@ def test_re_split(self):
['', 'a', '', '', 'c'])
for sep, expected in [
- (':*', ['', 'a', 'b', 'c']),
- ('(?::*)', ['', 'a', 'b', 'c']),
- ('(:*)', ['', ':', 'a', ':', 'b', '::', 'c']),
- ('(:)*', ['', ':', 'a', ':', 'b', ':', 'c']),
+ (':*', ['', 'a', 'b', 'c', '']),
+ ('(?::*)', ['', 'a', 'b', 'c', '']),
+ ('(:*)', ['', ':', 'a', ':', 'b', '::', 'c', '', '']),
+ ('(:)*', ['', ':', 'a', ':', 'b', ':', 'c', None, '']),
]:
- with self.subTest(sep=sep), self.assertWarns(FutureWarning):
+ with self.subTest(sep=sep):
self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)
for sep, expected in [
- ('', [':a:b::c']),
- (r'\b', [':a:b::c']),
- (r'(?=:)', [':a:b::c']),
- (r'(?<=:)', [':a:b::c']),
+ ('', ['', ':', 'a', ':', 'b', ':', ':', 'c', '']),
+ (r'\b', [':', 'a', ':', 'b', '::', 'c', '']),
+ (r'(?=:)', ['', ':a', ':b', ':', ':c']),
+ (r'(?<=:)', [':', 'a:', 'b:', ':', 'c']),
]:
- with self.subTest(sep=sep), self.assertRaises(ValueError):
+ with self.subTest(sep=sep):
self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)
def test_qualified_re_split(self):
@@ -356,9 +356,8 @@ def test_qualified_re_split(self):
['', ':', 'a', ':', 'b::c'])
self.assertEqual(re.split("(:+)", ":a:b::c", maxsplit=2),
['', ':', 'a', ':', 'b::c'])
- with self.assertWarns(FutureWarning):
- self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
- ['', ':', 'a', ':', 'b::c'])
+ self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
+ ['', ':', 'a', ':', 'b::c'])
def test_re_findall(self):
self.assertEqual(re.findall(":+", "abc"), [])
@@ -1751,6 +1750,25 @@ def test_match_repr(self):
"span=(3, 5), match='bb'>" %
(type(second).__module__, type(second).__qualname__))
+ def test_zerowidth(self):
+ # Issues 852532, 1647489, 3262, 25054.
+ self.assertEqual(re.split(r"\b", "a::bc"), ['', 'a', '::', 'bc', ''])
+ self.assertEqual(re.split(r"\b|:+", "a::bc"), ['', 'a', '', 'bc', ''])
+ self.assertEqual(re.split(r"(?<!\w)(?=\w)|:+", "a::bc"), ['', 'a', 'bc'])
+ self.assertEqual(re.split(r"(?<=\w)(?!\w)|:+", "a::bc"), ['a', '', 'bc', ''])
+
+ self.assertEqual(re.sub(r"\b", "-", "a::bc"), '-a-::-bc-')
+ self.assertEqual(re.sub(r"\b|:+", "-", "a::bc"), '-a--bc-')
+ self.assertEqual(re.sub(r"(\b|:+)", r"[\1]", "a::bc"), '[]a[][::]bc[]')
+
+ self.assertEqual(re.findall(r"\b|:+", "a::bc"), ['', '', '::', '', ''])
+ self.assertEqual(re.findall(r"\b|\w+", "a::bc"),
+ ['', 'a', '', '', 'bc', ''])
+
+ self.assertEqual([m.span() for m in re.finditer(r"\b|:+", "a::bc")],
+ [(0, 0), (1, 1), (1, 3), (3, 3), (5, 5)])
+ self.assertEqual([m.span() for m in re.finditer(r"\b|\w+", "a::bc")],
+ [(0, 0), (0, 1), (1, 1), (3, 3), (3, 5), (5, 5)])
def test_bug_2537(self):
# issue 2537: empty submatches
diff --git a/Misc/NEWS.d/next/Library/2017-11-20-01-01-01.bpo-25054.rOlRV6.rst b/Misc/NEWS.d/next/Library/2017-11-20-01-01-01.bpo-25054.rOlRV6.rst
new file mode 100644
index 00000000000..d30bdbeeb7d
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2017-11-20-01-01-01.bpo-25054.rOlRV6.rst
@@ -0,0 +1 @@
+Added support of splitting on a pattern that could match an empty string.
diff --git a/Misc/NEWS.d/next/Library/2017-11-20-01-29-46.bpo-1647489.-ZNNkh.rst b/Misc/NEWS.d/next/Library/2017-11-20-01-29-46.bpo-1647489.-ZNNkh.rst
new file mode 100644
index 00000000000..7c741ad0762
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2017-11-20-01-29-46.bpo-1647489.-ZNNkh.rst
@@ -0,0 +1,3 @@
+Fixed searching regular expression patterns that could match an empty
+string. Non-empty string can now be correctly found after matching an empty
+string.
diff --git a/Modules/_sre.c b/Modules/_sre.c
index a9b6b50e84e..68fc523c251 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -446,6 +446,8 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
state->isbytes = isbytes;
state->charsize = charsize;
+ state->match_all = 0;
+ state->must_advance = 0;
state->beginning = ptr;
@@ -559,14 +561,14 @@ pattern_dealloc(PatternObject* self)
}
LOCAL(Py_ssize_t)
-sre_match(SRE_STATE* state, SRE_CODE* pattern, int match_all)
+sre_match(SRE_STATE* state, SRE_CODE* pattern)
{
if (state->charsize == 1)
- return sre_ucs1_match(state, pattern, match_all);
+ return sre_ucs1_match(state, pattern, 1);
if (state->charsize == 2)
- return sre_ucs2_match(state, pattern, match_all);
+ return sre_ucs2_match(state, pattern, 1);
assert(state->charsize == 4);
- return sre_ucs4_match(state, pattern, match_all);
+ return sre_ucs4_match(state, pattern, 1);
}
LOCAL(Py_ssize_t)
@@ -606,7 +608,7 @@ _sre_SRE_Pattern_match_impl(PatternObject *self, PyObject *string,
TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
- status = sre_match(&state, PatternObject_GetCode(self), 0);
+ status = sre_match(&state, PatternObject_GetCode(self));
TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
if (PyErr_Occurred()) {
@@ -645,7 +647,8 @@ _sre_SRE_Pattern_fullmatch_impl(PatternObject *self, PyObject *string,
TRACE(("|%p|%p|FULLMATCH\n", PatternObject_GetCode(self), state.ptr));
- status = sre_match(&state, PatternObject_GetCode(self), 1);
+ state.match_all = 1;
+ status = sre_match(&state, PatternObject_GetCode(self));
TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
if (PyErr_Occurred()) {
@@ -808,11 +811,8 @@ _sre_SRE_Pattern_findall_impl(PatternObject *self, PyObject *string,
if (status < 0)
goto error;
- if (state.ptr == state.start)
- state.start = (void*) ((char*) state.ptr + state.charsize);
- else
- state.start = state.ptr;
-
+ state.must_advance = (state.ptr == state.start);
+ state.start = state.ptr;
}
state_fini(&state);
@@ -901,17 +901,6 @@ _sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string,
void* last;
assert(self->codesize != 0);
- if (self->code[0] != SRE_OP_INFO || self->code[3] == 0) {
- if (self->code[0] == SRE_OP_INFO && self->code[4] == 0) {
- PyErr_SetString(PyExc_ValueError,
- "split() requires a non-empty pattern match.");
- return NULL;
- }
- if (PyErr_WarnEx(PyExc_FutureWarning,
- "split() requires a non-empty pattern match.",
- 1) < 0)
- return NULL;
- }
if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX))
return NULL;
@@ -942,14 +931,6 @@ _sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string,
goto error;
}
- if (state.start == state.ptr) {
- if (last == state.end || state.ptr == state.end)
- break;
- /* skip one character */
- state.start = (void*) ((char*) state.ptr + state.charsize);
- continue;
- }
-
/* get segment before this match */
item = getslice(state.isbytes, state.beginning,
string, STATE_OFFSET(&state, last),
@@ -974,7 +955,7 @@ _sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string,
}
n = n + 1;
-
+ state.must_advance = 1;
last = state.start = state.ptr;
}
@@ -1101,9 +1082,7 @@ pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
if (status < 0)
goto error;
- } else if (i == b && i == e && n > 0)
- /* ignore empty match on latest position */
- goto next;
+ }
if (filter_is_callable) {
/* pass match object through filter */
@@ -1130,16 +1109,8 @@ pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
i = e;
n = n + 1;
-
-next:
- /* move on */
- if (state.ptr == state.end)
- break;
- if (state.ptr == state.start)
- state.start = (void*) ((char*) state.ptr + state.charsize);
- else
- state.start = state.ptr;
-
+ state.must_advance = 1;
+ state.start = state.ptr;
}
/* get segment following last match */
@@ -2450,7 +2421,7 @@ _sre_SRE_Scanner_match_impl(ScannerObject *self)
state->ptr = state->start;
- status = sre_match(state, PatternObject_GetCode(self->pattern), 0);
+ status = sre_match(state, PatternObject_GetCode(self->pattern));
if (PyErr_Occurred())
return NULL;
@@ -2459,12 +2430,10 @@ _sre_SRE_Scanner_match_impl(ScannerObject *self)
if (status == 0)
state->start = NULL;
- else if (state->ptr != state->start)
+ else {
+ state->must_advance = (state->ptr == state->start);
state->start = state->ptr;
- else if (state->ptr != state->end)
- state->start = (void*) ((char*) state->ptr + state->charsize);
- else
- state->start = NULL;
+ }
return match;
}
@@ -2499,12 +2468,10 @@ _sre_SRE_Scanner_search_impl(ScannerObject *self)
if (status == 0)
state->start = NULL;
- else if (state->ptr != state->start)
+ else {
+ state->must_advance = (state->ptr == state->start);
state->start = state->ptr;
- else if (state->ptr != state->end)
- state->start = (void*) ((char*) state->ptr + state->charsize);
- else
- state->start = NULL;
+ }
return match;
}
diff --git a/Modules/sre.h b/Modules/sre.h
index 585d2841a66..a7284881457 100644
--- a/Modules/sre.h
+++ b/Modules/sre.h
@@ -67,6 +67,7 @@ typedef struct {
void* end; /* end of original string */
/* attributes for the match object */
PyObject* string;
+ Py_buffer buffer;
Py_ssize_t pos, endpos;
int isbytes;
int charsize; /* character size */
@@ -74,11 +75,12 @@ typedef struct {
Py_ssize_t lastindex;
Py_ssize_t lastmark;
void** mark;
+ int match_all;
+ int must_advance;
/* dynamically allocated stuff */
char* data_stack;
size_t data_stack_size;
size_t data_stack_base;
- Py_buffer buffer;
/* current repeat context */
SRE_REPEAT *repeat;
} SRE_STATE;
diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h
index e13b90e8bc0..44948e21ad9 100644
--- a/Modules/sre_lib.h
+++ b/Modules/sre_lib.h
@@ -199,7 +199,7 @@ SRE(charset_loc_ignore)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
return up != lo && SRE(charset)(state, set, up);
}
-LOCAL(Py_ssize_t) SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all);
+LOCAL(Py_ssize_t) SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel);
LOCAL(Py_ssize_t)
SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
@@ -510,12 +510,12 @@ do { \
#define JUMP_ASSERT 12
#define JUMP_ASSERT_NOT 13
-#define DO_JUMPX(jumpvalue, jumplabel, nextpattern, matchall) \
+#define DO_JUMPX(jumpvalue, jumplabel, nextpattern, toplevel_) \
DATA_ALLOC(SRE(match_context), nextctx); \
nextctx->last_ctx_pos = ctx_pos; \
nextctx->jump = jumpvalue; \
nextctx->pattern = nextpattern; \
- nextctx->match_all = matchall; \
+ nextctx->toplevel = toplevel_; \
ctx_pos = alloc_pos; \
ctx = nextctx; \
goto entrance; \
@@ -523,7 +523,7 @@ do { \
while (0) /* gcc doesn't like labels at end of scopes */ \
#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
- DO_JUMPX(jumpvalue, jumplabel, nextpattern, ctx->match_all)
+ DO_JUMPX(jumpvalue, jumplabel, nextpattern, ctx->toplevel)
#define DO_JUMP0(jumpvalue, jumplabel, nextpattern) \
DO_JUMPX(jumpvalue, jumplabel, nextpattern, 0)
@@ -540,13 +540,13 @@ typedef struct {
SRE_CODE chr;
SRE_REPEAT* rep;
} u;
- int match_all;
+ int toplevel;
} SRE(match_context);
/* check if string matches the given pattern. returns <0 for
error, 0 for failure, and 1 for success */
LOCAL(Py_ssize_t)
-SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all)
+SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel)
{
SRE_CHAR* end = (SRE_CHAR *)state->end;
Py_ssize_t alloc_pos, ctx_pos = -1;
@@ -563,7 +563,7 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all)
ctx->last_ctx_pos = -1;
ctx->jump = JUMP_NONE;
ctx->pattern = pattern;
- ctx->match_all = match_all;
+ ctx->toplevel = toplevel;
ctx_pos = alloc_pos;
entrance:
@@ -636,11 +636,14 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all)
case SRE_OP_SUCCESS:
/* end of pattern */
TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
- if (!ctx->match_all || ctx->ptr == state->end) {
- state->ptr = ctx->ptr;
- RETURN_SUCCESS;
+ if (ctx->toplevel &&
+ ((state->match_all && ctx->ptr != state->end) ||
+ (state->must_advance && ctx->ptr == state->start)))
+ {
+ RETURN_FAILURE;
}
- RETURN_FAILURE;
+ state->ptr = ctx->ptr;
+ RETURN_SUCCESS;
case SRE_OP_AT:
/* match at given position */
@@ -856,7 +859,9 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all)
RETURN_FAILURE;
if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS &&
- ctx->ptr == state->end) {
+ ctx->ptr == state->end &&
+ !(ctx->toplevel && state->must_advance && ctx->ptr == state->start))
+ {
/* tail is empty. we're finished */
state->ptr = ctx->ptr;
RETURN_SUCCESS;
@@ -941,7 +946,10 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all)
}
if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS &&
- (!match_all || ctx->ptr == state->end)) {
+ !(ctx->toplevel &&
+ ((state->match_all && ctx->ptr != state->end) ||
+ (state->must_advance && ctx->ptr == state->start))))
+ {
/* tail is empty. we're finished */
state->ptr = ctx->ptr;
RETURN_SUCCESS;
@@ -1417,6 +1425,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
return 0; /* literal can't match: doesn't fit in char width */
#endif
end = (SRE_CHAR *)state->end;
+ state->must_advance = 0;
while (ptr < end) {
while (*ptr != c) {
if (++ptr >= end)
@@ -1458,6 +1467,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
return 0;
i = 1;
+ state->must_advance = 0;
do {
if (*ptr == (SRE_CHAR) prefix[i]) {
if (++i != prefix_len) {
@@ -1487,6 +1497,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
if (charset) {
/* pattern starts with a character from a known set */
end = (SRE_CHAR *)state->end;
+ state->must_advance = 0;
for (;;) {
while (ptr < end && !SRE(charset)(state, charset, *ptr))
ptr++;
@@ -1503,13 +1514,15 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
} else {
/* general case */
assert(ptr <= end);
- while (1) {
+ TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
+ state->start = state->ptr = ptr;
+ status = SRE(match)(state, pattern, 1);
+ state->must_advance = 0;
+ while (status == 0 && ptr < end) {
+ ptr++;
TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
state->start = state->ptr = ptr;
status = SRE(match)(state, pattern, 0);
- if (status != 0 || ptr >= end)
- break;
- ptr++;
}
}
1
0
https://github.com/python/cpython/commit/e69fbb6a560a02d0587b9075afd338a1e9…
commit: e69fbb6a560a02d0587b9075afd338a1e9073af0
branch: master
author: Serhiy Storchaka <storchaka(a)gmail.com>
committer: GitHub <noreply(a)github.com>
date: 2017-12-04T11:51:55+02:00
summary:
Fix a regression in uuid added in bpo-32107. (#4677)
uuid.get_node() always must return a stable result.
Also added a test for non-reproducibility of _random_getnode().
Original patch by Xavier de Gaye.
files:
M Lib/test/test_uuid.py
M Lib/uuid.py
diff --git a/Lib/test/test_uuid.py b/Lib/test/test_uuid.py
index f113c551209..f21bd6dfa15 100644
--- a/Lib/test/test_uuid.py
+++ b/Lib/test/test_uuid.py
@@ -565,6 +565,9 @@ def test_random_getnode(self):
self.assertTrue(node & (1 << 40), '%012x' % node)
self.check_node(node)
+ node2 = self.uuid._random_getnode()
+ self.assertNotEqual(node2, node, '%012x' % node)
+
@unittest.skipUnless(os.name == 'posix', 'requires Posix')
def test_unix_getnode(self):
if not importable('_uuid') and not importable('ctypes'):
diff --git a/Lib/uuid.py b/Lib/uuid.py
index cb2bc092bdf..be06a6eff3f 100644
--- a/Lib/uuid.py
+++ b/Lib/uuid.py
@@ -674,14 +674,14 @@ def getnode():
getters = [_unix_getnode, _ifconfig_getnode, _ip_getnode,
_arp_getnode, _lanscan_getnode, _netstat_getnode]
- for getter in getters:
+ for getter in getters + [_random_getnode]:
try:
_node = getter()
except:
continue
if _node is not None:
return _node
- return _random_getnode()
+ assert False, '_random_getnode() returned None'
_last_timestamp = None
1
0
results for 4243df51fe43 on branch "default"
--------------------------------------------
test_asyncio leaked [3, 0, 0] memory blocks, sum=3
test_functools leaked [0, 3, 1] memory blocks, sum=4
test_multiprocessing_fork leaked [-2, 1, 0] memory blocks, sum=-1
Command line was: ['./python', '-m', 'test.regrtest', '-uall', '-R', '3:3:/home/psf-users/antoine/refleaks/reflogH7fZPI', '--timeout', '7200']
1
0
bpo-27240 Rewrite the email header folding algorithm. (GH-3488) (#4693)
by R. David Murray 04 Dec '17
by R. David Murray 04 Dec '17
04 Dec '17
https://github.com/python/cpython/commit/a87ba60fe56ae2ebe80ab9ada6d280a6a1f3d552
commit: a87ba60fe56ae2ebe80ab9ada6d280a6a1f3d552
branch: 3.6
author: Miss Islington (bot) <31488909+miss-islington(a)users.noreply.github.com>
committer: R. David Murray <rdmurray(a)bitdance.com>
date: 2017-12-03T19:46:23-05:00
summary:
bpo-27240 Rewrite the email header folding algorithm. (GH-3488) (#4693)
The original algorithm tried to delegate the folding to the tokens so
that those tokens whose folding rules differed could specify the
differences. However, this resulted in a lot of duplicated code because
most of the rules were the same.
The new algorithm moves all folding logic into a set of functions
external to the token classes, but puts the information about which
tokens can be folded in which ways on the tokens...with the exception of
mime-parameters, which are a special case (which was not even
implemented in the old folder).
This algorithm can still probably be improved and hopefully simplified
somewhat.
Note that some of the test expectations are changed. I believe the
changes are toward more desirable and consistent behavior: in general
when (re) folding a line the canonical version of the tokens is
generated, rather than preserving errors or extra whitespace.
(cherry picked from commit 85d5c18c9d83a1d54eecc4c2ad4dce63194107c6)
files:
A Misc/NEWS.d/next/Library/2017-12-02-16-06-00.bpo-27240.Kji34M.rst
M Lib/email/_header_value_parser.py
M Lib/email/headerregistry.py
M Lib/test/test_email/test__header_value_parser.py
M Lib/test/test_email/test_generator.py
M Lib/test/test_email/test_headerregistry.py
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py
index 9b9697f7734..3ebbbe5383a 100644
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -96,90 +96,6 @@
def quote_string(value):
return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"'
-#
-# Accumulator for header folding
-#
-
-class _Folded:
-
- def __init__(self, maxlen, policy):
- self.maxlen = maxlen
- self.policy = policy
- self.lastlen = 0
- self.stickyspace = None
- self.firstline = True
- self.done = []
- self.current = []
-
- def newline(self):
- self.done.extend(self.current)
- self.done.append(self.policy.linesep)
- self.current.clear()
- self.lastlen = 0
-
- def finalize(self):
- if self.current:
- self.newline()
-
- def __str__(self):
- return ''.join(self.done)
-
- def append(self, stoken):
- self.current.append(stoken)
-
- def append_if_fits(self, token, stoken=None):
- if stoken is None:
- stoken = str(token)
- l = len(stoken)
- if self.stickyspace is not None:
- stickyspace_len = len(self.stickyspace)
- if self.lastlen + stickyspace_len + l <= self.maxlen:
- self.current.append(self.stickyspace)
- self.lastlen += stickyspace_len
- self.current.append(stoken)
- self.lastlen += l
- self.stickyspace = None
- self.firstline = False
- return True
- if token.has_fws:
- ws = token.pop_leading_fws()
- if ws is not None:
- self.stickyspace += str(ws)
- stickyspace_len += len(ws)
- token._fold(self)
- return True
- if stickyspace_len and l + 1 <= self.maxlen:
- margin = self.maxlen - l
- if 0 < margin < stickyspace_len:
- trim = stickyspace_len - margin
- self.current.append(self.stickyspace[:trim])
- self.stickyspace = self.stickyspace[trim:]
- stickyspace_len = trim
- self.newline()
- self.current.append(self.stickyspace)
- self.current.append(stoken)
- self.lastlen = l + stickyspace_len
- self.stickyspace = None
- self.firstline = False
- return True
- if not self.firstline:
- self.newline()
- self.current.append(self.stickyspace)
- self.current.append(stoken)
- self.stickyspace = None
- self.firstline = False
- return True
- if self.lastlen + l <= self.maxlen:
- self.current.append(stoken)
- self.lastlen += l
- return True
- if l < self.maxlen:
- self.newline()
- self.current.append(stoken)
- self.lastlen = l
- return True
- return False
-
#
# TokenList and its subclasses
#
@@ -187,6 +103,8 @@ def append_if_fits(self, token, stoken=None):
class TokenList(list):
token_type = None
+ syntactic_break = True
+ ew_combine_allowed = True
def __init__(self, *args, **kw):
super().__init__(*args, **kw)
@@ -207,84 +125,13 @@ def value(self):
def all_defects(self):
return sum((x.all_defects for x in self), self.defects)
- #
- # Folding API
- #
- # parts():
- #
- # return a list of objects that constitute the "higher level syntactic
- # objects" specified by the RFC as the best places to fold a header line.
- # The returned objects must include leading folding white space, even if
- # this means mutating the underlying parse tree of the object. Each object
- # is only responsible for returning *its* parts, and should not drill down
- # to any lower level except as required to meet the leading folding white
- # space constraint.
- #
- # _fold(folded):
- #
- # folded: the result accumulator. This is an instance of _Folded.
- # (XXX: I haven't finished factoring this out yet, the folding code
- # pretty much uses this as a state object.) When the folded.current
- # contains as much text as will fit, the _fold method should call
- # folded.newline.
- # folded.lastlen: the current length of the test stored in folded.current.
- # folded.maxlen: The maximum number of characters that may appear on a
- # folded line. Differs from the policy setting in that "no limit" is
- # represented by +inf, which means it can be used in the trivially
- # logical fashion in comparisons.
- #
- # Currently no subclasses implement parts, and I think this will remain
- # true. A subclass only needs to implement _fold when the generic version
- # isn't sufficient. _fold will need to be implemented primarily when it is
- # possible for encoded words to appear in the specialized token-list, since
- # there is no generic algorithm that can know where exactly the encoded
- # words are allowed. A _fold implementation is responsible for filling
- # lines in the same general way that the top level _fold does. It may, and
- # should, call the _fold method of sub-objects in a similar fashion to that
- # of the top level _fold.
- #
- # XXX: I'm hoping it will be possible to factor the existing code further
- # to reduce redundancy and make the logic clearer.
-
- @property
- def parts(self):
- klass = self.__class__
- this = []
- for token in self:
- if token.startswith_fws():
- if this:
- yield this[0] if len(this)==1 else klass(this)
- this.clear()
- end_ws = token.pop_trailing_ws()
- this.append(token)
- if end_ws:
- yield klass(this)
- this = [end_ws]
- if this:
- yield this[0] if len(this)==1 else klass(this)
-
def startswith_fws(self):
return self[0].startswith_fws()
- def pop_leading_fws(self):
- if self[0].token_type == 'fws':
- return self.pop(0)
- return self[0].pop_leading_fws()
-
- def pop_trailing_ws(self):
- if self[-1].token_type == 'cfws':
- return self.pop(-1)
- return self[-1].pop_trailing_ws()
-
@property
- def has_fws(self):
- for part in self:
- if part.has_fws:
- return True
- return False
-
- def has_leading_comment(self):
- return self[0].has_leading_comment()
+ def as_ew_allowed(self):
+ """True if all top level tokens of this part may be RFC2047 encoded."""
+ return all(part.as_ew_allowed for part in self)
@property
def comments(self):
@@ -294,69 +141,13 @@ def comments(self):
return comments
def fold(self, *, policy):
- # max_line_length 0/None means no limit, ie: infinitely long.
- maxlen = policy.max_line_length or float("+inf")
- folded = _Folded(maxlen, policy)
- self._fold(folded)
- folded.finalize()
- return str(folded)
-
- def as_encoded_word(self, charset):
- # This works only for things returned by 'parts', which include
- # the leading fws, if any, that should be used.
- res = []
- ws = self.pop_leading_fws()
- if ws:
- res.append(ws)
- trailer = self.pop(-1) if self[-1].token_type=='fws' else ''
- res.append(_ew.encode(str(self), charset))
- res.append(trailer)
- return ''.join(res)
-
- def cte_encode(self, charset, policy):
- res = []
- for part in self:
- res.append(part.cte_encode(charset, policy))
- return ''.join(res)
-
- def _fold(self, folded):
- encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
- for part in self.parts:
- tstr = str(part)
- tlen = len(tstr)
- try:
- str(part).encode(encoding)
- except UnicodeEncodeError:
- if any(isinstance(x, errors.UndecodableBytesDefect)
- for x in part.all_defects):
- charset = 'unknown-8bit'
- else:
- # XXX: this should be a policy setting when utf8 is False.
- charset = 'utf-8'
- tstr = part.cte_encode(charset, folded.policy)
- tlen = len(tstr)
- if folded.append_if_fits(part, tstr):
- continue
- # Peel off the leading whitespace if any and make it sticky, to
- # avoid infinite recursion.
- ws = part.pop_leading_fws()
- if ws is not None:
- folded.stickyspace = str(ws)
- if folded.append_if_fits(part):
- continue
- if part.has_fws:
- part._fold(folded)
- continue
- # There are no fold points in this one; it is too long for a single
- # line and can't be split...we just have to put it on its own line.
- folded.append(tstr)
- folded.newline()
+ return _refold_parse_tree(self, policy=policy)
def pprint(self, indent=''):
- print('\n'.join(self._pp(indent='')))
+ print(self.ppstr(indent=indent))
def ppstr(self, indent=''):
- return '\n'.join(self._pp(indent=''))
+ return '\n'.join(self._pp(indent=indent))
def _pp(self, indent=''):
yield '{}{}/{}('.format(
@@ -391,173 +182,11 @@ class UnstructuredTokenList(TokenList):
token_type = 'unstructured'
- def _fold(self, folded):
- last_ew = None
- encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
- for part in self.parts:
- tstr = str(part)
- is_ew = False
- try:
- str(part).encode(encoding)
- except UnicodeEncodeError:
- if any(isinstance(x, errors.UndecodableBytesDefect)
- for x in part.all_defects):
- charset = 'unknown-8bit'
- else:
- charset = 'utf-8'
- if last_ew is not None:
- # We've already done an EW, combine this one with it
- # if there's room.
- chunk = get_unstructured(
- ''.join(folded.current[last_ew:]+[tstr])).as_encoded_word(charset)
- oldlastlen = sum(len(x) for x in folded.current[:last_ew])
- schunk = str(chunk)
- lchunk = len(schunk)
- if oldlastlen + lchunk <= folded.maxlen:
- del folded.current[last_ew:]
- folded.append(schunk)
- folded.lastlen = oldlastlen + lchunk
- continue
- tstr = part.as_encoded_word(charset)
- is_ew = True
- if folded.append_if_fits(part, tstr):
- if is_ew:
- last_ew = len(folded.current) - 1
- continue
- if is_ew or last_ew:
- # It's too big to fit on the line, but since we've
- # got encoded words we can use encoded word folding.
- part._fold_as_ew(folded)
- continue
- # Peel off the leading whitespace if any and make it sticky, to
- # avoid infinite recursion.
- ws = part.pop_leading_fws()
- if ws is not None:
- folded.stickyspace = str(ws)
- if folded.append_if_fits(part):
- continue
- if part.has_fws:
- part._fold(folded)
- continue
- # It can't be split...we just have to put it on its own line.
- folded.append(tstr)
- folded.newline()
- last_ew = None
-
- def cte_encode(self, charset, policy):
- res = []
- last_ew = None
- for part in self:
- spart = str(part)
- try:
- spart.encode('us-ascii')
- res.append(spart)
- except UnicodeEncodeError:
- if last_ew is None:
- res.append(part.cte_encode(charset, policy))
- last_ew = len(res)
- else:
- tl = get_unstructured(''.join(res[last_ew:] + [spart]))
- res.append(tl.as_encoded_word(charset))
- return ''.join(res)
-
class Phrase(TokenList):
token_type = 'phrase'
- def _fold(self, folded):
- # As with Unstructured, we can have pure ASCII with or without
- # surrogateescape encoded bytes, or we could have unicode. But this
- # case is more complicated, since we have to deal with the various
- # sub-token types and how they can be composed in the face of
- # unicode-that-needs-CTE-encoding, and the fact that if a token a
- # comment that becomes a barrier across which we can't compose encoded
- # words.
- last_ew = None
- encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
- for part in self.parts:
- tstr = str(part)
- tlen = len(tstr)
- has_ew = False
- try:
- str(part).encode(encoding)
- except UnicodeEncodeError:
- if any(isinstance(x, errors.UndecodableBytesDefect)
- for x in part.all_defects):
- charset = 'unknown-8bit'
- else:
- charset = 'utf-8'
- if last_ew is not None and not part.has_leading_comment():
- # We've already done an EW, let's see if we can combine
- # this one with it. The last_ew logic ensures that all we
- # have at this point is atoms, no comments or quoted
- # strings. So we can treat the text between the last
- # encoded word and the content of this token as
- # unstructured text, and things will work correctly. But
- # we have to strip off any trailing comment on this token
- # first, and if it is a quoted string we have to pull out
- # the content (we're encoding it, so it no longer needs to
- # be quoted).
- if part[-1].token_type == 'cfws' and part.comments:
- remainder = part.pop(-1)
- else:
- remainder = ''
- for i, token in enumerate(part):
- if token.token_type == 'bare-quoted-string':
- part[i] = UnstructuredTokenList(token[:])
- chunk = get_unstructured(
- ''.join(folded.current[last_ew:]+[tstr])).as_encoded_word(charset)
- schunk = str(chunk)
- lchunk = len(schunk)
- if last_ew + lchunk <= folded.maxlen:
- del folded.current[last_ew:]
- folded.append(schunk)
- folded.lastlen = sum(len(x) for x in folded.current)
- continue
- tstr = part.as_encoded_word(charset)
- tlen = len(tstr)
- has_ew = True
- if folded.append_if_fits(part, tstr):
- if has_ew and not part.comments:
- last_ew = len(folded.current) - 1
- elif part.comments or part.token_type == 'quoted-string':
- # If a comment is involved we can't combine EWs. And if a
- # quoted string is involved, it's not worth the effort to
- # try to combine them.
- last_ew = None
- continue
- part._fold(folded)
-
- def cte_encode(self, charset, policy):
- res = []
- last_ew = None
- is_ew = False
- for part in self:
- spart = str(part)
- try:
- spart.encode('us-ascii')
- res.append(spart)
- except UnicodeEncodeError:
- is_ew = True
- if last_ew is None:
- if not part.comments:
- last_ew = len(res)
- res.append(part.cte_encode(charset, policy))
- elif not part.has_leading_comment():
- if part[-1].token_type == 'cfws' and part.comments:
- remainder = part.pop(-1)
- else:
- remainder = ''
- for i, token in enumerate(part):
- if token.token_type == 'bare-quoted-string':
- part[i] = UnstructuredTokenList(token[:])
- tl = get_unstructured(''.join(res[last_ew:] + [spart]))
- res[last_ew:] = [tl.as_encoded_word(charset)]
- if part.comments or (not is_ew and part.token_type == 'quoted-string'):
- last_ew = None
- return ''.join(res)
-
class Word(TokenList):
token_type = 'word'
@@ -567,9 +196,6 @@ class CFWSList(WhiteSpaceTokenList):
token_type = 'cfws'
- def has_leading_comment(self):
- return bool(self.comments)
-
class Atom(TokenList):
@@ -579,6 +205,7 @@ class Atom(TokenList):
class Token(TokenList):
token_type = 'token'
+ encode_as_ew = False
class EncodedWord(TokenList):
@@ -588,13 +215,6 @@ class EncodedWord(TokenList):
charset = None
lang = None
- @property
- def encoded(self):
- if self.cte is not None:
- return self.cte
- _ew.encode(str(self), self.charset)
-
-
class QuotedString(TokenList):
@@ -865,6 +485,7 @@ def display_name(self):
class Domain(TokenList):
token_type = 'domain'
+ as_ew_allowed = False
@property
def domain(self):
@@ -879,11 +500,13 @@ class DotAtom(TokenList):
class DotAtomText(TokenList):
token_type = 'dot-atom-text'
+ as_ew_allowed = True
class AddrSpec(TokenList):
token_type = 'addr-spec'
+ as_ew_allowed = False
@property
def local_part(self):
@@ -916,11 +539,13 @@ def addr_spec(self):
class ObsLocalPart(TokenList):
token_type = 'obs-local-part'
+ as_ew_allowed = False
class DisplayName(Phrase):
token_type = 'display-name'
+ ew_combine_allowed = False
@property
def display_name(self):
@@ -960,6 +585,7 @@ def value(self):
class LocalPart(TokenList):
token_type = 'local-part'
+ as_ew_allowed = False
@property
def value(self):
@@ -995,6 +621,7 @@ def local_part(self):
class DomainLiteral(TokenList):
token_type = 'domain-literal'
+ as_ew_allowed = False
@property
def domain(self):
@@ -1081,6 +708,7 @@ def stripped_value(self):
class MimeParameters(TokenList):
token_type = 'mime-parameters'
+ syntactic_break = False
@property
def params(self):
@@ -1165,6 +793,10 @@ def __str__(self):
class ParameterizedHeaderValue(TokenList):
+ # Set this false so that the value doesn't wind up on a new line even
+ # if it and the parameters would fit there but not on the first line.
+ syntactic_break = False
+
@property
def params(self):
for token in reversed(self):
@@ -1172,18 +804,11 @@ def params(self):
return token.params
return {}
- @property
- def parts(self):
- if self and self[-1].token_type == 'mime-parameters':
- # We don't want to start a new line if all of the params don't fit
- # after the value, so unwrap the parameter list.
- return TokenList(self[:-1] + self[-1])
- return TokenList(self).parts
-
class ContentType(ParameterizedHeaderValue):
token_type = 'content-type'
+ as_ew_allowed = False
maintype = 'text'
subtype = 'plain'
@@ -1191,40 +816,27 @@ class ContentType(ParameterizedHeaderValue):
class ContentDisposition(ParameterizedHeaderValue):
token_type = 'content-disposition'
+ as_ew_allowed = False
content_disposition = None
class ContentTransferEncoding(TokenList):
token_type = 'content-transfer-encoding'
+ as_ew_allowed = False
cte = '7bit'
class HeaderLabel(TokenList):
token_type = 'header-label'
+ as_ew_allowed = False
class Header(TokenList):
token_type = 'header'
- def _fold(self, folded):
- folded.append(str(self.pop(0)))
- folded.lastlen = len(folded.current[0])
- # The first line of the header is different from all others: we don't
- # want to start a new object on a new line if it has any fold points in
- # it that would allow part of it to be on the first header line.
- # Further, if the first fold point would fit on the new line, we want
- # to do that, but if it doesn't we want to put it on the first line.
- # Folded supports this via the stickyspace attribute. If this
- # attribute is not None, it does the special handling.
- folded.stickyspace = str(self.pop(0)) if self[0].token_type == 'cfws' else ''
- rest = self.pop(0)
- if self:
- raise ValueError("Malformed Header token list")
- rest._fold(folded)
-
#
# Terminal classes and instances
@@ -1232,6 +844,10 @@ def _fold(self, folded):
class Terminal(str):
+ as_ew_allowed = True
+ ew_combine_allowed = True
+ syntactic_break = True
+
def __new__(cls, value, token_type):
self = super().__new__(cls, value)
self.token_type = token_type
@@ -1241,6 +857,9 @@ def __new__(cls, value, token_type):
def __repr__(self):
return "{}({})".format(self.__class__.__name__, super().__repr__())
+ def pprint(self):
+ print(self.__class__.__name__ + '/' + self.token_type)
+
@property
def all_defects(self):
return list(self.defects)
@@ -1254,29 +873,14 @@ def _pp(self, indent=''):
'' if not self.defects else ' {}'.format(self.defects),
)]
- def cte_encode(self, charset, policy):
- value = str(self)
- try:
- value.encode('us-ascii')
- return value
- except UnicodeEncodeError:
- return _ew.encode(value, charset)
-
def pop_trailing_ws(self):
# This terminates the recursion.
return None
- def pop_leading_fws(self):
- # This terminates the recursion.
- return None
-
@property
def comments(self):
return []
- def has_leading_comment(self):
- return False
-
def __getnewargs__(self):
return(str(self), self.token_type)
@@ -1290,8 +894,6 @@ def value(self):
def startswith_fws(self):
return True
- has_fws = True
-
class ValueTerminal(Terminal):
@@ -1302,11 +904,6 @@ def value(self):
def startswith_fws(self):
return False
- has_fws = False
-
- def as_encoded_word(self, charset):
- return _ew.encode(str(self), charset)
-
class EWWhiteSpaceTerminal(WhiteSpaceTerminal):
@@ -1314,15 +911,9 @@ class EWWhiteSpaceTerminal(WhiteSpaceTerminal):
def value(self):
return ''
- @property
- def encoded(self):
- return self[:]
-
def __str__(self):
return ''
- has_fws = True
-
# XXX these need to become classes and used as instances so
# that a program can't change them in a parse tree and screw
@@ -2752,7 +2343,7 @@ def get_parameter(value):
if value[0] != "'":
raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
"delimiter, but found {!r}".format(value))
- appendto.append(ValueTerminal("'", 'RFC2231 delimiter'))
+ appendto.append(ValueTerminal("'", 'RFC2231-delimiter'))
value = value[1:]
if value and value[0] != "'":
token, value = get_attrtext(value)
@@ -2761,7 +2352,7 @@ def get_parameter(value):
if not value or value[0] != "'":
raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
"delimiter, but found {}".format(value))
- appendto.append(ValueTerminal("'", 'RFC2231 delimiter'))
+ appendto.append(ValueTerminal("'", 'RFC2231-delimiter'))
value = value[1:]
if remainder is not None:
# Treat the rest of value as bare quoted string content.
@@ -2966,3 +2557,255 @@ def parse_content_transfer_encoding_header(value):
token, value = get_phrase(value)
cte_header.append(token)
return cte_header
+
+
+#
+# Header folding
+#
+# Header folding is complex, with lots of rules and corner cases. The
+# following code does its best to obey the rules and handle the corner
+# cases, but you can be sure there are few bugs:)
+#
+# This folder generally canonicalizes as it goes, preferring the stringified
+# version of each token. The tokens contain information that supports the
+# folder, including which tokens can be encoded in which ways.
+#
+# Folded text is accumulated in a simple list of strings ('lines'), each
+# one of which should be less than policy.max_line_length ('maxlen').
+#
+
+def _steal_trailing_WSP_if_exists(lines):
+ wsp = ''
+ if lines and lines[-1] and lines[-1][-1] in WSP:
+ wsp = lines[-1][-1]
+ lines[-1] = lines[-1][:-1]
+ return wsp
+
+def _refold_parse_tree(parse_tree, *, policy):
+ """Return string of contents of parse_tree folded according to RFC rules.
+
+ """
+ # max_line_length 0/None means no limit, ie: infinitely long.
+ maxlen = policy.max_line_length or float("+inf")
+ encoding = 'utf-8' if policy.utf8 else 'us-ascii'
+ lines = ['']
+ last_ew = None
+ wrap_as_ew_blocked = 0
+ want_encoding = False
+ end_ew_not_allowed = Terminal('', 'wrap_as_ew_blocked')
+ parts = list(parse_tree)
+ while parts:
+ part = parts.pop(0)
+ if part is end_ew_not_allowed:
+ wrap_as_ew_blocked -= 1
+ continue
+ tstr = str(part)
+ try:
+ tstr.encode(encoding)
+ charset = encoding
+ except UnicodeEncodeError:
+ if any(isinstance(x, errors.UndecodableBytesDefect)
+ for x in part.all_defects):
+ charset = 'unknown-8bit'
+ else:
+ # If policy.utf8 is false this should really be taken from a
+ # 'charset' property on the policy.
+ charset = 'utf-8'
+ want_encoding = True
+ if part.token_type == 'mime-parameters':
+ # Mime parameter folding (using RFC2231) is extra special.
+ _fold_mime_parameters(part, lines, maxlen, encoding)
+ continue
+ if want_encoding and not wrap_as_ew_blocked:
+ if not part.as_ew_allowed:
+ want_encoding = False
+ last_ew = None
+ if part.syntactic_break:
+ encoded_part = part.fold(policy=policy)[:-1] # strip nl
+ if policy.linesep not in encoded_part:
+ # It fits on a single line
+ if len(encoded_part) > maxlen - len(lines[-1]):
+ # But not on this one, so start a new one.
+ newline = _steal_trailing_WSP_if_exists(lines)
+ # XXX what if encoded_part has no leading FWS?
+ lines.append(newline)
+ lines[-1] += encoded_part
+ continue
+ # Either this is not a major syntactic break, so we don't
+ # want it on a line by itself even if it fits, or it
+ # doesn't fit on a line by itself. Either way, fall through
+ # to unpacking the subparts and wrapping them.
+ if not hasattr(part, 'encode'):
+ # It's not a Terminal, do each piece individually.
+ parts = list(part) + parts
+ else:
+ # It's a terminal, wrap it as an encoded word, possibly
+ # combining it with previously encoded words if allowed.
+ last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew,
+ part.ew_combine_allowed, charset)
+ want_encoding = False
+ continue
+ if len(tstr) <= maxlen - len(lines[-1]):
+ lines[-1] += tstr
+ continue
+ # This part is too long to fit. The RFC wants us to break at
+ # "major syntactic breaks", so unless we don't consider this
+ # to be one, check if it will fit on the next line by itself.
+ if (part.syntactic_break and
+ len(tstr) + 1 <= maxlen):
+ newline = _steal_trailing_WSP_if_exists(lines)
+ if newline or part.startswith_fws():
+ lines.append(newline + tstr)
+ continue
+ if not hasattr(part, 'encode'):
+ # It's not a terminal, try folding the subparts.
+ newparts = list(part)
+ if not part.as_ew_allowed:
+ wrap_as_ew_blocked += 1
+ newparts.append(end_ew_not_allowed)
+ parts = newparts + parts
+ continue
+ if part.as_ew_allowed and not wrap_as_ew_blocked:
+ # It doesn't need CTE encoding, but encode it anyway so we can
+ # wrap it.
+ parts.insert(0, part)
+ want_encoding = True
+ continue
+ # We can't figure out how to wrap, it, so give up.
+ newline = _steal_trailing_WSP_if_exists(lines)
+ if newline or part.startswith_fws():
+ lines.append(newline + tstr)
+ else:
+ # We can't fold it onto the next line either...
+ lines[-1] += tstr
+ return policy.linesep.join(lines) + policy.linesep
+
+def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
+ """Fold string to_encode into lines as encoded word, combining if allowed.
+ Return the new value for last_ew, or None if ew_combine_allowed is False.
+
+ If there is already an encoded word in the last line of lines (indicated by
+ a non-None value for last_ew) and ew_combine_allowed is true, decode the
+ existing ew, combine it with to_encode, and re-encode. Otherwise, encode
+ to_encode. In either case, split to_encode as necessary so that the
+ encoded segments fit within maxlen.
+
+ """
+ if last_ew is not None and ew_combine_allowed:
+ to_encode = str(
+ get_unstructured(lines[-1][last_ew:] + to_encode))
+ lines[-1] = lines[-1][:last_ew]
+ if to_encode[0] in WSP:
+ # We're joining this to non-encoded text, so don't encode
+ # the leading blank.
+ leading_wsp = to_encode[0]
+ to_encode = to_encode[1:]
+ if (len(lines[-1]) == maxlen):
+ lines.append(_steal_trailing_WSP_if_exists(lines))
+ lines[-1] += leading_wsp
+ trailing_wsp = ''
+ if to_encode[-1] in WSP:
+ # Likewise for the trailing space.
+ trailing_wsp = to_encode[-1]
+ to_encode = to_encode[:-1]
+ new_last_ew = len(lines[-1]) if last_ew is None else last_ew
+ while to_encode:
+ remaining_space = maxlen - len(lines[-1])
+ # The RFC2047 chrome takes up 7 characters plus the length
+ # of the charset name.
+ encode_as = 'utf-8' if charset == 'us-ascii' else charset
+ text_space = remaining_space - len(encode_as) - 7
+ if text_space <= 0:
+ lines.append(' ')
+ # XXX We'll get an infinite loop here if maxlen is <= 7
+ continue
+ first_part = to_encode[:text_space]
+ ew = _ew.encode(first_part, charset=encode_as)
+ excess = len(ew) - remaining_space
+ if excess > 0:
+ # encode always chooses the shortest encoding, so this
+ # is guaranteed to fit at this point.
+ first_part = first_part[:-excess]
+ ew = _ew.encode(first_part)
+ lines[-1] += ew
+ to_encode = to_encode[len(first_part):]
+ if to_encode:
+ lines.append(' ')
+ new_last_ew = len(lines[-1])
+ lines[-1] += trailing_wsp
+ return new_last_ew if ew_combine_allowed else None
+
+def _fold_mime_parameters(part, lines, maxlen, encoding):
+ """Fold TokenList 'part' into the 'lines' list as mime parameters.
+
+ Using the decoded list of parameters and values, format them according to
+ the RFC rules, including using RFC2231 encoding if the value cannot be
+ expressed in 'encoding' and/or the paramter+value is too long to fit within
+ 'maxlen'.
+
+ """
+ # Special case for RFC2231 encoding: start from decoded values and use
+ # RFC2231 encoding iff needed.
+ #
+ # Note that the 1 and 2s being added to the length calculations are
+ # accounting for the possibly-needed spaces and semicolons we'll be adding.
+ #
+ for name, value in part.params:
+ # XXX What if this ';' puts us over maxlen the first time through the
+ # loop? We should split the header value onto a newline in that case,
+ # but to do that we need to recognize the need earlier or reparse the
+ # header, so I'm going to ignore that bug for now. It'll only put us
+ # one character over.
+ if not lines[-1].rstrip().endswith(';'):
+ lines[-1] += ';'
+ charset = encoding
+ error_handler = 'strict'
+ try:
+ value.encode(encoding)
+ encoding_required = False
+ except UnicodeEncodeError:
+ encoding_required = True
+ if utils._has_surrogates(value):
+ charset = 'unknown-8bit'
+ error_handler = 'surrogateescape'
+ else:
+ charset = 'utf-8'
+ if encoding_required:
+ encoded_value = urllib.parse.quote(
+ value, safe='', errors=error_handler)
+ tstr = "{}*={}''{}".format(name, charset, encoded_value)
+ else:
+ tstr = '{}={}'.format(name, quote_string(value))
+ if len(lines[-1]) + len(tstr) + 1 < maxlen:
+ lines[-1] = lines[-1] + ' ' + tstr
+ continue
+ elif len(tstr) + 2 <= maxlen:
+ lines.append(' ' + tstr)
+ continue
+ # We need multiple sections. We are allowed to mix encoded and
+ # non-encoded sections, but we aren't going to. We'll encode them all.
+ section = 0
+ extra_chrome = charset + "''"
+ while value:
+ chrome_len = len(name) + len(str(section)) + 3 + len(extra_chrome)
+ if maxlen <= chrome_len + 3:
+ # We need room for the leading blank, the trailing semicolon,
+ # and at least one character of the value. If we don't
+ # have that, we'd be stuck, so in that case fall back to
+ # the RFC standard width.
+ maxlen = 78
+ splitpoint = maxchars = maxlen - chrome_len - 2
+ while True:
+ partial = value[:splitpoint]
+ encoded_value = urllib.parse.quote(
+ partial, safe='', errors=error_handler)
+ if len(encoded_value) <= maxchars:
+ break
+ splitpoint -= 1
+ lines.append(" {}*{}*={}{}".format(
+ name, section, extra_chrome, encoded_value))
+ extra_chrome = ''
+ section += 1
+ value = value[splitpoint:]
+ if value:
+ lines[-1] += ';'
diff --git a/Lib/email/headerregistry.py b/Lib/email/headerregistry.py
index 0fc2231e5cb..f5be87f4d24 100644
--- a/Lib/email/headerregistry.py
+++ b/Lib/email/headerregistry.py
@@ -245,13 +245,16 @@ def fold(self, *, policy):
the header name and the ': ' separator.
"""
- # At some point we need to only put fws here if it was in the source.
 + # At some point we need to put fws here if it was in the source.
header = parser.Header([
parser.HeaderLabel([
parser.ValueTerminal(self.name, 'header-name'),
parser.ValueTerminal(':', 'header-sep')]),
- parser.CFWSList([parser.WhiteSpaceTerminal(' ', 'fws')]),
- self._parse_tree])
+ ])
+ if self._parse_tree:
+ header.append(
+ parser.CFWSList([parser.WhiteSpaceTerminal(' ', 'fws')]))
+ header.append(self._parse_tree)
return header.fold(policy=policy)
diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py
index e0ec87d2080..1667617b9e4 100644
--- a/Lib/test/test_email/test__header_value_parser.py
+++ b/Lib/test/test_email/test__header_value_parser.py
@@ -14,18 +14,7 @@ def test_EWWhiteSpaceTerminal(self):
self.assertEqual(x, ' \t')
self.assertEqual(str(x), '')
self.assertEqual(x.value, '')
- self.assertEqual(x.encoded, ' \t')
-
- # UnstructuredTokenList
-
- def test_undecodable_bytes_error_preserved(self):
- badstr = b"le pouf c\xaflebre".decode('ascii', 'surrogateescape')
- unst = parser.get_unstructured(badstr)
- self.assertDefectsEqual(unst.all_defects, [errors.UndecodableBytesDefect])
- parts = list(unst.parts)
- self.assertDefectsEqual(parts[0].all_defects, [])
- self.assertDefectsEqual(parts[1].all_defects, [])
- self.assertDefectsEqual(parts[2].all_defects, [errors.UndecodableBytesDefect])
+ self.assertEqual(x.token_type, 'fws')
class TestParserMixin:
@@ -139,7 +128,6 @@ def test_get_encoded_word_sets_extra_attributes(self):
'first second',
[],
'')
- self.assertEqual(ew.encoded, '=?us-ascii*jive?q?first_second?=')
self.assertEqual(ew.charset, 'us-ascii')
self.assertEqual(ew.lang, 'jive')
@@ -150,7 +138,6 @@ def test_get_encoded_word_lang_default_is_blank(self):
'first second',
[],
'')
- self.assertEqual(ew.encoded, '=?us-ascii?q?first_second?=')
self.assertEqual(ew.charset, 'us-ascii')
self.assertEqual(ew.lang, '')
@@ -2700,28 +2687,37 @@ def test_address_list_with_unicode_names_in_quotes(self):
# and with unicode tokens in the comments. Spaces inside the quotes
# currently don't do the right thing.
- def test_initial_whitespace_splitting(self):
+ def test_split_at_whitespace_after_header_before_long_token(self):
body = parser.get_unstructured(' ' + 'x'*77)
header = parser.Header([
parser.HeaderLabel([parser.ValueTerminal('test:', 'atext')]),
parser.CFWSList([parser.WhiteSpaceTerminal(' ', 'fws')]), body])
self._test(header, 'test: \n ' + 'x'*77 + '\n')
- def test_whitespace_splitting(self):
+ def test_split_at_whitespace_before_long_token(self):
self._test(parser.get_unstructured('xxx ' + 'y'*77),
'xxx \n ' + 'y'*77 + '\n')
+ def test_overlong_encodeable_is_wrapped(self):
+ first_token_with_whitespace = 'xxx '
+ chrome_leader = '=?utf-8?q?'
+ len_chrome = len(chrome_leader) + 2
+ len_non_y = len_chrome + len(first_token_with_whitespace)
+ self._test(parser.get_unstructured(first_token_with_whitespace +
+ 'y'*80),
+ first_token_with_whitespace + chrome_leader +
+ 'y'*(78-len_non_y) + '?=\n' +
+ ' ' + chrome_leader + 'y'*(80-(78-len_non_y)) + '?=\n')
+
def test_long_filename_attachment(self):
- folded = self.policy.fold('Content-Disposition', 'attachment; filename="TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TES.txt"')
- self.assertEqual(
- 'Content-Disposition: attachment;\n filename="TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TES.txt"\n',
- folded
- )
- folded = self.policy.fold('Content-Disposition', 'attachment; filename="TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_T.txt"')
- self.assertEqual(
- 'Content-Disposition: attachment;\n filename="TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_T.txt"\n',
- folded
- )
+ self._test(parser.parse_content_disposition_header(
+ 'attachment; filename="TEST_TEST_TEST_TEST'
+ '_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TES.txt"'),
+ "attachment;\n"
+ " filename*0*=us-ascii''TEST_TEST_TEST_TEST_TEST_TEST"
+ "_TEST_TEST_TEST_TEST_TEST;\n"
+ " filename*1*=_TEST_TES.txt\n",
+ )
if __name__ == '__main__':
unittest.main()
diff --git a/Lib/test/test_email/test_generator.py b/Lib/test/test_email/test_generator.py
index c4f182903af..c1aeaefab77 100644
--- a/Lib/test/test_email/test_generator.py
+++ b/Lib/test/test_email/test_generator.py
@@ -27,7 +27,6 @@ def msgmaker(self, msg, policy=None):
None
"""),
- # From is wrapped because wrapped it fits in 40.
40: textwrap.dedent("""\
To: whom_it_may_concern(a)example.com
From:
@@ -40,11 +39,11 @@ def msgmaker(self, msg, policy=None):
None
"""),
- # Neither to nor from fit even if put on a new line,
- # so we leave them sticking out on the first line.
20: textwrap.dedent("""\
- To: whom_it_may_concern(a)example.com
- From: nobody_you_want_to_know(a)example.com
+ To:
+ whom_it_may_concern(a)example.com
+ From:
+ nobody_you_want_to_know(a)example.com
Subject: We the
willing led by the
unknowing are doing
@@ -169,6 +168,53 @@ def test_compat32_max_line_length_does_not_fold_when_none(self):
g.flatten(msg)
self.assertEqual(s.getvalue(), self.typ(self.refold_long_expected[0]))
+ def test_rfc2231_wrapping(self):
+ # This is pretty much just to make sure we don't have an infinite
+ # loop; I don't expect anyone to hit this in the field.
+ msg = self.msgmaker(self.typ(textwrap.dedent("""\
+ To: nobody
+ Content-Disposition: attachment;
+ filename="afilenamelongenoghtowraphere"
+
+ None
+ """)))
+ expected = textwrap.dedent("""\
+ To: nobody
+ Content-Disposition: attachment;
+ filename*0*=us-ascii''afilename;
+ filename*1*=longenoghtowraphere
+
+ None
+ """)
+ s = self.ioclass()
+ g = self.genclass(s, policy=self.policy.clone(max_line_length=33))
+ g.flatten(msg)
+ self.assertEqual(s.getvalue(), self.typ(expected))
+
+ def test_rfc2231_wrapping_switches_to_default_len_if_too_narrow(self):
+ # This is just to make sure we don't have an infinite loop; I don't
+ # expect anyone to hit this in the field, so I'm not bothering to make
+ # the result optimal (the encoding isn't needed).
+ msg = self.msgmaker(self.typ(textwrap.dedent("""\
+ To: nobody
+ Content-Disposition: attachment;
+ filename="afilenamelongenoghtowraphere"
+
+ None
+ """)))
+ expected = textwrap.dedent("""\
+ To: nobody
+ Content-Disposition:
+ attachment;
+ filename*0*=us-ascii''afilenamelongenoghtowraphere
+
+ None
+ """)
+ s = self.ioclass()
+ g = self.genclass(s, policy=self.policy.clone(max_line_length=20))
+ g.flatten(msg)
+ self.assertEqual(s.getvalue(), self.typ(expected))
+
class TestGenerator(TestGeneratorBase, TestEmailBase):
diff --git a/Lib/test/test_email/test_headerregistry.py b/Lib/test/test_email/test_headerregistry.py
index af836dc9726..30ce0ba54e4 100644
--- a/Lib/test/test_email/test_headerregistry.py
+++ b/Lib/test/test_email/test_headerregistry.py
@@ -229,14 +229,14 @@ def content_type_as_value(self,
defects = args[1] if l>1 else []
decoded = args[2] if l>2 and args[2] is not DITTO else source
header = 'Content-Type:' + ' ' if source else ''
- folded = args[3] if l>3 else header + source + '\n'
+ folded = args[3] if l>3 else header + decoded + '\n'
h = self.make_header('Content-Type', source)
self.assertEqual(h.content_type, content_type)
self.assertEqual(h.maintype, maintype)
self.assertEqual(h.subtype, subtype)
self.assertEqual(h.params, parmdict)
with self.assertRaises(TypeError):
- h.params['abc'] = 'xyz' # params is read-only.
+ h.params['abc'] = 'xyz' # make sure params is read-only.
self.assertDefectsEqual(h.defects, defects)
self.assertEqual(h, decoded)
self.assertEqual(h.fold(policy=policy.default), folded)
@@ -373,9 +373,10 @@ def content_type_as_value(self,
'text/plain; Charset="utf-8"'),
# Since this is pretty much the ur-mimeheader, we'll put all the tests
- # that exercise the parameter parsing and formatting here.
- #
- # XXX: question: is minimal quoting preferred?
+ # that exercise the parameter parsing and formatting here. Note that
+ # when we refold we may canonicalize, so things like whitespace,
+ # quoting, and rfc2231 encoding may change from what was in the input
+ # header.
'unquoted_param_value': (
'text/plain; title=foo',
@@ -384,7 +385,8 @@ def content_type_as_value(self,
'plain',
{'title': 'foo'},
[],
- 'text/plain; title="foo"'),
+ 'text/plain; title="foo"',
+ ),
'param_value_with_tspecials': (
'text/plain; title="(bar)foo blue"',
@@ -415,7 +417,8 @@ def content_type_as_value(self,
'mixed',
{'boundary': 'CPIMSSMTPC06p5f3tG'},
[],
- 'Multipart/mixed; boundary="CPIMSSMTPC06p5f3tG"'),
+ 'Multipart/mixed; boundary="CPIMSSMTPC06p5f3tG"',
+ ),
'spaces_around_semis': (
('image/jpeg; name="wibble.JPG" ; x-mac-type="4A504547" ; '
@@ -429,14 +432,31 @@ def content_type_as_value(self,
[],
('image/jpeg; name="wibble.JPG"; x-mac-type="4A504547"; '
'x-mac-creator="474B4F4E"'),
- # XXX: it could be that we will eventually prefer to fold starting
- # from the decoded value, in which case these spaces and similar
- # spaces in other tests will be wrong.
- ('Content-Type: image/jpeg; name="wibble.JPG" ; '
- 'x-mac-type="4A504547" ;\n'
+ ('Content-Type: image/jpeg; name="wibble.JPG";'
+ ' x-mac-type="4A504547";\n'
' x-mac-creator="474B4F4E"\n'),
),
+ 'lots_of_mime_params': (
+ ('image/jpeg; name="wibble.JPG"; x-mac-type="4A504547"; '
+ 'x-mac-creator="474B4F4E"; x-extrastuff="make it longer"'),
+ 'image/jpeg',
+ 'image',
+ 'jpeg',
+ {'name': 'wibble.JPG',
+ 'x-mac-type': '4A504547',
+ 'x-mac-creator': '474B4F4E',
+ 'x-extrastuff': 'make it longer'},
+ [],
+ ('image/jpeg; name="wibble.JPG"; x-mac-type="4A504547"; '
+ 'x-mac-creator="474B4F4E"; x-extrastuff="make it longer"'),
 + # In this case the whole of the MimeParameters does *not* fit
 + # on one line, so we break at a lower syntactic level.
+ ('Content-Type: image/jpeg; name="wibble.JPG";'
+ ' x-mac-type="4A504547";\n'
+ ' x-mac-creator="474B4F4E"; x-extrastuff="make it longer"\n'),
+ ),
+
'semis_inside_quotes': (
'image/jpeg; name="Jim&&Jill"',
'image/jpeg',
@@ -460,19 +480,25 @@ def content_type_as_value(self,
[],
r'image/jpeg; name="Jim \"Bob\" Jill"'),
- # XXX: This test works except for the refolding of the header. I'll
- # deal with that bug when I deal with the other folding bugs.
- #'non_ascii_in_params': (
- # ('foo\xa7/bar; b\xa7r=two; '
- # 'baz=thr\xa7e'.encode('latin-1').decode('us-ascii',
- # 'surrogateescape')),
- # 'foo\uFFFD/bar',
- # 'foo\uFFFD',
- # 'bar',
- # {'b\uFFFDr': 'two', 'baz': 'thr\uFFFDe'},
- # [errors.UndecodableBytesDefect]*3,
- # 'foo�/bar; b�r="two"; baz="thr�e"',
- # ),
+ 'non_ascii_in_params': (
+ ('foo\xa7/bar; b\xa7r=two; '
+ 'baz=thr\xa7e'.encode('latin-1').decode('us-ascii',
+ 'surrogateescape')),
+ 'foo\uFFFD/bar',
+ 'foo\uFFFD',
+ 'bar',
+ {'b\uFFFDr': 'two', 'baz': 'thr\uFFFDe'},
+ [errors.UndecodableBytesDefect]*3,
+ 'foo�/bar; b�r="two"; baz="thr�e"',
+ # XXX Two bugs here: the mime type is not allowed to be an encoded
+ # word, and we shouldn't be emitting surrogates in the parameter
+ # names. But I don't know what the behavior should be here, so I'm
+ # punting for now. In practice this is unlikely to be encountered
+ # since headers with binary in them only come from a binary source
+ # and are almost certain to be re-emitted without refolding.
+ 'Content-Type: =?unknown-8bit?q?foo=A7?=/bar; b\udca7r="two";\n'
+ " baz*=unknown-8bit''thr%A7e\n",
+ ),
# RFC 2231 parameter tests.
@@ -494,19 +520,20 @@ def content_type_as_value(self,
[],
r'image/jpeg; bar="baz\"foobar\"baz"'),
- # XXX: This test works except for the refolding of the header. I'll
- # deal with that bug when I deal with the other folding bugs.
- #'non_ascii_rfc2231_value': (
- # ('text/plain; charset=us-ascii; '
- # "title*=us-ascii'en'This%20is%20"
- # 'not%20f\xa7n').encode('latin-1').decode('us-ascii',
- # 'surrogateescape'),
- # 'text/plain',
- # 'text',
- # 'plain',
- # {'charset': 'us-ascii', 'title': 'This is not f\uFFFDn'},
- # [errors.UndecodableBytesDefect],
- # 'text/plain; charset="us-ascii"; title="This is not f�n"'),
+ 'non_ascii_rfc2231_value': (
+ ('text/plain; charset=us-ascii; '
+ "title*=us-ascii'en'This%20is%20"
+ 'not%20f\xa7n').encode('latin-1').decode('us-ascii',
+ 'surrogateescape'),
+ 'text/plain',
+ 'text',
+ 'plain',
+ {'charset': 'us-ascii', 'title': 'This is not f\uFFFDn'},
+ [errors.UndecodableBytesDefect],
+ 'text/plain; charset="us-ascii"; title="This is not f�n"',
+ 'Content-Type: text/plain; charset="us-ascii";\n'
+ " title*=unknown-8bit''This%20is%20not%20f%A7n\n",
+ ),
'rfc2231_encoded_charset': (
'text/plain; charset*=ansi-x3.4-1968\'\'us-ascii',
@@ -529,8 +556,6 @@ def content_type_as_value(self,
{'name': 'This is ***fun*** is it not.pdf'},
[],
'text/plain; name="This is ***fun*** is it not.pdf"',
- ('Content-Type: text/plain;\tname*0*=\'\'This%20is%20;\n'
- '\tname*1*=%2A%2A%2Afun%2A%2A%2A%20;\tname*2="is it not.pdf"\n'),
),
# Make sure we also handle it if there are spurious double quotes.
@@ -545,9 +570,6 @@ def content_type_as_value(self,
{'name': 'This is even more ***fun*** is it not.pdf'},
[errors.InvalidHeaderDefect]*2,
'text/plain; name="This is even more ***fun*** is it not.pdf"',
- ('Content-Type: text/plain;\t'
- 'name*0*="us-ascii\'\'This%20is%20even%20more%20";\n'
- '\tname*1*="%2A%2A%2Afun%2A%2A%2A%20";\tname*2="is it not.pdf"\n'),
),
'rfc2231_single_quote_inside_double_quotes': (
@@ -562,9 +584,8 @@ def content_type_as_value(self,
[errors.InvalidHeaderDefect]*2,
('text/plain; charset="us-ascii"; '
'title="This is really ***fun*** isn\'t it!"'),
- ('Content-Type: text/plain; charset=us-ascii;\n'
- '\ttitle*0*="us-ascii\'en\'This%20is%20really%20";\n'
- '\ttitle*1*="%2A%2A%2Afun%2A%2A%2A%20";\ttitle*2="isn\'t it!"\n'),
+ ('Content-Type: text/plain; charset="us-ascii";\n'
+ ' title="This is really ***fun*** isn\'t it!"\n'),
),
'rfc2231_single_quote_in_value_with_charset_and_lang': (
@@ -576,9 +597,6 @@ def content_type_as_value(self,
{'name': "Frank's Document"},
[errors.InvalidHeaderDefect]*2,
'application/x-foo; name="Frank\'s Document"',
- ('Content-Type: application/x-foo;\t'
- 'name*0*="us-ascii\'en-us\'Frank\'s";\n'
- ' name*1*=" Document"\n'),
),
'rfc2231_single_quote_in_non_encoded_value': (
@@ -590,9 +608,6 @@ def content_type_as_value(self,
{'name': "us-ascii'en-us'Frank's Document"},
[],
'application/x-foo; name="us-ascii\'en-us\'Frank\'s Document"',
- ('Content-Type: application/x-foo;\t'
- 'name*0="us-ascii\'en-us\'Frank\'s";\n'
- ' name*1=" Document"\n'),
),
'rfc2231_no_language_or_charset': (
@@ -615,12 +630,8 @@ def content_type_as_value(self,
{'name': 'This is even more ***fun*** is it.pdf'},
[errors.InvalidHeaderDefect]*2,
'text/plain; name="This is even more ***fun*** is it.pdf"',
- ('Content-Type: text/plain;\t'
- 'name*0*="\'\'This%20is%20even%20more%20";\n'
- '\tname*1*="%2A%2A%2Afun%2A%2A%2A%20";\tname*2="is it.pdf"\n'),
),
- # XXX: see below...the first name line here should be *0 not *0*.
'rfc2231_partly_encoded': (
("text/plain;"
'\tname*0*="\'\'This%20is%20even%20more%20";'
@@ -632,9 +643,6 @@ def content_type_as_value(self,
{'name': 'This is even more ***fun*** is it.pdf'},
[errors.InvalidHeaderDefect]*2,
'text/plain; name="This is even more ***fun*** is it.pdf"',
- ('Content-Type: text/plain;\t'
- 'name*0*="\'\'This%20is%20even%20more%20";\n'
- '\tname*1*="%2A%2A%2Afun%2A%2A%2A%20";\tname*2="is it.pdf"\n'),
),
'rfc2231_partly_encoded_2': (
@@ -647,10 +655,11 @@ def content_type_as_value(self,
'plain',
{'name': 'This is even more %2A%2A%2Afun%2A%2A%2A%20is it.pdf'},
[errors.InvalidHeaderDefect],
- 'text/plain; name="This is even more %2A%2A%2Afun%2A%2A%2A%20is it.pdf"',
- ('Content-Type: text/plain;\t'
- 'name*0*="\'\'This%20is%20even%20more%20";\n'
- '\tname*1="%2A%2A%2Afun%2A%2A%2A%20";\tname*2="is it.pdf"\n'),
+ ('text/plain;'
+ ' name="This is even more %2A%2A%2Afun%2A%2A%2A%20is it.pdf"'),
+ ('Content-Type: text/plain;\n'
+ ' name="This is even more %2A%2A%2Afun%2A%2A%2A%20is'
+ ' it.pdf"\n'),
),
'rfc2231_unknown_charset_treated_as_ascii': (
@@ -669,9 +678,12 @@ def content_type_as_value(self,
'plain',
{'charset': 'utf-8\uFFFD\uFFFD\uFFFD'},
[errors.UndecodableBytesDefect],
- 'text/plain; charset="utf-8\uFFFD\uFFFD\uFFFD"'),
+ 'text/plain; charset="utf-8\uFFFD\uFFFD\uFFFD"',
+ "Content-Type: text/plain;"
+ " charset*=unknown-8bit''utf-8%F1%F2%F3\n",
+ ),
- 'rfc2231_utf_8_in_supposedly_ascii_charset_parameter_value': (
+ 'rfc2231_utf8_in_supposedly_ascii_charset_parameter_value': (
"text/plain; charset*=ascii''utf-8%E2%80%9D",
'text/plain',
'text',
@@ -679,9 +691,11 @@ def content_type_as_value(self,
{'charset': 'utf-8”'},
[errors.UndecodableBytesDefect],
'text/plain; charset="utf-8”"',
+ # XXX Should folding change the charset to utf8? Currently it just
+ # reproduces the original, which is arguably fine.
+ "Content-Type: text/plain;"
+ " charset*=unknown-8bit''utf-8%E2%80%9D\n",
),
- # XXX: if the above were *re*folded, it would get tagged as utf-8
- # instead of ascii in the param, since it now contains non-ASCII.
'rfc2231_encoded_then_unencoded_segments': (
('application/x-foo;'
@@ -694,9 +708,6 @@ def content_type_as_value(self,
{'name': 'My Document For You'},
[errors.InvalidHeaderDefect],
'application/x-foo; name="My Document For You"',
- ('Content-Type: application/x-foo;\t'
- 'name*0*="us-ascii\'en-us\'My";\n'
- '\tname*1=" Document";\tname*2=" For You"\n'),
),
# My reading of the RFC is that this is an invalid header. The RFC
@@ -713,11 +724,6 @@ def content_type_as_value(self,
{'name': 'My Document For You'},
[errors.InvalidHeaderDefect]*3,
'application/x-foo; name="My Document For You"',
- ("Content-Type: application/x-foo;\tname*0=us-ascii'en-us'My;\t"
- # XXX: the newline is in the wrong place, come back and fix
- # this when the rest of tests pass.
- 'name*1*=" Document"\n;'
- '\tname*2*=" For You"\n'),
),
# XXX: I would say this one should default to ascii/en for the
@@ -730,8 +736,7 @@ def content_type_as_value(self,
# charset'lang'value pattern exactly *and* there is at least one
# encoded segment. Implementing that algorithm will require some
# refactoring, so I haven't done it (yet).
-
- 'rfc2231_qouted_unencoded_then_encoded_segments': (
+ 'rfc2231_quoted_unencoded_then_encoded_segments': (
('application/x-foo;'
'\tname*0="us-ascii\'en-us\'My";'
'\tname*1*=" Document";'
@@ -742,9 +747,25 @@ def content_type_as_value(self,
{'name': "us-ascii'en-us'My Document For You"},
[errors.InvalidHeaderDefect]*2,
'application/x-foo; name="us-ascii\'en-us\'My Document For You"',
- ('Content-Type: application/x-foo;\t'
- 'name*0="us-ascii\'en-us\'My";\n'
- '\tname*1*=" Document";\tname*2*=" For You"\n'),
+ ),
+
+ # Make sure our folding algorithm produces multiple sections correctly.
+ # We could mix encoded and non-encoded segments, but we don't, we just
+ # make them all encoded. It might be worth fixing that, since the
+ # sections can get used for wrapping ascii text.
+ 'rfc2231_folded_segments_correctly_formatted': (
+ ('application/x-foo;'
+ '\tname="' + "with spaces"*8 + '"'),
+ 'application/x-foo',
+ 'application',
+ 'x-foo',
+ {'name': "with spaces"*8},
+ [],
+ 'application/x-foo; name="' + "with spaces"*8 + '"',
+ "Content-Type: application/x-foo;\n"
+ " name*0*=us-ascii''with%20spaceswith%20spaceswith%20spaceswith"
+ "%20spaceswith;\n"
+ " name*1*=%20spaceswith%20spaceswith%20spaceswith%20spaces\n"
),
}
@@ -827,8 +848,8 @@ def content_disp_as_value(self,
[],
('attachment; filename="genome.jpeg"; '
'modification-date="Wed, 12 Feb 1997 16:29:51 -0500"'),
- ('Content-Disposition: attachment; filename=genome.jpeg;\n'
- ' modification-date="Wed, 12 Feb 1997 16:29:51 -0500";\n'),
+ ('Content-Disposition: attachment; filename="genome.jpeg";\n'
+ ' modification-date="Wed, 12 Feb 1997 16:29:51 -0500"\n'),
),
'no_value': (
@@ -873,7 +894,7 @@ def version_string_as_MIME_Version(self,
if source:
source = ' ' + source
self.assertEqual(h.fold(policy=policy.default),
- 'MIME-Version:' + source + '\n')
+ 'MIME-Version:' + source + '\n')
version_string_params = {
@@ -1546,15 +1567,39 @@ def test_fold_unstructured_with_overlong_word(self):
'singlewordthatwontfit')
self.assertEqual(
h.fold(policy=policy.default.clone(max_line_length=20)),
- 'Subject: thisisaverylonglineconsistingofasinglewordthatwontfit\n')
+ 'Subject: \n'
+ ' =?utf-8?q?thisisa?=\n'
+ ' =?utf-8?q?verylon?=\n'
+ ' =?utf-8?q?glineco?=\n'
+ ' =?utf-8?q?nsistin?=\n'
+ ' =?utf-8?q?gofasin?=\n'
+ ' =?utf-8?q?gleword?=\n'
+ ' =?utf-8?q?thatwon?=\n'
+ ' =?utf-8?q?tfit?=\n'
+ )
def test_fold_unstructured_with_two_overlong_words(self):
h = self.make_header('Subject', 'thisisaverylonglineconsistingofa'
'singlewordthatwontfit plusanotherverylongwordthatwontfit')
self.assertEqual(
h.fold(policy=policy.default.clone(max_line_length=20)),
- 'Subject: thisisaverylonglineconsistingofasinglewordthatwontfit\n'
- ' plusanotherverylongwordthatwontfit\n')
+ 'Subject: \n'
+ ' =?utf-8?q?thisisa?=\n'
+ ' =?utf-8?q?verylon?=\n'
+ ' =?utf-8?q?glineco?=\n'
+ ' =?utf-8?q?nsistin?=\n'
+ ' =?utf-8?q?gofasin?=\n'
+ ' =?utf-8?q?gleword?=\n'
+ ' =?utf-8?q?thatwon?=\n'
+ ' =?utf-8?q?tfit_pl?=\n'
+ ' =?utf-8?q?usanoth?=\n'
+ ' =?utf-8?q?erveryl?=\n'
+ ' =?utf-8?q?ongword?=\n'
+ ' =?utf-8?q?thatwon?=\n'
+ ' =?utf-8?q?tfit?=\n'
+ )
+
+ # XXX Need test for when max_line_length is less than the chrome size.
def test_fold_unstructured_with_slightly_long_word(self):
h = self.make_header('Subject', 'thislongwordislessthanmaxlinelen')
@@ -1590,6 +1635,18 @@ def test_fold_date_header(self):
self.assertEqual(h.fold(policy=policy.default),
'Date: Sat, 02 Feb 2002 17:00:06 -0800\n')
+ def test_fold_overlong_words_using_RFC2047(self):
+ h = self.make_header(
+ 'X-Report-Abuse',
+ '<https://www.mailitapp.com/report_abuse.php?'
+ 'mid=xxx-xxx-xxxxxxxxxxxxxxxxxxxxxxxx==-xxx-xx-xx>')
+ self.assertEqual(
+ h.fold(policy=policy.default),
+ 'X-Report-Abuse: =?utf-8?q?=3Chttps=3A//www=2Emailitapp=2E'
+ 'com/report=5F?=\n'
+ ' =?utf-8?q?abuse=2Ephp=3Fmid=3Dxxx-xxx-xxxx'
+ 'xxxxxxxxxxxxxxxxxxxx=3D=3D-xxx-?=\n'
+ ' =?utf-8?q?xx-xx=3E?=\n')
if __name__ == '__main__':
diff --git a/Misc/NEWS.d/next/Library/2017-12-02-16-06-00.bpo-27240.Kji34M.rst b/Misc/NEWS.d/next/Library/2017-12-02-16-06-00.bpo-27240.Kji34M.rst
new file mode 100644
index 00000000000..c933ee7d916
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2017-12-02-16-06-00.bpo-27240.Kji34M.rst
@@ -0,0 +1,3 @@
+The header folding algorithm for the new email policies has been rewritten,
+which also fixes bpo-30788, bpo-31831, and bpo-32182. In particular, RFC2231
+folding is now done correctly.
1
0
03 Dec '17
https://github.com/python/cpython/commit/85d5c18c9d83a1d54eecc4c2ad4dce6319…
commit: 85d5c18c9d83a1d54eecc4c2ad4dce63194107c6
branch: master
author: R. David Murray <rdmurray(a)bitdance.com>
committer: GitHub <noreply(a)github.com>
date: 2017-12-03T18:51:41-05:00
summary:
bpo-27240 Rewrite the email header folding algorithm. (#3488)
The original algorithm tried to delegate the folding to the tokens so
that those tokens whose folding rules differed could specify the
differences. However, this resulted in a lot of duplicated code because
most of the rules were the same.
The new algorithm moves all folding logic into a set of functions
external to the token classes, but puts the information about which
tokens can be folded in which ways on the tokens...with the exception of
mime-parameters, which are a special case (which was not even
implemented in the old folder).
This algorithm can still probably be improved and hopefully simplified
somewhat.
Note that some of the test expectations are changed. I believe the
changes are toward more desirable and consistent behavior: in general
when (re) folding a line the canonical version of the tokens is
generated, rather than preserving errors or extra whitespace.
files:
A Misc/NEWS.d/next/Library/2017-12-02-16-06-00.bpo-27240.Kji34M.rst
M Lib/email/_header_value_parser.py
M Lib/email/headerregistry.py
M Lib/test/test_email/test__header_value_parser.py
M Lib/test/test_email/test_generator.py
M Lib/test/test_email/test_headerregistry.py
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py
index b4737c806e1..b34c58bf85d 100644
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -96,90 +96,6 @@
def quote_string(value):
return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"'
-#
-# Accumulator for header folding
-#
-
-class _Folded:
-
- def __init__(self, maxlen, policy):
- self.maxlen = maxlen
- self.policy = policy
- self.lastlen = 0
- self.stickyspace = None
- self.firstline = True
- self.done = []
- self.current = []
-
- def newline(self):
- self.done.extend(self.current)
- self.done.append(self.policy.linesep)
- self.current.clear()
- self.lastlen = 0
-
- def finalize(self):
- if self.current:
- self.newline()
-
- def __str__(self):
- return ''.join(self.done)
-
- def append(self, stoken):
- self.current.append(stoken)
-
- def append_if_fits(self, token, stoken=None):
- if stoken is None:
- stoken = str(token)
- l = len(stoken)
- if self.stickyspace is not None:
- stickyspace_len = len(self.stickyspace)
- if self.lastlen + stickyspace_len + l <= self.maxlen:
- self.current.append(self.stickyspace)
- self.lastlen += stickyspace_len
- self.current.append(stoken)
- self.lastlen += l
- self.stickyspace = None
- self.firstline = False
- return True
- if token.has_fws:
- ws = token.pop_leading_fws()
- if ws is not None:
- self.stickyspace += str(ws)
- stickyspace_len += len(ws)
- token._fold(self)
- return True
- if stickyspace_len and l + 1 <= self.maxlen:
- margin = self.maxlen - l
- if 0 < margin < stickyspace_len:
- trim = stickyspace_len - margin
- self.current.append(self.stickyspace[:trim])
- self.stickyspace = self.stickyspace[trim:]
- stickyspace_len = trim
- self.newline()
- self.current.append(self.stickyspace)
- self.current.append(stoken)
- self.lastlen = l + stickyspace_len
- self.stickyspace = None
- self.firstline = False
- return True
- if not self.firstline:
- self.newline()
- self.current.append(self.stickyspace)
- self.current.append(stoken)
- self.stickyspace = None
- self.firstline = False
- return True
- if self.lastlen + l <= self.maxlen:
- self.current.append(stoken)
- self.lastlen += l
- return True
- if l < self.maxlen:
- self.newline()
- self.current.append(stoken)
- self.lastlen = l
- return True
- return False
-
#
# TokenList and its subclasses
#
@@ -187,6 +103,8 @@ def append_if_fits(self, token, stoken=None):
class TokenList(list):
token_type = None
+ syntactic_break = True
+ ew_combine_allowed = True
def __init__(self, *args, **kw):
super().__init__(*args, **kw)
@@ -207,84 +125,13 @@ def value(self):
def all_defects(self):
return sum((x.all_defects for x in self), self.defects)
- #
- # Folding API
- #
- # parts():
- #
- # return a list of objects that constitute the "higher level syntactic
- # objects" specified by the RFC as the best places to fold a header line.
- # The returned objects must include leading folding white space, even if
- # this means mutating the underlying parse tree of the object. Each object
- # is only responsible for returning *its* parts, and should not drill down
- # to any lower level except as required to meet the leading folding white
- # space constraint.
- #
- # _fold(folded):
- #
- # folded: the result accumulator. This is an instance of _Folded.
- # (XXX: I haven't finished factoring this out yet, the folding code
- # pretty much uses this as a state object.) When the folded.current
- # contains as much text as will fit, the _fold method should call
- # folded.newline.
- # folded.lastlen: the current length of the test stored in folded.current.
- # folded.maxlen: The maximum number of characters that may appear on a
- # folded line. Differs from the policy setting in that "no limit" is
- # represented by +inf, which means it can be used in the trivially
- # logical fashion in comparisons.
- #
- # Currently no subclasses implement parts, and I think this will remain
- # true. A subclass only needs to implement _fold when the generic version
- # isn't sufficient. _fold will need to be implemented primarily when it is
- # possible for encoded words to appear in the specialized token-list, since
- # there is no generic algorithm that can know where exactly the encoded
- # words are allowed. A _fold implementation is responsible for filling
- # lines in the same general way that the top level _fold does. It may, and
- # should, call the _fold method of sub-objects in a similar fashion to that
- # of the top level _fold.
- #
- # XXX: I'm hoping it will be possible to factor the existing code further
- # to reduce redundancy and make the logic clearer.
-
- @property
- def parts(self):
- klass = self.__class__
- this = []
- for token in self:
- if token.startswith_fws():
- if this:
- yield this[0] if len(this)==1 else klass(this)
- this.clear()
- end_ws = token.pop_trailing_ws()
- this.append(token)
- if end_ws:
- yield klass(this)
- this = [end_ws]
- if this:
- yield this[0] if len(this)==1 else klass(this)
-
def startswith_fws(self):
return self[0].startswith_fws()
- def pop_leading_fws(self):
- if self[0].token_type == 'fws':
- return self.pop(0)
- return self[0].pop_leading_fws()
-
- def pop_trailing_ws(self):
- if self[-1].token_type == 'cfws':
- return self.pop(-1)
- return self[-1].pop_trailing_ws()
-
@property
- def has_fws(self):
- for part in self:
- if part.has_fws:
- return True
- return False
-
- def has_leading_comment(self):
- return self[0].has_leading_comment()
+ def as_ew_allowed(self):
+ """True if all top level tokens of this part may be RFC2047 encoded."""
+ return all(part.as_ew_allowed for part in self)
@property
def comments(self):
@@ -294,69 +141,13 @@ def comments(self):
return comments
def fold(self, *, policy):
- # max_line_length 0/None means no limit, ie: infinitely long.
- maxlen = policy.max_line_length or float("+inf")
- folded = _Folded(maxlen, policy)
- self._fold(folded)
- folded.finalize()
- return str(folded)
-
- def as_encoded_word(self, charset):
- # This works only for things returned by 'parts', which include
- # the leading fws, if any, that should be used.
- res = []
- ws = self.pop_leading_fws()
- if ws:
- res.append(ws)
- trailer = self.pop(-1) if self[-1].token_type=='fws' else ''
- res.append(_ew.encode(str(self), charset))
- res.append(trailer)
- return ''.join(res)
-
- def cte_encode(self, charset, policy):
- res = []
- for part in self:
- res.append(part.cte_encode(charset, policy))
- return ''.join(res)
-
- def _fold(self, folded):
- encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
- for part in self.parts:
- tstr = str(part)
- tlen = len(tstr)
- try:
- str(part).encode(encoding)
- except UnicodeEncodeError:
- if any(isinstance(x, errors.UndecodableBytesDefect)
- for x in part.all_defects):
- charset = 'unknown-8bit'
- else:
- # XXX: this should be a policy setting when utf8 is False.
- charset = 'utf-8'
- tstr = part.cte_encode(charset, folded.policy)
- tlen = len(tstr)
- if folded.append_if_fits(part, tstr):
- continue
- # Peel off the leading whitespace if any and make it sticky, to
- # avoid infinite recursion.
- ws = part.pop_leading_fws()
- if ws is not None:
- folded.stickyspace = str(ws)
- if folded.append_if_fits(part):
- continue
- if part.has_fws:
- part._fold(folded)
- continue
- # There are no fold points in this one; it is too long for a single
- # line and can't be split...we just have to put it on its own line.
- folded.append(tstr)
- folded.newline()
+ return _refold_parse_tree(self, policy=policy)
def pprint(self, indent=''):
- print('\n'.join(self._pp(indent='')))
+ print(self.ppstr(indent=indent))
def ppstr(self, indent=''):
- return '\n'.join(self._pp(indent=''))
+ return '\n'.join(self._pp(indent=indent))
def _pp(self, indent=''):
yield '{}{}/{}('.format(
@@ -391,173 +182,11 @@ class UnstructuredTokenList(TokenList):
token_type = 'unstructured'
- def _fold(self, folded):
- last_ew = None
- encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
- for part in self.parts:
- tstr = str(part)
- is_ew = False
- try:
- str(part).encode(encoding)
- except UnicodeEncodeError:
- if any(isinstance(x, errors.UndecodableBytesDefect)
- for x in part.all_defects):
- charset = 'unknown-8bit'
- else:
- charset = 'utf-8'
- if last_ew is not None:
- # We've already done an EW, combine this one with it
- # if there's room.
- chunk = get_unstructured(
- ''.join(folded.current[last_ew:]+[tstr])).as_encoded_word(charset)
- oldlastlen = sum(len(x) for x in folded.current[:last_ew])
- schunk = str(chunk)
- lchunk = len(schunk)
- if oldlastlen + lchunk <= folded.maxlen:
- del folded.current[last_ew:]
- folded.append(schunk)
- folded.lastlen = oldlastlen + lchunk
- continue
- tstr = part.as_encoded_word(charset)
- is_ew = True
- if folded.append_if_fits(part, tstr):
- if is_ew:
- last_ew = len(folded.current) - 1
- continue
- if is_ew or last_ew:
- # It's too big to fit on the line, but since we've
- # got encoded words we can use encoded word folding.
- part._fold_as_ew(folded)
- continue
- # Peel off the leading whitespace if any and make it sticky, to
- # avoid infinite recursion.
- ws = part.pop_leading_fws()
- if ws is not None:
- folded.stickyspace = str(ws)
- if folded.append_if_fits(part):
- continue
- if part.has_fws:
- part._fold(folded)
- continue
- # It can't be split...we just have to put it on its own line.
- folded.append(tstr)
- folded.newline()
- last_ew = None
-
- def cte_encode(self, charset, policy):
- res = []
- last_ew = None
- for part in self:
- spart = str(part)
- try:
- spart.encode('us-ascii')
- res.append(spart)
- except UnicodeEncodeError:
- if last_ew is None:
- res.append(part.cte_encode(charset, policy))
- last_ew = len(res)
- else:
- tl = get_unstructured(''.join(res[last_ew:] + [spart]))
- res.append(tl.as_encoded_word(charset))
- return ''.join(res)
-
class Phrase(TokenList):
token_type = 'phrase'
- def _fold(self, folded):
- # As with Unstructured, we can have pure ASCII with or without
- # surrogateescape encoded bytes, or we could have unicode. But this
- # case is more complicated, since we have to deal with the various
- # sub-token types and how they can be composed in the face of
- # unicode-that-needs-CTE-encoding, and the fact that if a token a
- # comment that becomes a barrier across which we can't compose encoded
- # words.
- last_ew = None
- encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
- for part in self.parts:
- tstr = str(part)
- tlen = len(tstr)
- has_ew = False
- try:
- str(part).encode(encoding)
- except UnicodeEncodeError:
- if any(isinstance(x, errors.UndecodableBytesDefect)
- for x in part.all_defects):
- charset = 'unknown-8bit'
- else:
- charset = 'utf-8'
- if last_ew is not None and not part.has_leading_comment():
- # We've already done an EW, let's see if we can combine
- # this one with it. The last_ew logic ensures that all we
- # have at this point is atoms, no comments or quoted
- # strings. So we can treat the text between the last
- # encoded word and the content of this token as
- # unstructured text, and things will work correctly. But
- # we have to strip off any trailing comment on this token
- # first, and if it is a quoted string we have to pull out
- # the content (we're encoding it, so it no longer needs to
- # be quoted).
- if part[-1].token_type == 'cfws' and part.comments:
- remainder = part.pop(-1)
- else:
- remainder = ''
- for i, token in enumerate(part):
- if token.token_type == 'bare-quoted-string':
- part[i] = UnstructuredTokenList(token[:])
- chunk = get_unstructured(
- ''.join(folded.current[last_ew:]+[tstr])).as_encoded_word(charset)
- schunk = str(chunk)
- lchunk = len(schunk)
- if last_ew + lchunk <= folded.maxlen:
- del folded.current[last_ew:]
- folded.append(schunk)
- folded.lastlen = sum(len(x) for x in folded.current)
- continue
- tstr = part.as_encoded_word(charset)
- tlen = len(tstr)
- has_ew = True
- if folded.append_if_fits(part, tstr):
- if has_ew and not part.comments:
- last_ew = len(folded.current) - 1
- elif part.comments or part.token_type == 'quoted-string':
- # If a comment is involved we can't combine EWs. And if a
- # quoted string is involved, it's not worth the effort to
- # try to combine them.
- last_ew = None
- continue
- part._fold(folded)
-
- def cte_encode(self, charset, policy):
- res = []
- last_ew = None
- is_ew = False
- for part in self:
- spart = str(part)
- try:
- spart.encode('us-ascii')
- res.append(spart)
- except UnicodeEncodeError:
- is_ew = True
- if last_ew is None:
- if not part.comments:
- last_ew = len(res)
- res.append(part.cte_encode(charset, policy))
- elif not part.has_leading_comment():
- if part[-1].token_type == 'cfws' and part.comments:
- remainder = part.pop(-1)
- else:
- remainder = ''
- for i, token in enumerate(part):
- if token.token_type == 'bare-quoted-string':
- part[i] = UnstructuredTokenList(token[:])
- tl = get_unstructured(''.join(res[last_ew:] + [spart]))
- res[last_ew:] = [tl.as_encoded_word(charset)]
- if part.comments or (not is_ew and part.token_type == 'quoted-string'):
- last_ew = None
- return ''.join(res)
-
class Word(TokenList):
token_type = 'word'
@@ -567,9 +196,6 @@ class CFWSList(WhiteSpaceTokenList):
token_type = 'cfws'
- def has_leading_comment(self):
- return bool(self.comments)
-
class Atom(TokenList):
@@ -579,6 +205,7 @@ class Atom(TokenList):
class Token(TokenList):
token_type = 'token'
+ encode_as_ew = False
class EncodedWord(TokenList):
@@ -588,13 +215,6 @@ class EncodedWord(TokenList):
charset = None
lang = None
- @property
- def encoded(self):
- if self.cte is not None:
- return self.cte
- _ew.encode(str(self), self.charset)
-
-
class QuotedString(TokenList):
@@ -865,6 +485,7 @@ def display_name(self):
class Domain(TokenList):
token_type = 'domain'
+ as_ew_allowed = False
@property
def domain(self):
@@ -879,11 +500,13 @@ class DotAtom(TokenList):
class DotAtomText(TokenList):
token_type = 'dot-atom-text'
+ as_ew_allowed = True
class AddrSpec(TokenList):
token_type = 'addr-spec'
+ as_ew_allowed = False
@property
def local_part(self):
@@ -916,11 +539,13 @@ def addr_spec(self):
class ObsLocalPart(TokenList):
token_type = 'obs-local-part'
+ as_ew_allowed = False
class DisplayName(Phrase):
token_type = 'display-name'
+ ew_combine_allowed = False
@property
def display_name(self):
@@ -960,6 +585,7 @@ def value(self):
class LocalPart(TokenList):
token_type = 'local-part'
+ as_ew_allowed = False
@property
def value(self):
@@ -995,6 +621,7 @@ def local_part(self):
class DomainLiteral(TokenList):
token_type = 'domain-literal'
+ as_ew_allowed = False
@property
def domain(self):
@@ -1081,6 +708,7 @@ def stripped_value(self):
class MimeParameters(TokenList):
token_type = 'mime-parameters'
+ syntactic_break = False
@property
def params(self):
@@ -1165,6 +793,10 @@ def __str__(self):
class ParameterizedHeaderValue(TokenList):
+ # Set this false so that the value doesn't wind up on a new line even
+ # if it and the parameters would fit there but not on the first line.
+ syntactic_break = False
+
@property
def params(self):
for token in reversed(self):
@@ -1172,18 +804,11 @@ def params(self):
return token.params
return {}
- @property
- def parts(self):
- if self and self[-1].token_type == 'mime-parameters':
- # We don't want to start a new line if all of the params don't fit
- # after the value, so unwrap the parameter list.
- return TokenList(self[:-1] + self[-1])
- return TokenList(self).parts
-
class ContentType(ParameterizedHeaderValue):
token_type = 'content-type'
+ as_ew_allowed = False
maintype = 'text'
subtype = 'plain'
@@ -1191,40 +816,27 @@ class ContentType(ParameterizedHeaderValue):
class ContentDisposition(ParameterizedHeaderValue):
token_type = 'content-disposition'
+ as_ew_allowed = False
content_disposition = None
class ContentTransferEncoding(TokenList):
token_type = 'content-transfer-encoding'
+ as_ew_allowed = False
cte = '7bit'
class HeaderLabel(TokenList):
token_type = 'header-label'
+ as_ew_allowed = False
class Header(TokenList):
token_type = 'header'
- def _fold(self, folded):
- folded.append(str(self.pop(0)))
- folded.lastlen = len(folded.current[0])
- # The first line of the header is different from all others: we don't
- # want to start a new object on a new line if it has any fold points in
- # it that would allow part of it to be on the first header line.
- # Further, if the first fold point would fit on the new line, we want
- # to do that, but if it doesn't we want to put it on the first line.
- # Folded supports this via the stickyspace attribute. If this
- # attribute is not None, it does the special handling.
- folded.stickyspace = str(self.pop(0)) if self[0].token_type == 'cfws' else ''
- rest = self.pop(0)
- if self:
- raise ValueError("Malformed Header token list")
- rest._fold(folded)
-
#
# Terminal classes and instances
@@ -1232,6 +844,10 @@ def _fold(self, folded):
class Terminal(str):
+ as_ew_allowed = True
+ ew_combine_allowed = True
+ syntactic_break = True
+
def __new__(cls, value, token_type):
self = super().__new__(cls, value)
self.token_type = token_type
@@ -1241,6 +857,9 @@ def __new__(cls, value, token_type):
def __repr__(self):
return "{}({})".format(self.__class__.__name__, super().__repr__())
+ def pprint(self):
+ print(self.__class__.__name__ + '/' + self.token_type)
+
@property
def all_defects(self):
return list(self.defects)
@@ -1254,29 +873,14 @@ def _pp(self, indent=''):
'' if not self.defects else ' {}'.format(self.defects),
)]
- def cte_encode(self, charset, policy):
- value = str(self)
- try:
- value.encode('us-ascii')
- return value
- except UnicodeEncodeError:
- return _ew.encode(value, charset)
-
def pop_trailing_ws(self):
# This terminates the recursion.
return None
- def pop_leading_fws(self):
- # This terminates the recursion.
- return None
-
@property
def comments(self):
return []
- def has_leading_comment(self):
- return False
-
def __getnewargs__(self):
return(str(self), self.token_type)
@@ -1290,8 +894,6 @@ def value(self):
def startswith_fws(self):
return True
- has_fws = True
-
class ValueTerminal(Terminal):
@@ -1302,11 +904,6 @@ def value(self):
def startswith_fws(self):
return False
- has_fws = False
-
- def as_encoded_word(self, charset):
- return _ew.encode(str(self), charset)
-
class EWWhiteSpaceTerminal(WhiteSpaceTerminal):
@@ -1314,15 +911,9 @@ class EWWhiteSpaceTerminal(WhiteSpaceTerminal):
def value(self):
return ''
- @property
- def encoded(self):
- return self[:]
-
def __str__(self):
return ''
- has_fws = True
-
# XXX these need to become classes and used as instances so
# that a program can't change them in a parse tree and screw
@@ -2751,7 +2342,7 @@ def get_parameter(value):
if value[0] != "'":
raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
"delimiter, but found {!r}".format(value))
- appendto.append(ValueTerminal("'", 'RFC2231 delimiter'))
+ appendto.append(ValueTerminal("'", 'RFC2231-delimiter'))
value = value[1:]
if value and value[0] != "'":
token, value = get_attrtext(value)
@@ -2760,7 +2351,7 @@ def get_parameter(value):
if not value or value[0] != "'":
raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
"delimiter, but found {}".format(value))
- appendto.append(ValueTerminal("'", 'RFC2231 delimiter'))
+ appendto.append(ValueTerminal("'", 'RFC2231-delimiter'))
value = value[1:]
if remainder is not None:
# Treat the rest of value as bare quoted string content.
@@ -2965,3 +2556,255 @@ def parse_content_transfer_encoding_header(value):
token, value = get_phrase(value)
cte_header.append(token)
return cte_header
+
+
+#
+# Header folding
+#
+# Header folding is complex, with lots of rules and corner cases. The
+# following code does its best to obey the rules and handle the corner
+# cases, but you can be sure there are a few bugs :)
+#
+# This folder generally canonicalizes as it goes, preferring the stringified
+# version of each token. The tokens contain information that supports the
+# folder, including which tokens can be encoded in which ways.
+#
+# Folded text is accumulated in a simple list of strings ('lines'), each
+# one of which should be less than policy.max_line_length ('maxlen').
+#
+
+def _steal_trailing_WSP_if_exists(lines):
+ wsp = ''
+ if lines and lines[-1] and lines[-1][-1] in WSP:
+ wsp = lines[-1][-1]
+ lines[-1] = lines[-1][:-1]
+ return wsp
+
+def _refold_parse_tree(parse_tree, *, policy):
+ """Return string of contents of parse_tree folded according to RFC rules.
+
+ """
+ # max_line_length 0/None means no limit, ie: infinitely long.
+ maxlen = policy.max_line_length or float("+inf")
+ encoding = 'utf-8' if policy.utf8 else 'us-ascii'
+ lines = ['']
+ last_ew = None
+ wrap_as_ew_blocked = 0
+ want_encoding = False
+ end_ew_not_allowed = Terminal('', 'wrap_as_ew_blocked')
+ parts = list(parse_tree)
+ while parts:
+ part = parts.pop(0)
+ if part is end_ew_not_allowed:
+ wrap_as_ew_blocked -= 1
+ continue
+ tstr = str(part)
+ try:
+ tstr.encode(encoding)
+ charset = encoding
+ except UnicodeEncodeError:
+ if any(isinstance(x, errors.UndecodableBytesDefect)
+ for x in part.all_defects):
+ charset = 'unknown-8bit'
+ else:
+ # If policy.utf8 is false this should really be taken from a
+ # 'charset' property on the policy.
+ charset = 'utf-8'
+ want_encoding = True
+ if part.token_type == 'mime-parameters':
+ # Mime parameter folding (using RFC2231) is extra special.
+ _fold_mime_parameters(part, lines, maxlen, encoding)
+ continue
+ if want_encoding and not wrap_as_ew_blocked:
+ if not part.as_ew_allowed:
+ want_encoding = False
+ last_ew = None
+ if part.syntactic_break:
+ encoded_part = part.fold(policy=policy)[:-1] # strip nl
+ if policy.linesep not in encoded_part:
+ # It fits on a single line
+ if len(encoded_part) > maxlen - len(lines[-1]):
+ # But not on this one, so start a new one.
+ newline = _steal_trailing_WSP_if_exists(lines)
+ # XXX what if encoded_part has no leading FWS?
+ lines.append(newline)
+ lines[-1] += encoded_part
+ continue
+ # Either this is not a major syntactic break, so we don't
+ # want it on a line by itself even if it fits, or it
+ # doesn't fit on a line by itself. Either way, fall through
+ # to unpacking the subparts and wrapping them.
+ if not hasattr(part, 'encode'):
+ # It's not a Terminal, do each piece individually.
+ parts = list(part) + parts
+ else:
+ # It's a terminal, wrap it as an encoded word, possibly
+ # combining it with previously encoded words if allowed.
+ last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew,
+ part.ew_combine_allowed, charset)
+ want_encoding = False
+ continue
+ if len(tstr) <= maxlen - len(lines[-1]):
+ lines[-1] += tstr
+ continue
+ # This part is too long to fit. The RFC wants us to break at
+ # "major syntactic breaks", so unless we don't consider this
+ # to be one, check if it will fit on the next line by itself.
+ if (part.syntactic_break and
+ len(tstr) + 1 <= maxlen):
+ newline = _steal_trailing_WSP_if_exists(lines)
+ if newline or part.startswith_fws():
+ lines.append(newline + tstr)
+ continue
+ if not hasattr(part, 'encode'):
+ # It's not a terminal, try folding the subparts.
+ newparts = list(part)
+ if not part.as_ew_allowed:
+ wrap_as_ew_blocked += 1
+ newparts.append(end_ew_not_allowed)
+ parts = newparts + parts
+ continue
+ if part.as_ew_allowed and not wrap_as_ew_blocked:
+ # It doesn't need CTE encoding, but encode it anyway so we can
+ # wrap it.
+ parts.insert(0, part)
+ want_encoding = True
+ continue
+        # We can't figure out how to wrap it, so give up.
+ newline = _steal_trailing_WSP_if_exists(lines)
+ if newline or part.startswith_fws():
+ lines.append(newline + tstr)
+ else:
+ # We can't fold it onto the next line either...
+ lines[-1] += tstr
+ return policy.linesep.join(lines) + policy.linesep
+
+def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
+ """Fold string to_encode into lines as encoded word, combining if allowed.
+ Return the new value for last_ew, or None if ew_combine_allowed is False.
+
+ If there is already an encoded word in the last line of lines (indicated by
+ a non-None value for last_ew) and ew_combine_allowed is true, decode the
+ existing ew, combine it with to_encode, and re-encode. Otherwise, encode
+ to_encode. In either case, split to_encode as necessary so that the
+ encoded segments fit within maxlen.
+
+ """
+ if last_ew is not None and ew_combine_allowed:
+ to_encode = str(
+ get_unstructured(lines[-1][last_ew:] + to_encode))
+ lines[-1] = lines[-1][:last_ew]
+ if to_encode[0] in WSP:
+ # We're joining this to non-encoded text, so don't encode
+ # the leading blank.
+ leading_wsp = to_encode[0]
+ to_encode = to_encode[1:]
+ if (len(lines[-1]) == maxlen):
+ lines.append(_steal_trailing_WSP_if_exists(lines))
+ lines[-1] += leading_wsp
+ trailing_wsp = ''
+ if to_encode[-1] in WSP:
+ # Likewise for the trailing space.
+ trailing_wsp = to_encode[-1]
+ to_encode = to_encode[:-1]
+ new_last_ew = len(lines[-1]) if last_ew is None else last_ew
+ while to_encode:
+ remaining_space = maxlen - len(lines[-1])
+ # The RFC2047 chrome takes up 7 characters plus the length
+ # of the charset name.
+ encode_as = 'utf-8' if charset == 'us-ascii' else charset
+ text_space = remaining_space - len(encode_as) - 7
+ if text_space <= 0:
+ lines.append(' ')
+ # XXX We'll get an infinite loop here if maxlen is <= 7
+ continue
+ first_part = to_encode[:text_space]
+ ew = _ew.encode(first_part, charset=encode_as)
+ excess = len(ew) - remaining_space
+ if excess > 0:
+ # encode always chooses the shortest encoding, so this
+ # is guaranteed to fit at this point.
+ first_part = first_part[:-excess]
+ ew = _ew.encode(first_part)
+ lines[-1] += ew
+ to_encode = to_encode[len(first_part):]
+ if to_encode:
+ lines.append(' ')
+ new_last_ew = len(lines[-1])
+ lines[-1] += trailing_wsp
+ return new_last_ew if ew_combine_allowed else None
+
+def _fold_mime_parameters(part, lines, maxlen, encoding):
+ """Fold TokenList 'part' into the 'lines' list as mime parameters.
+
+ Using the decoded list of parameters and values, format them according to
+ the RFC rules, including using RFC2231 encoding if the value cannot be
+    expressed in 'encoding' and/or the parameter+value is too long to fit within
+ 'maxlen'.
+
+ """
+ # Special case for RFC2231 encoding: start from decoded values and use
+ # RFC2231 encoding iff needed.
+ #
+ # Note that the 1 and 2s being added to the length calculations are
+ # accounting for the possibly-needed spaces and semicolons we'll be adding.
+ #
+ for name, value in part.params:
+ # XXX What if this ';' puts us over maxlen the first time through the
+ # loop? We should split the header value onto a newline in that case,
+ # but to do that we need to recognize the need earlier or reparse the
+ # header, so I'm going to ignore that bug for now. It'll only put us
+ # one character over.
+ if not lines[-1].rstrip().endswith(';'):
+ lines[-1] += ';'
+ charset = encoding
+ error_handler = 'strict'
+ try:
+ value.encode(encoding)
+ encoding_required = False
+ except UnicodeEncodeError:
+ encoding_required = True
+ if utils._has_surrogates(value):
+ charset = 'unknown-8bit'
+ error_handler = 'surrogateescape'
+ else:
+ charset = 'utf-8'
+ if encoding_required:
+ encoded_value = urllib.parse.quote(
+ value, safe='', errors=error_handler)
+ tstr = "{}*={}''{}".format(name, charset, encoded_value)
+ else:
+ tstr = '{}={}'.format(name, quote_string(value))
+ if len(lines[-1]) + len(tstr) + 1 < maxlen:
+ lines[-1] = lines[-1] + ' ' + tstr
+ continue
+ elif len(tstr) + 2 <= maxlen:
+ lines.append(' ' + tstr)
+ continue
+ # We need multiple sections. We are allowed to mix encoded and
+ # non-encoded sections, but we aren't going to. We'll encode them all.
+ section = 0
+ extra_chrome = charset + "''"
+ while value:
+ chrome_len = len(name) + len(str(section)) + 3 + len(extra_chrome)
+ if maxlen <= chrome_len + 3:
+ # We need room for the leading blank, the trailing semicolon,
+ # and at least one character of the value. If we don't
+ # have that, we'd be stuck, so in that case fall back to
+ # the RFC standard width.
+ maxlen = 78
+ splitpoint = maxchars = maxlen - chrome_len - 2
+ while True:
+ partial = value[:splitpoint]
+ encoded_value = urllib.parse.quote(
+ partial, safe='', errors=error_handler)
+ if len(encoded_value) <= maxchars:
+ break
+ splitpoint -= 1
+ lines.append(" {}*{}*={}{}".format(
+ name, section, extra_chrome, encoded_value))
+ extra_chrome = ''
+ section += 1
+ value = value[splitpoint:]
+ if value:
+ lines[-1] += ';'
diff --git a/Lib/email/headerregistry.py b/Lib/email/headerregistry.py
index 81fee146dcc..00652049f2f 100644
--- a/Lib/email/headerregistry.py
+++ b/Lib/email/headerregistry.py
@@ -245,13 +245,16 @@ def fold(self, *, policy):
the header name and the ': ' separator.
"""
- # At some point we need to only put fws here if it was in the source.
+        # At some point we need to put fws here iff it was in the source.
header = parser.Header([
parser.HeaderLabel([
parser.ValueTerminal(self.name, 'header-name'),
parser.ValueTerminal(':', 'header-sep')]),
- parser.CFWSList([parser.WhiteSpaceTerminal(' ', 'fws')]),
- self._parse_tree])
+ ])
+ if self._parse_tree:
+ header.append(
+ parser.CFWSList([parser.WhiteSpaceTerminal(' ', 'fws')]))
+ header.append(self._parse_tree)
return header.fold(policy=policy)
diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py
index e0ec87d2080..1667617b9e4 100644
--- a/Lib/test/test_email/test__header_value_parser.py
+++ b/Lib/test/test_email/test__header_value_parser.py
@@ -14,18 +14,7 @@ def test_EWWhiteSpaceTerminal(self):
self.assertEqual(x, ' \t')
self.assertEqual(str(x), '')
self.assertEqual(x.value, '')
- self.assertEqual(x.encoded, ' \t')
-
- # UnstructuredTokenList
-
- def test_undecodable_bytes_error_preserved(self):
- badstr = b"le pouf c\xaflebre".decode('ascii', 'surrogateescape')
- unst = parser.get_unstructured(badstr)
- self.assertDefectsEqual(unst.all_defects, [errors.UndecodableBytesDefect])
- parts = list(unst.parts)
- self.assertDefectsEqual(parts[0].all_defects, [])
- self.assertDefectsEqual(parts[1].all_defects, [])
- self.assertDefectsEqual(parts[2].all_defects, [errors.UndecodableBytesDefect])
+ self.assertEqual(x.token_type, 'fws')
class TestParserMixin:
@@ -139,7 +128,6 @@ def test_get_encoded_word_sets_extra_attributes(self):
'first second',
[],
'')
- self.assertEqual(ew.encoded, '=?us-ascii*jive?q?first_second?=')
self.assertEqual(ew.charset, 'us-ascii')
self.assertEqual(ew.lang, 'jive')
@@ -150,7 +138,6 @@ def test_get_encoded_word_lang_default_is_blank(self):
'first second',
[],
'')
- self.assertEqual(ew.encoded, '=?us-ascii?q?first_second?=')
self.assertEqual(ew.charset, 'us-ascii')
self.assertEqual(ew.lang, '')
@@ -2700,28 +2687,37 @@ def test_address_list_with_unicode_names_in_quotes(self):
# and with unicode tokens in the comments. Spaces inside the quotes
# currently don't do the right thing.
- def test_initial_whitespace_splitting(self):
+ def test_split_at_whitespace_after_header_before_long_token(self):
body = parser.get_unstructured(' ' + 'x'*77)
header = parser.Header([
parser.HeaderLabel([parser.ValueTerminal('test:', 'atext')]),
parser.CFWSList([parser.WhiteSpaceTerminal(' ', 'fws')]), body])
self._test(header, 'test: \n ' + 'x'*77 + '\n')
- def test_whitespace_splitting(self):
+ def test_split_at_whitespace_before_long_token(self):
self._test(parser.get_unstructured('xxx ' + 'y'*77),
'xxx \n ' + 'y'*77 + '\n')
+ def test_overlong_encodeable_is_wrapped(self):
+ first_token_with_whitespace = 'xxx '
+ chrome_leader = '=?utf-8?q?'
+ len_chrome = len(chrome_leader) + 2
+ len_non_y = len_chrome + len(first_token_with_whitespace)
+ self._test(parser.get_unstructured(first_token_with_whitespace +
+ 'y'*80),
+ first_token_with_whitespace + chrome_leader +
+ 'y'*(78-len_non_y) + '?=\n' +
+ ' ' + chrome_leader + 'y'*(80-(78-len_non_y)) + '?=\n')
+
def test_long_filename_attachment(self):
- folded = self.policy.fold('Content-Disposition', 'attachment; filename="TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TES.txt"')
- self.assertEqual(
- 'Content-Disposition: attachment;\n filename="TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TES.txt"\n',
- folded
- )
- folded = self.policy.fold('Content-Disposition', 'attachment; filename="TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_T.txt"')
- self.assertEqual(
- 'Content-Disposition: attachment;\n filename="TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_T.txt"\n',
- folded
- )
+ self._test(parser.parse_content_disposition_header(
+ 'attachment; filename="TEST_TEST_TEST_TEST'
+ '_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TES.txt"'),
+ "attachment;\n"
+ " filename*0*=us-ascii''TEST_TEST_TEST_TEST_TEST_TEST"
+ "_TEST_TEST_TEST_TEST_TEST;\n"
+ " filename*1*=_TEST_TES.txt\n",
+ )
if __name__ == '__main__':
unittest.main()
diff --git a/Lib/test/test_email/test_generator.py b/Lib/test/test_email/test_generator.py
index c4f182903af..c1aeaefab77 100644
--- a/Lib/test/test_email/test_generator.py
+++ b/Lib/test/test_email/test_generator.py
@@ -27,7 +27,6 @@ def msgmaker(self, msg, policy=None):
None
"""),
- # From is wrapped because wrapped it fits in 40.
40: textwrap.dedent("""\
To: whom_it_may_concern(a)example.com
From:
@@ -40,11 +39,11 @@ def msgmaker(self, msg, policy=None):
None
"""),
- # Neither to nor from fit even if put on a new line,
- # so we leave them sticking out on the first line.
20: textwrap.dedent("""\
- To: whom_it_may_concern(a)example.com
- From: nobody_you_want_to_know(a)example.com
+ To:
+ whom_it_may_concern(a)example.com
+ From:
+ nobody_you_want_to_know(a)example.com
Subject: We the
willing led by the
unknowing are doing
@@ -169,6 +168,53 @@ def test_compat32_max_line_length_does_not_fold_when_none(self):
g.flatten(msg)
self.assertEqual(s.getvalue(), self.typ(self.refold_long_expected[0]))
+ def test_rfc2231_wrapping(self):
+ # This is pretty much just to make sure we don't have an infinite
+ # loop; I don't expect anyone to hit this in the field.
+ msg = self.msgmaker(self.typ(textwrap.dedent("""\
+ To: nobody
+ Content-Disposition: attachment;
+ filename="afilenamelongenoghtowraphere"
+
+ None
+ """)))
+ expected = textwrap.dedent("""\
+ To: nobody
+ Content-Disposition: attachment;
+ filename*0*=us-ascii''afilename;
+ filename*1*=longenoghtowraphere
+
+ None
+ """)
+ s = self.ioclass()
+ g = self.genclass(s, policy=self.policy.clone(max_line_length=33))
+ g.flatten(msg)
+ self.assertEqual(s.getvalue(), self.typ(expected))
+
+ def test_rfc2231_wrapping_switches_to_default_len_if_too_narrow(self):
+ # This is just to make sure we don't have an infinite loop; I don't
+ # expect anyone to hit this in the field, so I'm not bothering to make
+ # the result optimal (the encoding isn't needed).
+ msg = self.msgmaker(self.typ(textwrap.dedent("""\
+ To: nobody
+ Content-Disposition: attachment;
+ filename="afilenamelongenoghtowraphere"
+
+ None
+ """)))
+ expected = textwrap.dedent("""\
+ To: nobody
+ Content-Disposition:
+ attachment;
+ filename*0*=us-ascii''afilenamelongenoghtowraphere
+
+ None
+ """)
+ s = self.ioclass()
+ g = self.genclass(s, policy=self.policy.clone(max_line_length=20))
+ g.flatten(msg)
+ self.assertEqual(s.getvalue(), self.typ(expected))
+
class TestGenerator(TestGeneratorBase, TestEmailBase):
diff --git a/Lib/test/test_email/test_headerregistry.py b/Lib/test/test_email/test_headerregistry.py
index af836dc9726..30ce0ba54e4 100644
--- a/Lib/test/test_email/test_headerregistry.py
+++ b/Lib/test/test_email/test_headerregistry.py
@@ -229,14 +229,14 @@ def content_type_as_value(self,
defects = args[1] if l>1 else []
decoded = args[2] if l>2 and args[2] is not DITTO else source
header = 'Content-Type:' + ' ' if source else ''
- folded = args[3] if l>3 else header + source + '\n'
+ folded = args[3] if l>3 else header + decoded + '\n'
h = self.make_header('Content-Type', source)
self.assertEqual(h.content_type, content_type)
self.assertEqual(h.maintype, maintype)
self.assertEqual(h.subtype, subtype)
self.assertEqual(h.params, parmdict)
with self.assertRaises(TypeError):
- h.params['abc'] = 'xyz' # params is read-only.
+ h.params['abc'] = 'xyz' # make sure params is read-only.
self.assertDefectsEqual(h.defects, defects)
self.assertEqual(h, decoded)
self.assertEqual(h.fold(policy=policy.default), folded)
@@ -373,9 +373,10 @@ def content_type_as_value(self,
'text/plain; Charset="utf-8"'),
# Since this is pretty much the ur-mimeheader, we'll put all the tests
- # that exercise the parameter parsing and formatting here.
- #
- # XXX: question: is minimal quoting preferred?
+ # that exercise the parameter parsing and formatting here. Note that
+ # when we refold we may canonicalize, so things like whitespace,
+ # quoting, and rfc2231 encoding may change from what was in the input
+ # header.
'unquoted_param_value': (
'text/plain; title=foo',
@@ -384,7 +385,8 @@ def content_type_as_value(self,
'plain',
{'title': 'foo'},
[],
- 'text/plain; title="foo"'),
+ 'text/plain; title="foo"',
+ ),
'param_value_with_tspecials': (
'text/plain; title="(bar)foo blue"',
@@ -415,7 +417,8 @@ def content_type_as_value(self,
'mixed',
{'boundary': 'CPIMSSMTPC06p5f3tG'},
[],
- 'Multipart/mixed; boundary="CPIMSSMTPC06p5f3tG"'),
+ 'Multipart/mixed; boundary="CPIMSSMTPC06p5f3tG"',
+ ),
'spaces_around_semis': (
('image/jpeg; name="wibble.JPG" ; x-mac-type="4A504547" ; '
@@ -429,14 +432,31 @@ def content_type_as_value(self,
[],
('image/jpeg; name="wibble.JPG"; x-mac-type="4A504547"; '
'x-mac-creator="474B4F4E"'),
- # XXX: it could be that we will eventually prefer to fold starting
- # from the decoded value, in which case these spaces and similar
- # spaces in other tests will be wrong.
- ('Content-Type: image/jpeg; name="wibble.JPG" ; '
- 'x-mac-type="4A504547" ;\n'
+ ('Content-Type: image/jpeg; name="wibble.JPG";'
+ ' x-mac-type="4A504547";\n'
' x-mac-creator="474B4F4E"\n'),
),
+ 'lots_of_mime_params': (
+ ('image/jpeg; name="wibble.JPG"; x-mac-type="4A504547"; '
+ 'x-mac-creator="474B4F4E"; x-extrastuff="make it longer"'),
+ 'image/jpeg',
+ 'image',
+ 'jpeg',
+ {'name': 'wibble.JPG',
+ 'x-mac-type': '4A504547',
+ 'x-mac-creator': '474B4F4E',
+ 'x-extrastuff': 'make it longer'},
+ [],
+ ('image/jpeg; name="wibble.JPG"; x-mac-type="4A504547"; '
+ 'x-mac-creator="474B4F4E"; x-extrastuff="make it longer"'),
+ # In this case the whole of the MimeParameters does *not* fit
+            # on one line, so we break at a lower syntactic level.
+ ('Content-Type: image/jpeg; name="wibble.JPG";'
+ ' x-mac-type="4A504547";\n'
+ ' x-mac-creator="474B4F4E"; x-extrastuff="make it longer"\n'),
+ ),
+
'semis_inside_quotes': (
'image/jpeg; name="Jim&&Jill"',
'image/jpeg',
@@ -460,19 +480,25 @@ def content_type_as_value(self,
[],
r'image/jpeg; name="Jim \"Bob\" Jill"'),
- # XXX: This test works except for the refolding of the header. I'll
- # deal with that bug when I deal with the other folding bugs.
- #'non_ascii_in_params': (
- # ('foo\xa7/bar; b\xa7r=two; '
- # 'baz=thr\xa7e'.encode('latin-1').decode('us-ascii',
- # 'surrogateescape')),
- # 'foo\uFFFD/bar',
- # 'foo\uFFFD',
- # 'bar',
- # {'b\uFFFDr': 'two', 'baz': 'thr\uFFFDe'},
- # [errors.UndecodableBytesDefect]*3,
- # 'foo�/bar; b�r="two"; baz="thr�e"',
- # ),
+ 'non_ascii_in_params': (
+ ('foo\xa7/bar; b\xa7r=two; '
+ 'baz=thr\xa7e'.encode('latin-1').decode('us-ascii',
+ 'surrogateescape')),
+ 'foo\uFFFD/bar',
+ 'foo\uFFFD',
+ 'bar',
+ {'b\uFFFDr': 'two', 'baz': 'thr\uFFFDe'},
+ [errors.UndecodableBytesDefect]*3,
+ 'foo�/bar; b�r="two"; baz="thr�e"',
+ # XXX Two bugs here: the mime type is not allowed to be an encoded
+ # word, and we shouldn't be emitting surrogates in the parameter
+ # names. But I don't know what the behavior should be here, so I'm
+ # punting for now. In practice this is unlikely to be encountered
+ # since headers with binary in them only come from a binary source
+ # and are almost certain to be re-emitted without refolding.
+ 'Content-Type: =?unknown-8bit?q?foo=A7?=/bar; b\udca7r="two";\n'
+ " baz*=unknown-8bit''thr%A7e\n",
+ ),
# RFC 2231 parameter tests.
@@ -494,19 +520,20 @@ def content_type_as_value(self,
[],
r'image/jpeg; bar="baz\"foobar\"baz"'),
- # XXX: This test works except for the refolding of the header. I'll
- # deal with that bug when I deal with the other folding bugs.
- #'non_ascii_rfc2231_value': (
- # ('text/plain; charset=us-ascii; '
- # "title*=us-ascii'en'This%20is%20"
- # 'not%20f\xa7n').encode('latin-1').decode('us-ascii',
- # 'surrogateescape'),
- # 'text/plain',
- # 'text',
- # 'plain',
- # {'charset': 'us-ascii', 'title': 'This is not f\uFFFDn'},
- # [errors.UndecodableBytesDefect],
- # 'text/plain; charset="us-ascii"; title="This is not f�n"'),
+ 'non_ascii_rfc2231_value': (
+ ('text/plain; charset=us-ascii; '
+ "title*=us-ascii'en'This%20is%20"
+ 'not%20f\xa7n').encode('latin-1').decode('us-ascii',
+ 'surrogateescape'),
+ 'text/plain',
+ 'text',
+ 'plain',
+ {'charset': 'us-ascii', 'title': 'This is not f\uFFFDn'},
+ [errors.UndecodableBytesDefect],
+ 'text/plain; charset="us-ascii"; title="This is not f�n"',
+ 'Content-Type: text/plain; charset="us-ascii";\n'
+ " title*=unknown-8bit''This%20is%20not%20f%A7n\n",
+ ),
'rfc2231_encoded_charset': (
'text/plain; charset*=ansi-x3.4-1968\'\'us-ascii',
@@ -529,8 +556,6 @@ def content_type_as_value(self,
{'name': 'This is ***fun*** is it not.pdf'},
[],
'text/plain; name="This is ***fun*** is it not.pdf"',
- ('Content-Type: text/plain;\tname*0*=\'\'This%20is%20;\n'
- '\tname*1*=%2A%2A%2Afun%2A%2A%2A%20;\tname*2="is it not.pdf"\n'),
),
# Make sure we also handle it if there are spurious double quotes.
@@ -545,9 +570,6 @@ def content_type_as_value(self,
{'name': 'This is even more ***fun*** is it not.pdf'},
[errors.InvalidHeaderDefect]*2,
'text/plain; name="This is even more ***fun*** is it not.pdf"',
- ('Content-Type: text/plain;\t'
- 'name*0*="us-ascii\'\'This%20is%20even%20more%20";\n'
- '\tname*1*="%2A%2A%2Afun%2A%2A%2A%20";\tname*2="is it not.pdf"\n'),
),
'rfc2231_single_quote_inside_double_quotes': (
@@ -562,9 +584,8 @@ def content_type_as_value(self,
[errors.InvalidHeaderDefect]*2,
('text/plain; charset="us-ascii"; '
'title="This is really ***fun*** isn\'t it!"'),
- ('Content-Type: text/plain; charset=us-ascii;\n'
- '\ttitle*0*="us-ascii\'en\'This%20is%20really%20";\n'
- '\ttitle*1*="%2A%2A%2Afun%2A%2A%2A%20";\ttitle*2="isn\'t it!"\n'),
+ ('Content-Type: text/plain; charset="us-ascii";\n'
+ ' title="This is really ***fun*** isn\'t it!"\n'),
),
'rfc2231_single_quote_in_value_with_charset_and_lang': (
@@ -576,9 +597,6 @@ def content_type_as_value(self,
{'name': "Frank's Document"},
[errors.InvalidHeaderDefect]*2,
'application/x-foo; name="Frank\'s Document"',
- ('Content-Type: application/x-foo;\t'
- 'name*0*="us-ascii\'en-us\'Frank\'s";\n'
- ' name*1*=" Document"\n'),
),
'rfc2231_single_quote_in_non_encoded_value': (
@@ -590,9 +608,6 @@ def content_type_as_value(self,
{'name': "us-ascii'en-us'Frank's Document"},
[],
'application/x-foo; name="us-ascii\'en-us\'Frank\'s Document"',
- ('Content-Type: application/x-foo;\t'
- 'name*0="us-ascii\'en-us\'Frank\'s";\n'
- ' name*1=" Document"\n'),
),
'rfc2231_no_language_or_charset': (
@@ -615,12 +630,8 @@ def content_type_as_value(self,
{'name': 'This is even more ***fun*** is it.pdf'},
[errors.InvalidHeaderDefect]*2,
'text/plain; name="This is even more ***fun*** is it.pdf"',
- ('Content-Type: text/plain;\t'
- 'name*0*="\'\'This%20is%20even%20more%20";\n'
- '\tname*1*="%2A%2A%2Afun%2A%2A%2A%20";\tname*2="is it.pdf"\n'),
),
- # XXX: see below...the first name line here should be *0 not *0*.
'rfc2231_partly_encoded': (
("text/plain;"
'\tname*0*="\'\'This%20is%20even%20more%20";'
@@ -632,9 +643,6 @@ def content_type_as_value(self,
{'name': 'This is even more ***fun*** is it.pdf'},
[errors.InvalidHeaderDefect]*2,
'text/plain; name="This is even more ***fun*** is it.pdf"',
- ('Content-Type: text/plain;\t'
- 'name*0*="\'\'This%20is%20even%20more%20";\n'
- '\tname*1*="%2A%2A%2Afun%2A%2A%2A%20";\tname*2="is it.pdf"\n'),
),
'rfc2231_partly_encoded_2': (
@@ -647,10 +655,11 @@ def content_type_as_value(self,
'plain',
{'name': 'This is even more %2A%2A%2Afun%2A%2A%2A%20is it.pdf'},
[errors.InvalidHeaderDefect],
- 'text/plain; name="This is even more %2A%2A%2Afun%2A%2A%2A%20is it.pdf"',
- ('Content-Type: text/plain;\t'
- 'name*0*="\'\'This%20is%20even%20more%20";\n'
- '\tname*1="%2A%2A%2Afun%2A%2A%2A%20";\tname*2="is it.pdf"\n'),
+ ('text/plain;'
+ ' name="This is even more %2A%2A%2Afun%2A%2A%2A%20is it.pdf"'),
+ ('Content-Type: text/plain;\n'
+ ' name="This is even more %2A%2A%2Afun%2A%2A%2A%20is'
+ ' it.pdf"\n'),
),
'rfc2231_unknown_charset_treated_as_ascii': (
@@ -669,9 +678,12 @@ def content_type_as_value(self,
'plain',
{'charset': 'utf-8\uFFFD\uFFFD\uFFFD'},
[errors.UndecodableBytesDefect],
- 'text/plain; charset="utf-8\uFFFD\uFFFD\uFFFD"'),
+ 'text/plain; charset="utf-8\uFFFD\uFFFD\uFFFD"',
+ "Content-Type: text/plain;"
+ " charset*=unknown-8bit''utf-8%F1%F2%F3\n",
+ ),
- 'rfc2231_utf_8_in_supposedly_ascii_charset_parameter_value': (
+ 'rfc2231_utf8_in_supposedly_ascii_charset_parameter_value': (
"text/plain; charset*=ascii''utf-8%E2%80%9D",
'text/plain',
'text',
@@ -679,9 +691,11 @@ def content_type_as_value(self,
{'charset': 'utf-8”'},
[errors.UndecodableBytesDefect],
'text/plain; charset="utf-8”"',
+ # XXX Should folding change the charset to utf8? Currently it just
+ # reproduces the original, which is arguably fine.
+ "Content-Type: text/plain;"
+ " charset*=unknown-8bit''utf-8%E2%80%9D\n",
),
- # XXX: if the above were *re*folded, it would get tagged as utf-8
- # instead of ascii in the param, since it now contains non-ASCII.
'rfc2231_encoded_then_unencoded_segments': (
('application/x-foo;'
@@ -694,9 +708,6 @@ def content_type_as_value(self,
{'name': 'My Document For You'},
[errors.InvalidHeaderDefect],
'application/x-foo; name="My Document For You"',
- ('Content-Type: application/x-foo;\t'
- 'name*0*="us-ascii\'en-us\'My";\n'
- '\tname*1=" Document";\tname*2=" For You"\n'),
),
# My reading of the RFC is that this is an invalid header. The RFC
@@ -713,11 +724,6 @@ def content_type_as_value(self,
{'name': 'My Document For You'},
[errors.InvalidHeaderDefect]*3,
'application/x-foo; name="My Document For You"',
- ("Content-Type: application/x-foo;\tname*0=us-ascii'en-us'My;\t"
- # XXX: the newline is in the wrong place, come back and fix
- # this when the rest of tests pass.
- 'name*1*=" Document"\n;'
- '\tname*2*=" For You"\n'),
),
# XXX: I would say this one should default to ascii/en for the
@@ -730,8 +736,7 @@ def content_type_as_value(self,
# charset'lang'value pattern exactly *and* there is at least one
# encoded segment. Implementing that algorithm will require some
# refactoring, so I haven't done it (yet).
-
- 'rfc2231_qouted_unencoded_then_encoded_segments': (
+ 'rfc2231_quoted_unencoded_then_encoded_segments': (
('application/x-foo;'
'\tname*0="us-ascii\'en-us\'My";'
'\tname*1*=" Document";'
@@ -742,9 +747,25 @@ def content_type_as_value(self,
{'name': "us-ascii'en-us'My Document For You"},
[errors.InvalidHeaderDefect]*2,
'application/x-foo; name="us-ascii\'en-us\'My Document For You"',
- ('Content-Type: application/x-foo;\t'
- 'name*0="us-ascii\'en-us\'My";\n'
- '\tname*1*=" Document";\tname*2*=" For You"\n'),
+ ),
+
+ # Make sure our folding algorithm produces multiple sections correctly.
+ # We could mix encoded and non-encoded segments, but we don't, we just
+ # make them all encoded. It might be worth fixing that, since the
+ # sections can get used for wrapping ascii text.
+ 'rfc2231_folded_segments_correctly_formatted': (
+ ('application/x-foo;'
+ '\tname="' + "with spaces"*8 + '"'),
+ 'application/x-foo',
+ 'application',
+ 'x-foo',
+ {'name': "with spaces"*8},
+ [],
+ 'application/x-foo; name="' + "with spaces"*8 + '"',
+ "Content-Type: application/x-foo;\n"
+ " name*0*=us-ascii''with%20spaceswith%20spaceswith%20spaceswith"
+ "%20spaceswith;\n"
+ " name*1*=%20spaceswith%20spaceswith%20spaceswith%20spaces\n"
),
}
@@ -827,8 +848,8 @@ def content_disp_as_value(self,
[],
('attachment; filename="genome.jpeg"; '
'modification-date="Wed, 12 Feb 1997 16:29:51 -0500"'),
- ('Content-Disposition: attachment; filename=genome.jpeg;\n'
- ' modification-date="Wed, 12 Feb 1997 16:29:51 -0500";\n'),
+ ('Content-Disposition: attachment; filename="genome.jpeg";\n'
+ ' modification-date="Wed, 12 Feb 1997 16:29:51 -0500"\n'),
),
'no_value': (
@@ -873,7 +894,7 @@ def version_string_as_MIME_Version(self,
if source:
source = ' ' + source
self.assertEqual(h.fold(policy=policy.default),
- 'MIME-Version:' + source + '\n')
+ 'MIME-Version:' + source + '\n')
version_string_params = {
@@ -1546,15 +1567,39 @@ def test_fold_unstructured_with_overlong_word(self):
'singlewordthatwontfit')
self.assertEqual(
h.fold(policy=policy.default.clone(max_line_length=20)),
- 'Subject: thisisaverylonglineconsistingofasinglewordthatwontfit\n')
+ 'Subject: \n'
+ ' =?utf-8?q?thisisa?=\n'
+ ' =?utf-8?q?verylon?=\n'
+ ' =?utf-8?q?glineco?=\n'
+ ' =?utf-8?q?nsistin?=\n'
+ ' =?utf-8?q?gofasin?=\n'
+ ' =?utf-8?q?gleword?=\n'
+ ' =?utf-8?q?thatwon?=\n'
+ ' =?utf-8?q?tfit?=\n'
+ )
def test_fold_unstructured_with_two_overlong_words(self):
h = self.make_header('Subject', 'thisisaverylonglineconsistingofa'
'singlewordthatwontfit plusanotherverylongwordthatwontfit')
self.assertEqual(
h.fold(policy=policy.default.clone(max_line_length=20)),
- 'Subject: thisisaverylonglineconsistingofasinglewordthatwontfit\n'
- ' plusanotherverylongwordthatwontfit\n')
+ 'Subject: \n'
+ ' =?utf-8?q?thisisa?=\n'
+ ' =?utf-8?q?verylon?=\n'
+ ' =?utf-8?q?glineco?=\n'
+ ' =?utf-8?q?nsistin?=\n'
+ ' =?utf-8?q?gofasin?=\n'
+ ' =?utf-8?q?gleword?=\n'
+ ' =?utf-8?q?thatwon?=\n'
+ ' =?utf-8?q?tfit_pl?=\n'
+ ' =?utf-8?q?usanoth?=\n'
+ ' =?utf-8?q?erveryl?=\n'
+ ' =?utf-8?q?ongword?=\n'
+ ' =?utf-8?q?thatwon?=\n'
+ ' =?utf-8?q?tfit?=\n'
+ )
+
+ # XXX Need test for when max_line_length is less than the chrome size.
def test_fold_unstructured_with_slightly_long_word(self):
h = self.make_header('Subject', 'thislongwordislessthanmaxlinelen')
@@ -1590,6 +1635,18 @@ def test_fold_date_header(self):
self.assertEqual(h.fold(policy=policy.default),
'Date: Sat, 02 Feb 2002 17:00:06 -0800\n')
+ def test_fold_overlong_words_using_RFC2047(self):
+ h = self.make_header(
+ 'X-Report-Abuse',
+ '<https://www.mailitapp.com/report_abuse.php?'
+ 'mid=xxx-xxx-xxxxxxxxxxxxxxxxxxxxxxxx==-xxx-xx-xx>')
+ self.assertEqual(
+ h.fold(policy=policy.default),
+ 'X-Report-Abuse: =?utf-8?q?=3Chttps=3A//www=2Emailitapp=2E'
+ 'com/report=5F?=\n'
+ ' =?utf-8?q?abuse=2Ephp=3Fmid=3Dxxx-xxx-xxxx'
+ 'xxxxxxxxxxxxxxxxxxxx=3D=3D-xxx-?=\n'
+ ' =?utf-8?q?xx-xx=3E?=\n')
if __name__ == '__main__':
diff --git a/Misc/NEWS.d/next/Library/2017-12-02-16-06-00.bpo-27240.Kji34M.rst b/Misc/NEWS.d/next/Library/2017-12-02-16-06-00.bpo-27240.Kji34M.rst
new file mode 100644
index 00000000000..c933ee7d916
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2017-12-02-16-06-00.bpo-27240.Kji34M.rst
@@ -0,0 +1,3 @@
+The header folding algorithm for the new email policies has been rewritten,
+which also fixes bpo-30788, bpo-31831, and bpo-32182. In particular, RFC2231
+folding is now done correctly.
1
0
bpo-31619: Fixed integer overflow in converting huge strings to int. (GH-3884) (#4690)
by Serhiy Storchaka 03 Dec '17
by Serhiy Storchaka 03 Dec '17
03 Dec '17
https://github.com/python/cpython/commit/30a6bc842945e3e9c9c7db887ab495c428…
commit: 30a6bc842945e3e9c9c7db887ab495c428ec7074
branch: 3.6
author: Miss Islington (bot) <31488909+miss-islington(a)users.noreply.github.com>
committer: Serhiy Storchaka <storchaka(a)gmail.com>
date: 2017-12-03T23:27:21+02:00
summary:
bpo-31619: Fixed integer overflow in converting huge strings to int. (GH-3884) (#4690)
(cherry picked from commit 29ba688034fc4eef0693b86002cf7bee55d692af)
files:
M Objects/longobject.c
diff --git a/Objects/longobject.c b/Objects/longobject.c
index c0cd7c12be2..c3c0949189d 100644
--- a/Objects/longobject.c
+++ b/Objects/longobject.c
@@ -2016,7 +2016,7 @@ long_from_binary_base(const char **str, int base, PyLongObject **res)
const char *p = *str;
const char *start = p;
char prev = 0;
- int digits = 0;
+ Py_ssize_t digits = 0;
int bits_per_char;
Py_ssize_t n;
PyLongObject *z;
@@ -2259,8 +2259,9 @@ just 1 digit at the start, so that the copying code was exercised for every
digit beyond the first.
***/
twodigits c; /* current input character */
+ double fsize_z;
Py_ssize_t size_z;
- int digits = 0;
+ Py_ssize_t digits = 0;
int i;
int convwidth;
twodigits convmultmax, convmult;
@@ -2322,7 +2323,14 @@ digit beyond the first.
* need to initialize z->ob_digit -- no slot is read up before
* being stored into.
*/
- size_z = (Py_ssize_t)(digits * log_base_BASE[base]) + 1;
+ fsize_z = digits * log_base_BASE[base] + 1;
+ if (fsize_z > MAX_LONG_DIGITS) {
+ /* The same exception as in _PyLong_New(). */
+ PyErr_SetString(PyExc_OverflowError,
+ "too many digits in integer");
+ return NULL;
+ }
+ size_z = (Py_ssize_t)fsize_z;
/* Uncomment next line to test exceedingly rare copy code */
/* size_z = 1; */
assert(size_z > 0);
1
0
bpo-31619: Fixed integer overflow in converting huge strings to int. (#3884)
by Serhiy Storchaka 03 Dec '17
by Serhiy Storchaka 03 Dec '17
03 Dec '17
https://github.com/python/cpython/commit/29ba688034fc4eef0693b86002cf7bee55…
commit: 29ba688034fc4eef0693b86002cf7bee55d692af
branch: master
author: Serhiy Storchaka <storchaka(a)gmail.com>
committer: GitHub <noreply(a)github.com>
date: 2017-12-03T22:16:21+02:00
summary:
bpo-31619: Fixed integer overflow in converting huge strings to int. (#3884)
files:
M Objects/longobject.c
diff --git a/Objects/longobject.c b/Objects/longobject.c
index c3dc59ee9e9..a7f496825eb 100644
--- a/Objects/longobject.c
+++ b/Objects/longobject.c
@@ -2024,7 +2024,7 @@ long_from_binary_base(const char **str, int base, PyLongObject **res)
const char *p = *str;
const char *start = p;
char prev = 0;
- int digits = 0;
+ Py_ssize_t digits = 0;
int bits_per_char;
Py_ssize_t n;
PyLongObject *z;
@@ -2267,8 +2267,9 @@ just 1 digit at the start, so that the copying code was exercised for every
digit beyond the first.
***/
twodigits c; /* current input character */
+ double fsize_z;
Py_ssize_t size_z;
- int digits = 0;
+ Py_ssize_t digits = 0;
int i;
int convwidth;
twodigits convmultmax, convmult;
@@ -2330,7 +2331,14 @@ digit beyond the first.
* need to initialize z->ob_digit -- no slot is read up before
* being stored into.
*/
- size_z = (Py_ssize_t)(digits * log_base_BASE[base]) + 1;
+ fsize_z = digits * log_base_BASE[base] + 1;
+ if (fsize_z > MAX_LONG_DIGITS) {
+ /* The same exception as in _PyLong_New(). */
+ PyErr_SetString(PyExc_OverflowError,
+ "too many digits in integer");
+ return NULL;
+ }
+ size_z = (Py_ssize_t)fsize_z;
/* Uncomment next line to test exceedingly rare copy code */
/* size_z = 1; */
assert(size_z > 0);
1
0
bpo-32137: The repr of deeply nested dict now raises a RecursionError (#4570)
by Serhiy Storchaka 03 Dec '17
by Serhiy Storchaka 03 Dec '17
03 Dec '17
https://github.com/python/cpython/commit/1fb72d2ad243c965d4432b4e9388406400…
commit: 1fb72d2ad243c965d4432b4e93884064001a2607
branch: master
author: Serhiy Storchaka <storchaka(a)gmail.com>
committer: GitHub <noreply(a)github.com>
date: 2017-12-03T22:12:11+02:00
summary:
bpo-32137: The repr of deeply nested dict now raises a RecursionError (#4570)
instead of crashing due to a stack overflow.
This perhaps will fix similar problems in other extension types.
files:
A Misc/NEWS.d/next/Core and Builtins/2017-11-26-14-36-30.bpo-32137.Stj5nL.rst
M Lib/test/list_tests.py
M Lib/test/mapping_tests.py
M Lib/test/test_dict.py
M Objects/listobject.c
M Objects/object.c
M Objects/tupleobject.c
diff --git a/Lib/test/list_tests.py b/Lib/test/list_tests.py
index ce9db9a1b8b..ed63fda20c1 100644
--- a/Lib/test/list_tests.py
+++ b/Lib/test/list_tests.py
@@ -53,10 +53,11 @@ def test_repr(self):
self.assertEqual(str(a2), "[0, 1, 2, [...], 3]")
self.assertEqual(repr(a2), "[0, 1, 2, [...], 3]")
- l0 = []
+ def test_repr_deep(self):
+ a = self.type2test([])
for i in range(sys.getrecursionlimit() + 100):
- l0 = [l0]
- self.assertRaises(RecursionError, repr, l0)
+ a = self.type2test([a])
+ self.assertRaises(RecursionError, repr, a)
def test_print(self):
d = self.type2test(range(200))
diff --git a/Lib/test/mapping_tests.py b/Lib/test/mapping_tests.py
index ff82f4eb7d8..53f29f60538 100644
--- a/Lib/test/mapping_tests.py
+++ b/Lib/test/mapping_tests.py
@@ -1,6 +1,7 @@
# tests common to dict and UserDict
import unittest
import collections
+import sys
class BasicTestMappingProtocol(unittest.TestCase):
@@ -619,6 +620,14 @@ def __repr__(self):
d = self._full_mapping({1: BadRepr()})
self.assertRaises(Exc, repr, d)
+ def test_repr_deep(self):
+ d = self._empty_mapping()
+ for i in range(sys.getrecursionlimit() + 100):
+ d0 = d
+ d = self._empty_mapping()
+ d[1] = d0
+ self.assertRaises(RecursionError, repr, d)
+
def test_eq(self):
self.assertEqual(self._empty_mapping(), self._empty_mapping())
self.assertEqual(self._full_mapping({1: 2}),
diff --git a/Lib/test/test_dict.py b/Lib/test/test_dict.py
index 8013f37c88d..4386eda3ae4 100644
--- a/Lib/test/test_dict.py
+++ b/Lib/test/test_dict.py
@@ -468,6 +468,12 @@ def __repr__(self):
d = {1: BadRepr()}
self.assertRaises(Exc, repr, d)
+ def test_repr_deep(self):
+ d = {}
+ for i in range(sys.getrecursionlimit() + 100):
+ d = {1: d}
+ self.assertRaises(RecursionError, repr, d)
+
def test_eq(self):
self.assertEqual({}, {})
self.assertEqual({1: 2}, {1: 2})
diff --git a/Misc/NEWS.d/next/Core and Builtins/2017-11-26-14-36-30.bpo-32137.Stj5nL.rst b/Misc/NEWS.d/next/Core and Builtins/2017-11-26-14-36-30.bpo-32137.Stj5nL.rst
new file mode 100644
index 00000000000..f8f4ab93c9e
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2017-11-26-14-36-30.bpo-32137.Stj5nL.rst
@@ -0,0 +1,2 @@
+The repr of deeply nested dict now raises a RecursionError instead of
+crashing due to a stack overflow.
diff --git a/Objects/listobject.c b/Objects/listobject.c
index 8576b7ae683..8794e37364a 100644
--- a/Objects/listobject.c
+++ b/Objects/listobject.c
@@ -364,10 +364,7 @@ list_repr(PyListObject *v)
goto error;
}
- if (Py_EnterRecursiveCall(" while getting the repr of a list"))
- goto error;
s = PyObject_Repr(v->ob_item[i]);
- Py_LeaveRecursiveCall();
if (s == NULL)
goto error;
diff --git a/Objects/object.c b/Objects/object.c
index 674180d7203..a0d651d0805 100644
--- a/Objects/object.c
+++ b/Objects/object.c
@@ -463,7 +463,12 @@ PyObject_Repr(PyObject *v)
assert(!PyErr_Occurred());
#endif
+ /* It is possible for a type to have a tp_repr representation that loops
+ infinitely. */
+ if (Py_EnterRecursiveCall(" while getting the repr of an object"))
+ return NULL;
res = (*v->ob_type->tp_repr)(v);
+ Py_LeaveRecursiveCall();
if (res == NULL)
return NULL;
if (!PyUnicode_Check(res)) {
diff --git a/Objects/tupleobject.c b/Objects/tupleobject.c
index 964db3bb8de..3a609461251 100644
--- a/Objects/tupleobject.c
+++ b/Objects/tupleobject.c
@@ -303,10 +303,7 @@ tuplerepr(PyTupleObject *v)
goto error;
}
- if (Py_EnterRecursiveCall(" while getting the repr of a tuple"))
- goto error;
s = PyObject_Repr(v->ob_item[i]);
- Py_LeaveRecursiveCall();
if (s == NULL)
goto error;
1
0
https://github.com/python/cpython/commit/eea3cc1ef0dec0af193eedb4c1164263fb…
commit: eea3cc1ef0dec0af193eedb4c1164263fbdfd8cc
branch: master
author: Neil Schemenauer <nas-github(a)arctrix.com>
committer: GitHub <noreply(a)github.com>
date: 2017-12-03T09:26:03-08:00
summary:
Refactor PyImport_ImportModuleLevelObject(). (#4680)
Add import_find_and_load() helper function. The addition of
the importtime option has made PyImport_ImportModuleLevelObject() large
and so using a helper seems worthwhile. It also makes it clearer that
abs_name is the only argument needed by _find_and_load().
files:
M Python/import.c
diff --git a/Python/import.c b/Python/import.c
index d2ed785c9f9..57521e49207 100644
--- a/Python/import.c
+++ b/Python/import.c
@@ -1589,12 +1589,67 @@ resolve_name(PyObject *name, PyObject *globals, int level)
return NULL;
}
+static PyObject *
+import_find_and_load(PyObject *abs_name)
+{
+ _Py_IDENTIFIER(_find_and_load);
+ PyObject *mod = NULL;
+ PyInterpreterState *interp = PyThreadState_GET()->interp;
+ int import_time = interp->core_config.import_time;
+ static int import_level;
+ static _PyTime_t accumulated;
+
+ _PyTime_t t1 = 0, accumulated_copy = accumulated;
+
+ /* XOptions is initialized after first some imports.
+ * So we can't have negative cache before completed initialization.
+ * Anyway, importlib._find_and_load is much slower than
+ * _PyDict_GetItemIdWithError().
+ */
+ if (import_time) {
+ static int header = 1;
+ if (header) {
+ fputs("import time: self [us] | cumulative | imported package\n",
+ stderr);
+ header = 0;
+ }
+
+ import_level++;
+ t1 = _PyTime_GetPerfCounter();
+ accumulated = 0;
+ }
+
+ if (PyDTrace_IMPORT_FIND_LOAD_START_ENABLED())
+ PyDTrace_IMPORT_FIND_LOAD_START(PyUnicode_AsUTF8(abs_name));
+
+ mod = _PyObject_CallMethodIdObjArgs(interp->importlib,
+ &PyId__find_and_load, abs_name,
+ interp->import_func, NULL);
+
+ if (PyDTrace_IMPORT_FIND_LOAD_DONE_ENABLED())
+ PyDTrace_IMPORT_FIND_LOAD_DONE(PyUnicode_AsUTF8(abs_name),
+ mod != NULL);
+
+ if (import_time) {
+ _PyTime_t cum = _PyTime_GetPerfCounter() - t1;
+
+ import_level--;
+ fprintf(stderr, "import time: %9ld | %10ld | %*s%s\n",
+ (long)_PyTime_AsMicroseconds(cum - accumulated, _PyTime_ROUND_CEILING),
+ (long)_PyTime_AsMicroseconds(cum, _PyTime_ROUND_CEILING),
+ import_level*2, "", PyUnicode_AsUTF8(abs_name));
+
+ accumulated = accumulated_copy + cum;
+ }
+
+ return mod;
+}
+
PyObject *
PyImport_ImportModuleLevelObject(PyObject *name, PyObject *globals,
PyObject *locals, PyObject *fromlist,
int level)
{
- _Py_IDENTIFIER(_find_and_load);
_Py_IDENTIFIER(_handle_fromlist);
PyObject *abs_name = NULL;
PyObject *final_mod = NULL;
@@ -1674,55 +1729,7 @@ PyImport_ImportModuleLevelObject(PyObject *name, PyObject *globals,
}
}
else {
- int import_time = interp->core_config.import_time;
- static int import_level;
- static _PyTime_t accumulated;
-
- _PyTime_t t1 = 0, accumulated_copy = accumulated;
-
- /* XOptions is initialized after first some imports.
- * So we can't have negative cache before completed initialization.
- * Anyway, importlib._find_and_load is much slower than
- * _PyDict_GetItemIdWithError().
- */
- if (import_time) {
- static int header = 1;
- if (header) {
- fputs("import time: self [us] | cumulative | imported package\n",
- stderr);
- header = 0;
- }
-
- import_level++;
- t1 = _PyTime_GetPerfCounter();
- accumulated = 0;
- }
-
- Py_XDECREF(mod);
-
- if (PyDTrace_IMPORT_FIND_LOAD_START_ENABLED())
- PyDTrace_IMPORT_FIND_LOAD_START(PyUnicode_AsUTF8(abs_name));
-
- mod = _PyObject_CallMethodIdObjArgs(interp->importlib,
- &PyId__find_and_load, abs_name,
- interp->import_func, NULL);
-
- if (PyDTrace_IMPORT_FIND_LOAD_DONE_ENABLED())
- PyDTrace_IMPORT_FIND_LOAD_DONE(PyUnicode_AsUTF8(abs_name),
- mod != NULL);
-
- if (import_time) {
- _PyTime_t cum = _PyTime_GetPerfCounter() - t1;
-
- import_level--;
- fprintf(stderr, "import time: %9ld | %10ld | %*s%s\n",
- (long)_PyTime_AsMicroseconds(cum - accumulated, _PyTime_ROUND_CEILING),
- (long)_PyTime_AsMicroseconds(cum, _PyTime_ROUND_CEILING),
- import_level*2, "", PyUnicode_AsUTF8(abs_name));
-
- accumulated = accumulated_copy + cum;
- }
-
+ mod = import_find_and_load(abs_name);
if (mod == NULL) {
goto error;
}
1
0