Something like this in the docstring?: "In order to support the historical JSON specification and closed-ecosystem JSON, it is possible to specify an encoding other than UTF-8." From RFC 8259, section 8.1 ("Character Encoding"):
JSON text exchanged between systems that are not part of a closed ecosystem MUST be encoded using UTF-8 [RFC3629]. Previous specifications of JSON have not required the use of UTF-8 when transmitting JSON text. However, the vast majority of JSON-based software implementations have chosen to use the UTF-8 encoding, to the extent that it is the only encoding that achieves interoperability. Implementations MUST NOT add a byte order mark (U+FEFF) to the beginning of a network-transmitted JSON text. In the interests of interoperability, implementations that parse JSON texts MAY ignore the presence of a byte order mark rather than treating it as an error.
```python import json import os def dumpf(obj, path, *, encoding="UTF-8", **kwargs): with open(os.fspath(path), "w", encoding=encoding) as f: return json.dump(obj, f, **kwargs) def loadf(path, *, encoding="UTF-8", **kwargs): with open(os.fspath(path), "r", encoding=encoding) as f: return json.load(f, **kwargs) import pathlib import unittest class TestJsonLoadfAndDumpf(unittest.TestCase): def setUp(self): self.encodings = [None, "UTF-8", "UTF-16", "UTF-32"] data = dict( obj=dict(a=dict(b=[1, 2, 3])), path=pathlib.Path(".") / "test_loadf_and_dumpf.json", ) if os.path.isfile(data["path"]): os.unlink(data["path"]) self.data = data def test_dumpf_and_loadf(self): data = self.data for encoding in self.encodings: path = f'{data["path"]}.{encoding}.json' dumpf_output = dumpf(data["obj"], path, encoding=encoding) loadf_output = loadf(path, encoding=encoding) assert loadf_output == data["obj"] # $ pip install pytest-cov # $ pytest -v example.py # https://docs.pytest.org/en/stable/parametrize.html # https://docs.pytest.org/en/stable/tmpdir.html import pytest @pytest.mark.parametrize("encoding", [None, "UTF-8", "UTF-16", "UTF-32"]) @pytest.mark.parametrize("obj", [dict(a=dict(b=[1, 2, 3]))]) def test_dumpf_and_loadf(obj, encoding, tmpdir): pth = pathlib.Path(tmpdir) / f"test_loadf_and_dumpf.{encoding}.json" dumpf_output = dumpf(obj, pth, encoding=encoding) loadf_output = loadf(pth, encoding=encoding) assert loadf_output == obj ``` For whoever creates a PR for this: - [ ] add parameter and return type annotations - [ ] copy docstrings from json.load/json.dump and open#encoding - [ ] correctly support the c module implementation (this just does `import json`)? - [ ] keep or drop the encoding tests? On Thu, Sep 17, 2020 at 1:25 AM Christopher Barker <pythonchb@gmail.com> wrote:
Is that suggested code? I don't follow.
But if it is, no. personally, I think ANY use of system settings is a bad idea [*]. But certainly no need to even think about it for JSON.
-CHB
* have we not learned that in the age of the internet the machine the code happens to be running on has nothing to do with the needs of the application's users? Timezones, encodings, number formats, NOTHING.
On Wed, Sep 16, 2020 at 8:45 PM Wes Turner <wes.turner@gmail.com> wrote:
Is all of this locale/encoding testing necessary (or even sufficient)?
```python import json import locale import os
def get_default_encoding():
    """Return the locale's default encoding if it is a UTF-family encoding.

    Falls back to ``"UTF-8"`` when the locale reports a non-UTF encoding
    or no encoding at all (e.g. under the "C" locale), since RFC 8259
    only permits UTF-8/UTF-16/UTF-32 for JSON text.

    Returns:
        str: an encoding name starting with "UTF-" (case preserved when
        taken from the locale).
    """
    # locale.getdefaultlocale() may return (lang, None); the original
    # called .startswith() on that None and raised AttributeError.
    # NOTE(review): getdefaultlocale() is deprecated since Python 3.11;
    # consider locale.getpreferredencoding(False) in a follow-up.
    default_encoding = locale.getdefaultlocale()[1]
    if default_encoding and default_encoding.upper().startswith("UTF-"):
        return default_encoding
    return "UTF-8"
def dumpf(obj, path, *args, **kwargs):
    """Serialize *obj* as JSON to the file named by *path*.

    An optional ``encoding`` keyword selects the text encoding of the
    output file; all other positional and keyword arguments are
    forwarded to :func:`json.dump`.

    Returns:
        The return value of ``json.dump`` (``None``).
    """
    # Treat an explicit ``encoding=None`` the same as "not given": both
    # fall back to get_default_encoding().  The original only applied
    # the default when the key was absent, so ``encoding=None`` leaked
    # through to open() and used the locale's (possibly non-UTF) default.
    encoding = kwargs.pop("encoding", None) or get_default_encoding()
    with open(os.fspath(path), "w", encoding=encoding) as file_:
        return json.dump(obj, file_, *args, **kwargs)
def loadf(path, *args, **kwargs):
    """Deserialize JSON from the file named by *path* and return it.

    An optional ``encoding`` keyword selects the text encoding used to
    read the file; all other positional and keyword arguments are
    forwarded to :func:`json.load`.

    Returns:
        The deserialized Python object.
    """
    # Same normalization as dumpf(): an explicit ``encoding=None`` must
    # not bypass get_default_encoding() and fall back to open()'s
    # locale-dependent default.
    encoding = kwargs.pop("encoding", None) or get_default_encoding()
    with open(os.fspath(path), "r", encoding=encoding) as file_:
        return json.load(file_, *args, **kwargs)
import pathlib import unittest
class TestJsonLoadfAndDumpf(unittest.TestCase): def setUp(self): self.locales = ["", "C", "en_US.UTF-8", "japanese"] self.encodings = [None, "UTF-8", "UTF-16", "UTF-32"]
data = dict( obj=dict(a=dict(b=[1, 2, 3])), encoding=None, path=pathlib.Path(".") / "test_loadf_and_dumpf.json", ) if os.path.isfile(data["path"]): os.unlink(data["path"]) self.data = data
self.previous_locale = locale.getlocale()
def tearDown(self): locale.setlocale(locale.LC_ALL, self.previous_locale)
def test_get_default_encoding(self): for localestr in self.locales: locale.setlocale(locale.LC_ALL, localestr) output = get_default_encoding() assert output.startswith("UTF-")
def test_dumpf_and_loadf(self): data = self.data for localestr in self.locales: locale.setlocale(locale.LC_ALL, localestr) for encoding in self.encodings: dumpf_output = dumpf( data["obj"], data["path"], encoding=encoding ) loadf_output = loadf(data["path"], encoding=encoding) assert loadf_output == data["obj"] ```
On Wed, Sep 16, 2020 at 8:30 PM Christopher Barker <pythonchb@gmail.com> wrote:
On Wed, Sep 16, 2020 at 2:53 PM Wes Turner <wes.turner@gmail.com> wrote:
So I was not correct: dump does not default to UTF-8 (and does not accept an encoding= parameter)
I think dumpf() should use UTF-8, and that's it. If anyone really wants something else, they can get it by providing an open text file object.
Why would we impose UTF-8 when the spec says UTF-8, UTF-16, or UTF-32?
The idea was that the encoding was one of the motivators to doing this in the first place. But I suppose as long as utf-8 is the default, and only the three "official" ones are allowed, then yeah, we could add an encoding keyword argument.
-CHB
-- Christopher Barker, PhD
Python Language Consulting - Teaching - Scientific Software Development - Desktop GUI and Web Development - wxPython, numpy, scipy, Cython
-- Christopher Barker, PhD
Python Language Consulting - Teaching - Scientific Software Development - Desktop GUI and Web Development - wxPython, numpy, scipy, Cython