[Python-checkins] cpython: Issue #2175: SAX parsers now support a character stream of InputSource object.
serhiy.storchaka
python-checkins at python.org
Thu Apr 2 20:00:59 CEST 2015
https://hg.python.org/cpython/rev/407883c52bf3
changeset: 95401:407883c52bf3
user: Serhiy Storchaka <storchaka at gmail.com>
date: Thu Apr 02 21:00:13 2015 +0300
summary:
Issue #2175: SAX parsers now support a character stream of InputSource object.
files:
Doc/library/xml.sax.reader.rst | 12 ++++----
Doc/whatsnew/3.5.rst | 7 ++++
Lib/test/test_sax.py | 33 ++++++++++++++++++++++
Lib/xml/sax/expatreader.py | 11 +++++--
Lib/xml/sax/saxutils.py | 7 +++-
Lib/xml/sax/xmlreader.py | 4 ++-
Misc/NEWS | 2 +
7 files changed, 64 insertions(+), 12 deletions(-)
diff --git a/Doc/library/xml.sax.reader.rst b/Doc/library/xml.sax.reader.rst
--- a/Doc/library/xml.sax.reader.rst
+++ b/Doc/library/xml.sax.reader.rst
@@ -100,8 +100,10 @@
system identifier (a string identifying the input source -- typically a file
name or an URL), a file-like object, or an :class:`InputSource` object. When
:meth:`parse` returns, the input is completely processed, and the parser object
- can be discarded or reset. As a limitation, the current implementation only
- accepts byte streams; processing of character streams is for further study.
+ can be discarded or reset.
+
+ .. versionchanged:: 3.5
+ Added support for character streams.
.. method:: XMLReader.getContentHandler()
@@ -288,8 +290,7 @@
.. method:: InputSource.setByteStream(bytefile)
- Set the byte stream (a Python file-like object which does not perform
- byte-to-character conversion) for this input source.
+ Set the byte stream (a :term:`binary file`) for this input source.
The SAX parser will ignore this if there is also a character stream specified,
but it will use a byte stream in preference to opening a URI connection itself.
@@ -308,8 +309,7 @@
.. method:: InputSource.setCharacterStream(charfile)
- Set the character stream for this input source. (The stream must be a Python 1.6
- Unicode-wrapped file-like that performs conversion to strings.)
+ Set the character stream (a :term:`text file`) for this input source.
If there is a character stream specified, the SAX parser will ignore any byte
stream and will not attempt to open a URI connection to the system identifier.
diff --git a/Doc/whatsnew/3.5.rst b/Doc/whatsnew/3.5.rst
--- a/Doc/whatsnew/3.5.rst
+++ b/Doc/whatsnew/3.5.rst
@@ -499,6 +499,13 @@
* :class:`xmlrpc.client.ServerProxy` is now a :term:`context manager`.
(Contributed by Claudiu Popa in :issue:`20627`.)
+xml.sax
+-------
+
+* SAX parsers now support the character stream of an
+ :class:`~xml.sax.xmlreader.InputSource` object.
+ (Contributed by Serhiy Storchaka in :issue:`2175`.)
+
faulthandler
------------
diff --git a/Lib/test/test_sax.py b/Lib/test/test_sax.py
--- a/Lib/test/test_sax.py
+++ b/Lib/test/test_sax.py
@@ -185,12 +185,24 @@
def make_byte_stream(self):
return BytesIO(b"This is a byte stream.")
+ def make_character_stream(self):
+ return StringIO("This is a character stream.")
+
def checkContent(self, stream, content):
self.assertIsNotNone(stream)
self.assertEqual(stream.read(), content)
stream.close()
+ def test_character_stream(self):
+ # If the source is an InputSource with a character stream, use it.
+ src = InputSource(self.file)
+ src.setCharacterStream(self.make_character_stream())
+ prep = prepare_input_source(src)
+ self.assertIsNone(prep.getByteStream())
+ self.checkContent(prep.getCharacterStream(),
+ "This is a character stream.")
+
def test_byte_stream(self):
# If the source is an InputSource that does not have a character
# stream but does have a byte stream, use the byte stream.
@@ -225,6 +237,14 @@
self.checkContent(prep.getByteStream(),
b"This is a byte stream.")
+ def test_text_file(self):
+ # If the source is a text file-like object, use it as a character
+ # stream.
+ prep = prepare_input_source(self.make_character_stream())
+ self.assertIsNone(prep.getByteStream())
+ self.checkContent(prep.getCharacterStream(),
+ "This is a character stream.")
+
# ===== XMLGenerator
@@ -904,6 +924,19 @@
self.assertEqual(result.getvalue(), xml_test_out)
+ def test_expat_inpsource_character_stream(self):
+ parser = create_parser()
+ result = BytesIO()
+ xmlgen = XMLGenerator(result)
+
+ parser.setContentHandler(xmlgen)
+ inpsrc = InputSource()
+ with open(TEST_XMLFILE, 'rt', encoding='iso-8859-1') as f:
+ inpsrc.setCharacterStream(f)
+ parser.parse(inpsrc)
+
+ self.assertEqual(result.getvalue(), xml_test_out)
+
# ===== IncrementalParser support
def test_expat_incremental(self):
diff --git a/Lib/xml/sax/expatreader.py b/Lib/xml/sax/expatreader.py
--- a/Lib/xml/sax/expatreader.py
+++ b/Lib/xml/sax/expatreader.py
@@ -219,9 +219,14 @@
self._parsing = 0
# break cycle created by expat handlers pointing to our methods
self._parser = None
- bs = self._source.getByteStream()
- if bs is not None:
- bs.close()
+ try:
+ file = self._source.getCharacterStream()
+ if file is not None:
+ file.close()
+ finally:
+ file = self._source.getByteStream()
+ if file is not None:
+ file.close()
def _reset_cont_handler(self):
self._parser.ProcessingInstructionHandler = \
diff --git a/Lib/xml/sax/saxutils.py b/Lib/xml/sax/saxutils.py
--- a/Lib/xml/sax/saxutils.py
+++ b/Lib/xml/sax/saxutils.py
@@ -345,11 +345,14 @@
elif hasattr(source, "read"):
f = source
source = xmlreader.InputSource()
- source.setByteStream(f)
+ if isinstance(f.read(0), str):
+ source.setCharacterStream(f)
+ else:
+ source.setByteStream(f)
if hasattr(f, "name") and isinstance(f.name, str):
source.setSystemId(f.name)
- if source.getByteStream() is None:
+ if source.getCharacterStream() is None and source.getByteStream() is None:
sysid = source.getSystemId()
basehead = os.path.dirname(os.path.normpath(base))
sysidfilename = os.path.join(basehead, sysid)
diff --git a/Lib/xml/sax/xmlreader.py b/Lib/xml/sax/xmlreader.py
--- a/Lib/xml/sax/xmlreader.py
+++ b/Lib/xml/sax/xmlreader.py
@@ -117,7 +117,9 @@
source = saxutils.prepare_input_source(source)
self.prepareParser(source)
- file = source.getByteStream()
+ file = source.getCharacterStream()
+ if file is None:
+ file = source.getByteStream()
buffer = file.read(self._bufsize)
while buffer:
self.feed(buffer)
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -16,6 +16,8 @@
Library
-------
+- Issue #2175: SAX parsers now support the character stream of an InputSource object.
+
- Issue #16840: Tkinter now supports 64-bit integers added in Tcl 8.4 and
arbitrary precision integers added in Tcl 8.5.
--
Repository URL: https://hg.python.org/cpython
More information about the Python-checkins
mailing list