[Python-checkins] cpython: Issue #2175: SAX parsers now support a character stream of InputSource object.

serhiy.storchaka python-checkins at python.org
Thu Apr 2 20:00:59 CEST 2015


https://hg.python.org/cpython/rev/407883c52bf3
changeset:   95401:407883c52bf3
user:        Serhiy Storchaka <storchaka at gmail.com>
date:        Thu Apr 02 21:00:13 2015 +0300
summary:
  Issue #2175: SAX parsers now support a character stream of InputSource object.

files:
  Doc/library/xml.sax.reader.rst |  12 ++++----
  Doc/whatsnew/3.5.rst           |   7 ++++
  Lib/test/test_sax.py           |  33 ++++++++++++++++++++++
  Lib/xml/sax/expatreader.py     |  11 +++++--
  Lib/xml/sax/saxutils.py        |   7 +++-
  Lib/xml/sax/xmlreader.py       |   4 ++-
  Misc/NEWS                      |   2 +
  7 files changed, 64 insertions(+), 12 deletions(-)


diff --git a/Doc/library/xml.sax.reader.rst b/Doc/library/xml.sax.reader.rst
--- a/Doc/library/xml.sax.reader.rst
+++ b/Doc/library/xml.sax.reader.rst
@@ -100,8 +100,10 @@
    system identifier (a string identifying the input source -- typically a file
    name or an URL), a file-like object, or an :class:`InputSource` object. When
    :meth:`parse` returns, the input is completely processed, and the parser object
-   can be discarded or reset. As a limitation, the current implementation only
-   accepts byte streams; processing of character streams is for further study.
+   can be discarded or reset.
+
+   .. versionchanged:: 3.5
+      Added support for character streams.
 
 
 .. method:: XMLReader.getContentHandler()
@@ -288,8 +290,7 @@
 
 .. method:: InputSource.setByteStream(bytefile)
 
-   Set the byte stream (a Python file-like object which does not perform
-   byte-to-character conversion) for this input source.
+   Set the byte stream (a :term:`binary file`) for this input source.
 
    The SAX parser will ignore this if there is also a character stream specified,
    but it will use a byte stream in preference to opening a URI connection itself.
@@ -308,8 +309,7 @@
 
 .. method:: InputSource.setCharacterStream(charfile)
 
-   Set the character stream for this input source. (The stream must be a Python 1.6
-   Unicode-wrapped file-like that performs conversion to strings.)
+   Set the character stream (a :term:`text file`) for this input source.
 
    If there is a character stream specified, the SAX parser will ignore any byte
    stream and will not attempt to open a URI connection to the system identifier.
diff --git a/Doc/whatsnew/3.5.rst b/Doc/whatsnew/3.5.rst
--- a/Doc/whatsnew/3.5.rst
+++ b/Doc/whatsnew/3.5.rst
@@ -499,6 +499,13 @@
 * :class:`xmlrpc.client.ServerProxy` is now a :term:`context manager`.
   (Contributed by Claudiu Popa in :issue:`20627`.)
 
+xml.sax
+-------
+
+* SAX parsers now support the character stream of an
+  :class:`~xml.sax.xmlreader.InputSource` object.
+  (Contributed by Serhiy Storchaka in :issue:`2175`.)
+
 faulthandler
 ------------
 
diff --git a/Lib/test/test_sax.py b/Lib/test/test_sax.py
--- a/Lib/test/test_sax.py
+++ b/Lib/test/test_sax.py
@@ -185,12 +185,24 @@
     def make_byte_stream(self):
         return BytesIO(b"This is a byte stream.")
 
+    def make_character_stream(self):
+        return StringIO("This is a character stream.")
+
     def checkContent(self, stream, content):
         self.assertIsNotNone(stream)
         self.assertEqual(stream.read(), content)
         stream.close()
 
 
+    def test_character_stream(self):
+        # If the source is an InputSource with a character stream, use it.
+        src = InputSource(self.file)
+        src.setCharacterStream(self.make_character_stream())
+        prep = prepare_input_source(src)
+        self.assertIsNone(prep.getByteStream())
+        self.checkContent(prep.getCharacterStream(),
+                          "This is a character stream.")
+
     def test_byte_stream(self):
         # If the source is an InputSource that does not have a character
         # stream but does have a byte stream, use the byte stream.
@@ -225,6 +237,14 @@
         self.checkContent(prep.getByteStream(),
                           b"This is a byte stream.")
 
+    def test_text_file(self):
+        # If the source is a text file-like object, use it as a character
+        # stream.
+        prep = prepare_input_source(self.make_character_stream())
+        self.assertIsNone(prep.getByteStream())
+        self.checkContent(prep.getCharacterStream(),
+                          "This is a character stream.")
+
 
 # ===== XMLGenerator
 
@@ -904,6 +924,19 @@
 
         self.assertEqual(result.getvalue(), xml_test_out)
 
+    def test_expat_inpsource_character_stream(self):
+        parser = create_parser()
+        result = BytesIO()
+        xmlgen = XMLGenerator(result)
+
+        parser.setContentHandler(xmlgen)
+        inpsrc = InputSource()
+        with open(TEST_XMLFILE, 'rt', encoding='iso-8859-1') as f:
+            inpsrc.setCharacterStream(f)
+            parser.parse(inpsrc)
+
+        self.assertEqual(result.getvalue(), xml_test_out)
+
     # ===== IncrementalParser support
 
     def test_expat_incremental(self):
diff --git a/Lib/xml/sax/expatreader.py b/Lib/xml/sax/expatreader.py
--- a/Lib/xml/sax/expatreader.py
+++ b/Lib/xml/sax/expatreader.py
@@ -219,9 +219,14 @@
         self._parsing = 0
         # break cycle created by expat handlers pointing to our methods
         self._parser = None
-        bs = self._source.getByteStream()
-        if bs is not None:
-            bs.close()
+        try:
+            file = self._source.getCharacterStream()
+            if file is not None:
+                file.close()
+        finally:
+            file = self._source.getByteStream()
+            if file is not None:
+                file.close()
 
     def _reset_cont_handler(self):
         self._parser.ProcessingInstructionHandler = \
diff --git a/Lib/xml/sax/saxutils.py b/Lib/xml/sax/saxutils.py
--- a/Lib/xml/sax/saxutils.py
+++ b/Lib/xml/sax/saxutils.py
@@ -345,11 +345,14 @@
     elif hasattr(source, "read"):
         f = source
         source = xmlreader.InputSource()
-        source.setByteStream(f)
+        if isinstance(f.read(0), str):
+            source.setCharacterStream(f)
+        else:
+            source.setByteStream(f)
         if hasattr(f, "name") and isinstance(f.name, str):
             source.setSystemId(f.name)
 
-    if source.getByteStream() is None:
+    if source.getCharacterStream() is None and source.getByteStream() is None:
         sysid = source.getSystemId()
         basehead = os.path.dirname(os.path.normpath(base))
         sysidfilename = os.path.join(basehead, sysid)
diff --git a/Lib/xml/sax/xmlreader.py b/Lib/xml/sax/xmlreader.py
--- a/Lib/xml/sax/xmlreader.py
+++ b/Lib/xml/sax/xmlreader.py
@@ -117,7 +117,9 @@
         source = saxutils.prepare_input_source(source)
 
         self.prepareParser(source)
-        file = source.getByteStream()
+        file = source.getCharacterStream()
+        if file is None:
+            file = source.getByteStream()
         buffer = file.read(self._bufsize)
         while buffer:
             self.feed(buffer)
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -16,6 +16,8 @@
 Library
 -------
 
+- Issue #2175: SAX parsers now support an InputSource's character stream.
+
 - Issue #16840: Tkinter now supports 64-bit integers added in Tcl 8.4 and
   arbitrary precision integers added in Tcl 8.5.
 

-- 
Repository URL: https://hg.python.org/cpython


More information about the Python-checkins mailing list