Hi Martijn and others,
please find attached a patch to lxml that improves I/O support. The
patch is not yet ready for inclusion but I would like to get some
feedback on this to see whether something similar might be included at a
later time. The attached patch implements
* Support for output format options. At the moment, the following
libxml2 options are supported: XML_SAVE_FORMAT, XML_SAVE_NO_DECL,
XML_SAVE_NO_EMPTY and XML_SAVE_NO_XHTML. These options are mapped to
friendly keyword arguments of the _ElementTree.write() function. This
functionality is in line with what was discussed recently on this list.
This functionality requires libxml2 2.6.22 with the patch attached below
that implements a documented but previously unimplemented function. I'm
submitting this patch to the libxml people.
* Support for string input/output. I've added the functions parse_str() and
write_str() to _ElementTree. This functionality cannot currently be
achieved efficiently. The XML() function for example returns an element,
not a tree. The only alternative is to use intermediate StringIOs.
* Refactor ElementTree.parse(): Any file-like object passed in now will
be read completely into memory before it is parsed: I've removed the
short-cut where files were accessed through their .name back door. This was a
bit of a hack as on Unix a file will not always have a name because you
can delete it while it is still open. The proper way to support
file-like objects would be to use the libxml context API where libxml2
would call the Python .read() members itself, through a provided adapter
function. Note that this is not in the patch at the moment, but if you
want I can have a go at it.
* Refactor ElementTree.parse(): removed the StringIO back door where its
value was accessed through .getvalue(). This was wrong IMHO, as a
StringIO is a true file-like object so lxml should use its .read()
method which takes into account things like file offset. Admittedly, in 99%
of the cases you want to read the entire StringIO, but I can definitely
think of situations where this is not the case, and this would lead to
hard-to-debug problems.
* Refactor ElementTree.write(): the encoding argument is by default
'utf8', not 'us-ascii'. I would like to start the discussion on this.
The de-facto (and sometimes even formal) default encoding for almost
anything these days is UTF-8, so I don't see why ElementTree
should have 'us-ascii' (apart from compatibility, but see my other note
on that). Also, as libxml2 does everything in UTF-8 internally, that
seems a much better default encoding.
The attached patch breaks 7 tests at the moment, but as far as I could
see they are all due to the changed default encoding and output format
options.
Let me know what you think.
Regards,
Geert
Index: src/lxml/etree.pyx
===================================================================
--- src/lxml/etree.pyx (revision 19574)
+++ src/lxml/etree.pyx (working copy)
@@ -154,74 +154,159 @@
c_ns = tree.xmlNewNs(c_node, href, prefix)
return c_ns
+cdef object dumpOutputBuffer(tree.xmlOutputBuffer* buf, encoding):
+ """Dump an XML output buffer to a Python string."""
+ cdef int size
+ cdef char* data
+
+ if buf.conv != NULL:
+ size = buf.conv.use
+ data = buf.conv.content
+ else:
+ size = buf.buffer.use
+ data = buf.buffer.content
+ if encoding is None:
+ s = fnunicode(data, size)
+ else:
+ s = tree.PyString_FromStringAndSize(data, size)
+ return s
+
+def getOutputOptions(format=False, declaration=True, empty=True, xhtml=True):
+ options = 0
+ if format is True:
+ options = options | tree.XML_SAVE_FORMAT
+ if declaration is False:
+ options = options | tree.XML_SAVE_NO_DECL
+ if empty is False:
+ options = options | tree.XML_SAVE_NO_EMPTY
+ if xhtml is False:
+ options = options | tree.XML_SAVE_NO_XHTML
+ return options
+
cdef class _ElementTree(_DocumentBase):
def parse(self, source, parser=None):
"""Updates self with the content of source and returns its root
"""
- # XXX ignore parser for now
cdef xmlDoc* c_doc
cdef xmlNode* c_node
- # XXX simplistic (c)StringIO support
- if hasattr(source, 'getvalue'):
- c_doc = theParser.parseDoc(source.getvalue())
+ if isinstance(source, basestring):
+ c_doc = theParser.parseDocFromFile(source)
+ elif hasattr(source, 'read'):
+ data = source.read()
+ c_doc = theParser.parseDoc(data)
else:
- filename = _getFilenameForFile(source)
- # Support for unamed file-like object (eg urlgrabber.urlopen)
- if not filename and hasattr(source, 'read'):
- c_doc = theParser.parseDoc(source.read())
- # Otherwise parse the file directly from the filesystem
- else:
- if filename is None:
- filename = source
- # open filename
- c_doc = theParser.parseDocFromFile(filename)
+ raise TypeError, 'expecting file name or file like object'
+
if self._c_doc is not NULL:
c_node = tree.xmlDocGetRootElement(self._c_doc)
- if (c_node is not NULL and
- not hasProxy(c_node) and
- canDeallocateChildren(c_node)):
+ if c_node is not NULL and not hasProxy(c_node) and \
+ canDeallocateChildren(c_node):
tree.xmlFreeDoc(self._c_doc)
self._c_doc = c_doc
return self.getroot()
-
+
+ def parse_str(self, source, parser=None):
+ """Updates self with the content of source and returns its root
+ """
+ cdef xmlDoc* c_doc
+ cdef xmlNode* c_node
+
+ if isinstance(source, unicode):
+ source = source.encode('UTF-8')
+
+ c_doc = theParser.parseDoc(source)
+
+ if self._c_doc is not NULL:
+ c_node = tree.xmlDocGetRootElement(self._c_doc)
+ if c_node is not NULL and not hasProxy(c_node) and \
+ canDeallocateChildren(c_node):
+ tree.xmlFreeDoc(self._c_doc)
+ self._c_doc = c_doc
+ return self.getroot()
+
def getroot(self):
cdef xmlNode* c_node
c_node = tree.xmlDocGetRootElement(self._c_doc)
if c_node is NULL:
return None
return _elementFactory(self, c_node)
-
- def write(self, file, encoding='us-ascii'):
- cdef tree.xmlSaveCtxt* save_ctxt
- cdef char* mem
- cdef int size
-
- # recognize a diversity of ways to spell this in Python
+
+ # Default values for keyword arguments as in libxml2.
+ def write(self, file, encoding='utf8', format=False,
+ declaration=True, empty=True, xhtml=True):
+ cdef int options
+ cdef tree.xmlSaveCtxt* ctxt
+ cdef tree.xmlOutputBuffer* buffer
+ cdef tree.xmlCharEncodingHandler* encoder
+
+ # Encoding must be set, as we may be directly so a file which
+ # requires an encoding.
+ if not isinstance(encoding, basestring):
+ raise TypeError, 'encoding must be a string'
+ if not encoding:
+ raise ValueError, 'an encoding must be specified'
+
if encoding in ('UTF-8', 'utf8', 'UTF8', 'utf-8'):
- encoding = 'UTF-8'
+ xml_encoding = 'UTF-8'
+ else:
+ xml_encoding = encoding
+ encoder = tree.xmlFindCharEncodingHandler(encoding)
+ if encoder == NULL:
+ raise ValueError, 'unkown encoding: %s' % xml_encoding
- if not hasattr(file, 'write'):
- # file is a filename, we want a file object
- file = open(file, 'wb')
+ options = getOutputOptions(format, declaration, empty, xhtml)
- tree.xmlDocDumpMemoryEnc(self._c_doc, &mem, &size, encoding)
- m = mem
- # XXX this is purely for ElementTree compatibility..
- if encoding == 'UTF-8' or encoding == 'us-ascii':
- # strip off XML prologue..
- i = m.find('\n')
- if i != -1:
- m = m[i + 1:]
- # strip off ending \n..
- m = m[:-1]
- if encoding == 'UTF-8':
- file.write(m)
+ if isinstance(file, basestring):
+ ctxt = tree.xmlSaveToFilename(file, encoding, options)
+ tree.xmlSaveDoc(ctxt, self._c_doc)
+ tree.xmlSaveFlush(ctxt)
+ tree.xmlSaveClose(ctxt)
+ elif hasattr(file, 'write'):
+ buffer = tree.xmlAllocOutputBuffer(encoder)
+ ctxt = tree.xmlSaveToBuffer(buffer, encoding, options)
+ tree.xmlSaveDoc(ctxt, self._c_doc)
+ tree.xmlSaveFlush(ctxt)
+ s = dumpOutputBuffer(buffer, encoding)
+ file.write(s)
+ tree.xmlSaveClose(ctxt)
else:
- file.write(funicode(m).encode(encoding))
- tree.xmlFree(mem)
-
+ raise TypeError, 'expecting file name or file like object'
+
+ def write_str(self, encoding=None, format=False, declaration=True,
+ empty=True, xhtml=True):
+ cdef int options
+ cdef tree.xmlSaveCtxt* ctxt
+ cdef tree.xmlOutputBuffer* buffer
+ cdef tree.xmlCharEncodingHandler* encoder
+
+ # Encoding can be None. In this case, a plain or unicode string is
+ # returned depending on whether the string has non-ascii chars.
+ if encoding is not None and not isinstance(encoding, basestring):
+ raise TypeError, 'encoding must be None or a string'
+
+ if encoding is None:
+ xml_encoding = 'UTF-8'
+ elif encoding in ('UTF-8', 'utf8', 'UTF8', 'utf-8'):
+ xml_encoding = 'UTF-8'
+ else:
+ xml_encoding = encoding
+
+ encoder = tree.xmlFindCharEncodingHandler(xml_encoding)
+ if encoder == NULL:
+ raise ValueError, 'unkown encoding: %s' % xml_encoding
+
+ options = getOutputOptions(format, declaration, empty, xhtml)
+
+ buffer = tree.xmlAllocOutputBuffer(encoder)
+ ctxt = tree.xmlSaveToBuffer(buffer, xml_encoding, options)
+ tree.xmlSaveDoc(ctxt, self._c_doc)
+ tree.xmlSaveFlush(ctxt)
+ s = dumpOutputBuffer(buffer, encoding)
+ tree.xmlSaveClose(ctxt)
+ return s
+
def getiterator(self, tag=None):
root = self.getroot()
if root is None:
@@ -1110,6 +1195,14 @@
tree.parse(source, parser=parser)
return tree
+def parse_str(source, parser=None):
+ """Return an ElementTree object loaded with source elements
+ """
+ cdef _ElementTree tree
+ tree = _elementTreeFactory(NULL)
+ tree.parse_str(source, parser=parser)
+ return tree
+
cdef _addNamespaces(xmlDoc* c_doc, xmlNode* c_node, object nsmap):
cdef xmlNs* c_ns
if nsmap is None:
@@ -1840,6 +1933,11 @@
return tree.PyUnicode_DecodeUTF8(s, tree.strlen(s), "strict")
return tree.PyString_FromStringAndSize(s, tree.strlen(s))
+cdef object fnunicode(char* s, int len):
+ if isutf8(s):
+ return tree.PyUnicode_DecodeUTF8(s, len, "strict")
+ return tree.PyString_FromStringAndSize(s, len)
+
cdef object _namespacedName(xmlNode* c_node):
if c_node.ns is NULL or c_node.ns.href is NULL:
return funicode(c_node.name)
Index: src/lxml/tree.pxd
===================================================================
--- src/lxml/tree.pxd (revision 19574)
+++ src/lxml/tree.pxd (working copy)
@@ -102,7 +102,10 @@
xmlNode* prev
xmlDoc* doc
- ctypedef struct xmlBuffer
+ ctypedef struct xmlBuffer:
+ char* content
+ unsigned int use
+ unsigned int size
ctypedef struct xmlOutputBuffer:
xmlBuffer* buffer
@@ -167,12 +170,23 @@
cdef int xmlOutputBufferClose(xmlOutputBuffer* out)
cdef extern from "libxml/xmlsave.h":
+
+ ctypedef enum xmlSaveOption:
+ XML_SAVE_FORMAT = 1
+ XML_SAVE_NO_DECL = 2
+ XML_SAVE_NO_EMPTY = 4
+ XML_SAVE_NO_XHTML = 8
+
ctypedef struct xmlSaveCtxt:
pass
-
+
cdef xmlSaveCtxt* xmlSaveToFilename(char* filename, char* encoding,
int options)
+ cdef xmlSaveCtxt* xmlSaveToBuffer(xmlOutputBuffer* buffer, char* encoding,
+ int options)
+
cdef long xmlSaveDoc(xmlSaveCtxt* ctxt, xmlDoc* doc)
+ cdef int xmlSaveFlush(xmlSaveCtxt* ctxt)
cdef int xmlSaveClose(xmlSaveCtxt* ctxt)
cdef extern from "libxml/xmlstring.h":
diff -ur libxml2-2.6.22.orig/xmlsave.c libxml2-2.6.22/xmlsave.c
--- libxml2-2.6.22.orig/xmlsave.c 2005-09-12 17:35:07.000000000 +0200
+++ libxml2-2.6.22/xmlsave.c 2005-11-06 14:59:49.000000000 +0100
@@ -1473,13 +1473,18 @@
* with the encoding and the options given
*
* Returns a new serialization context or NULL in case of error.
+ */
+
xmlSaveCtxtPtr
xmlSaveToBuffer(xmlBufferPtr buffer, const char *encoding, int options)
{
- TODO
- return(NULL);
+ xmlSaveCtxtPtr ret;
+
+ ret = xmlNewSaveCtxt(encoding, options);
+ if (ret == NULL) return(NULL);
+ ret->buf = buffer;
+ return(ret);
}
- */
/**
* xmlSaveToIO: