intolerant HTML parser
Phlip
phlip2005 at gmail.com
Mon Feb 8 13:16:22 EST 2010
and the tweak is:
parser = etree.HTMLParser(recover=False)
return etree.HTML(xml, parser)
That reduces tolerance: with recover=False, lxml's HTML parser no longer tries
to parse through broken markup. The entire assert_xml() is (apologies for
wrapping lines!):
def _xml_to_tree(self, xml):
    """Parse ``xml`` into an lxml tree and remember its source in ``self._xml``.

    Input whose first 200 characters contain ``'<html'`` is parsed as HTML
    with a non-recovering parser (``recover=False``); anything else is
    parsed with ``etree.XML``.

    If parsing raises ``ValueError``, ``xml`` itself is returned unchanged
    and ``self._xml`` is set to ``str(xml)``.
    """
    from lxml import etree

    self._xml = xml

    try:
        # NOTE: crude sniffing -- only the first 200 characters are inspected.
        if '<html' in xml[:200]:
            parser = etree.HTMLParser(recover=False)
            return etree.HTML(xml, parser)
        return etree.XML(xml)
    except ValueError:
        # TODO: don't rely on exceptions for normal control flow.
        # NOTE(review): presumably this path is for an already-parsed node
        # being passed in instead of text -- confirm against callers.
        # CONSIDER: does str() reconstitute the nested XML?
        self._xml = str(xml)
        return xml
def assert_xml(self, xml, xpath, **kw):
    """Check that a given extent of XML or HTML contains a given XPath,
    and return its first node.

    ``xml`` is handed to ``self._xml_to_tree()``; the assertion fails with
    a message quoting ``xpath`` and ``self._xml`` when nothing matches.

    Keyword options:
        verbose -- when true, dump the matched node via ``self.reveal_xml()``
                   before returning it (default False).
    """
    tree = self._xml_to_tree(xml)
    nodes = tree.xpath(xpath)
    self.assertTrue(len(nodes) > 0, xpath + ' not found in ' + self._xml)
    node = nodes[0]

    if kw.get('verbose', False):
        self.reveal_xml(node)

    return node
def reveal_xml(self, node):
    """Spews an XML node as source, for diagnosis."""
    from lxml import etree

    # CONSIDER: does pretty_print work? why not?
    source = etree.tostring(node, pretty_print=True)
    if not isinstance(source, str):
        # On Python 3 tostring() returns bytes by default; decode so the
        # line breaks are printed rather than shown as a b'...' repr.
        source = source.decode('utf-8')
    # Call form of print is valid on both Python 2 and Python 3.
    print(source)
def deny_xml(self, xml, xpath):
    """Check that a given extent of XML or HTML does not contain a given XPath.

    ``xml`` is handed to ``self._xml_to_tree()``; the assertion fails with
    a message quoting ``xpath`` and ``self._xml`` when anything matches.
    """
    tree = self._xml_to_tree(xml)
    nodes = tree.xpath(xpath)
    self.assertEqual(
        0, len(nodes), xpath + ' should not appear in ' + self._xml)
More information about the Python-list
mailing list