Hi,
Basically, HTML comments are currently handled like ordinary elements in lxml.html.diff's flatten_el, which leads to interesting results:
>>> from lxml.html.diff import htmldiff
>>>
>>> a = '<p>test</p>'
>>> b = '<p>test<!-- Hey --></p>'
>>>
>>> print(htmldiff(a,b))
<p>test <ins><<cyfunction comment at>>Hey <cyfunction comment at>></cyfunction></cyfunction></ins> </p>
I added a check that excludes comments from the generated token list (any tail text following a comment is still kept), which effectively strips all comments from the diff:
diff --git a/src/lxml/html/diff.py b/src/lxml/html/diff.py
index 5d143bd2..9d4a4f72 100644
--- a/src/lxml/html/diff.py
+++ b/src/lxml/html/diff.py
@@ -4,7 +4,7 @@ from __future__ import absolute_import
import difflib
from lxml import etree
-from lxml.html import fragment_fromstring
+from lxml.html import fragment_fromstring, HtmlComment
import re
__all__ = ['html_annotate', 'htmldiff']
@@ -688,6 +688,14 @@ def flatten_el(el, include_hrefs, skip_tag=False):
If skip_tag is true, then the outermost container tag is
not returned (just its contents)."""
+
+ if isinstance(el, HtmlComment):
+ if el.tail:
+ end_words = split_words(el.tail)
+ for word in end_words:
+ yield html_escape(word)
+ return
+
if not skip_tag:
if el.tag == 'img':
yield ('img', el.get('src'), start_tag(el))
jens