import lxml.html.clean
# Sample HTML document whose <head> holds a single <meta> tag; it is the
# input handed to clean_html() by the print call at the end of this script.
html = """<html>
<head>
<meta name="keywords" content="test">
</head>
</html>"""
def clean_html(html):
    """Strip markup that is not needed for processing from *html*.

    The argument is converted with ``unicode()`` and run through an
    ``lxml.html.clean.Cleaner`` configured with the options below; the
    value produced by ``Cleaner.clean_html`` is returned unchanged.
    """
    # Cleaner options that are switched on.
    enabled = ("scripts", "javascript", "comments", "style", "links",
               "processing_instructions", "embedded", "annoying_tags")
    # Cleaner options that are switched off.
    disabled = ("meta", "page_structure", "frames", "forms")

    options = dict.fromkeys(enabled, True)
    options.update(dict.fromkeys(disabled, False))
    options["kill_tags"] = ["map", "base", "iframe", "select", "noscript"]
    # NOTE(review): presumably intended to keep <meta> elements -- confirm
    # against the lxml Cleaner docs that whitelist_tags has that effect.
    options["whitelist_tags"] = ["meta"]

    sanitizer = lxml.html.clean.Cleaner(**options)
    return sanitizer.clean_html(unicode(html))
# Clean the sample document and write the result to stdout. The
# parenthesised single-argument form prints the same on Python 2 and is
# also valid syntax on Python 3.
print(clean_html(html))