Undocumented unescape() method in HTMLParser?
Tobiah
toby at tobiah.org
Fri May 25 11:49:17 EDT 2018
I came across its usage in StackOverflow somewhere, but didn't see
it in the docs. I'm using 2.7.
I needed it while writing a class for generating text documents out of
HTML documents for attaching to emails, which lowers spam scores. I lifted
the basis for this from the top answer here: https://tinyurl.com/yb92x8ra
While not complete, I thought it might be of interest. Improvements
welcome:
#####################################################
from HTMLParser import HTMLParser
def main():
    """Run a small demonstration of TextExtractor and print the result.

    The sample document contains text inside ``head``, ``script`` and
    ``style`` elements (which TextExtractor drops), an unknown ``scrip``
    element (whose text is kept) and an escaped ``&lt;...&gt;`` sequence
    that exercises the entity handling.
    """
    parser = TextExtractor()
    # The angle brackets around "And this" must be written as entities:
    # spelled literally, the parser would treat "<And this>" as a start tag
    # and print nothing for it.
    html = '''
<html><head>head</head><body>
<p>"Hi there!"</p>
<script> some javascript </script>
<style> class{style}</style>
<scrip>Print this</scrip>
<b>&lt;And this&gt;</b>
</body>
</html>
'''
    # With a single argument the parenthesised form prints the same text
    # whether ``print`` is a statement (2.7) or a function.
    print(parser.strip_tags(html))
class TextExtractor(HTMLParser):
    """Collect the text content of an HTML document, dropping the markup.

    Text found inside any element listed in ``silent_tags`` is discarded;
    everything else is accumulated in ``fed``.  Character and entity
    references are decoded with the parser's ``unescape()`` method.

    Typical use::

        text = TextExtractor().strip_tags(html)
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # Name of the outermost silent element currently open, or None
        # while text is being collected.
        self.silent_tag = None
        # Text fragments collected so far, in document order.
        self.fed = []
        # Elements whose content must not appear in the output.
        self.silent_tags = ['head', 'script', 'style']

    def handle_starttag(self, tag, atts):
        """Start suppressing output when a silent element opens."""
        # Only the outermost silent element is remembered.  Otherwise a
        # nested one (e.g. <style> inside <head>) would replace it and its
        # closing tag would switch output back on too early.
        if self.silent_tag is None and tag in self.silent_tags:
            self.silent_tag = tag

    def handle_endtag(self, tag):
        """Resume output when the remembered silent element closes."""
        if tag == self.silent_tag:
            self.silent_tag = None

    def handle_data(self, d):
        """Keep a run of text unless it lies inside a silent element."""
        if self.silent_tag is None:
            self.fed.append(d)

    def handle_entityref(self, name):
        """Keep the decoded form of a named reference such as ``&lt;``."""
        # Same silent-element rule as handle_data, so that entities inside
        # e.g. <head> do not leak into the output.
        if self.silent_tag is None:
            self.fed.append(self.unescape("&%s;" % name))

    def handle_charref(self, name):
        """Keep the decoded form of a numeric reference such as ``&#34;``."""
        if self.silent_tag is None:
            self.fed.append(self.unescape("&#%s;" % name))

    def get_data(self):
        """Return all text collected so far as a single string."""
        return ''.join(self.fed)

    def strip_tags(self, html):
        """Parse *html* and return its text content.

        The parser is returned to a clean state afterwards, so the same
        instance can be used for several documents in a row.
        """
        try:
            self.feed(html)
            # close() makes the parser process whatever it is still
            # buffering, e.g. trailing text that ends in an unfinished
            # "&..." sequence.
            self.close()
            return self.get_data()
        finally:
            # Clear everything, even after a parse error or an unclosed
            # silent element, so the next document starts fresh.
            self.fed = []
            self.silent_tag = None
            self.reset()
# Run the demonstration when this script is loaded.
main()
#####################################################
Output:
"Hi there!"
Print this
<And this>
Toby
More information about the Python-list
mailing list