[XML-SIG] HTML parse error
sharifah ummu kulthum
kulthum91 at gmail.com
Tue Feb 23 04:45:36 CET 2010
On Mon, Feb 22, 2010 at 10:46 PM, Stefan Behnel <stefan_ml at behnel.de> wrote:
> sharifah ummu kulthum, 22.02.2010 14:24:
> > File "grabmy.py", line 63, in get_html
> > return BeautifulSoup(content)
> > File "build/bdist.linux-i686/egg/BeautifulSoup.py", line 1499, in
> __init__
> > File "build/bdist.linux-i686/egg/BeautifulSoup.py", line 1230, in
> __init__
> > File "build/bdist.linux-i686/egg/BeautifulSoup.py", line 1263, in _feed
> > File "/usr/lib/python2.6/HTMLParser.py", line 108, in feed
> > self.goahead(0)
> > File "/usr/lib/python2.6/HTMLParser.py", line 148, in goahead
> > k = self.parse_starttag(i)
> > File "/usr/lib/python2.6/HTMLParser.py", line 226, in parse_starttag
> > endpos = self.check_for_whole_start_tag(i)
> > File "/usr/lib/python2.6/HTMLParser.py", line 301, in
> > check_for_whole_start_tag
> > self.error("malformed start tag")
> > File "/usr/lib/python2.6/HTMLParser.py", line 115, in error
> > raise HTMLParseError(message, self.getpos())
> > HTMLParser.HTMLParseError: malformed start tag, at line 830, column 36
>
> Just noticed this now - you seem to be using BeautifulSoup, likely version
> 3.1. This version does not support parsing broken HTML at all well, so use
> version 3.0.8 instead, or switch to the tools I indicated.
>
> Note that switching tools means that you need to change your code to use
> them. Just installing them is not enough.
>
> Stefan
>
>
I am so sorry, but I really don't know how to change the code, as I have only
just started learning Python. How do I switch the version or change the code?
Because I don't really understand the code.
Here is the code:
'''
Copyright (c) 2008 Yap Sok Ann <sayap at sayap.com>
This module contains xmltv grabbers for Malaysia channels.
'''
__author__ = 'Yap Sok Ann <sayap at sayap.com>'
__license__ = 'PSF License'
import logging
from datetime import date as dt
from datetime import datetime, time, timedelta
from dateutil.tz import tzlocal
from httplib2 import Http
from lxml import etree
from urllib import urlencode
from BeautifulSoup import BeautifulSoup
channels = ['rtm1', 'rtm2', 'tv3', 'ntv7', '8tv', 'tv9']
datetime_format = '%Y%m%d%H%M%S %z'
h = Http()
h.force_exception_to_status_code = True
#h.timeout = 15
logging.basicConfig(
level=logging.DEBUG,
format='%(asctime)s %(levelname)-8s %(process)d %(message)s',
)
log = logging.getLogger(__name__)
def strclean(s):
s = s.strip().replace('‘', '\'').replace('’', '\'')
if s != ' ':
return s
class Grabber(object):
base_url = None
def __init__(self, channel):
self.channel = channel
self.url = self.base_url
def qs_params(self, date, **kwargs):
'''Returns a dict of params to form the url's query string
'''
raise NotImplementedError
def _parse_html(self, date, html):
'''Returns a list of dicts with the following keys:
- mandatory: title, start
- optional: stop, sub_title, desc, episode_number, episode_system
'''
raise NotImplementedError
def get_html(self, date, **kwargs):
params = self.qs_params(date, **kwargs)
response, content = h.request(self.url + '?' + urlencode(params))
if response.status == 200:
return BeautifulSoup(content)
else:
log.error('Status: %s\nContent: %s' % (response.status,
content))
def parse_html(self, date, html):
prev_schedule = None
try:
for schedule in self._parse_html(date, html):
if 'stop' in schedule:
yield schedule
elif prev_schedule:
prev_schedule['stop'] = schedule['start']
yield prev_schedule
prev_schedule = schedule
except:
log.exception('Cannot parse html for date %s' % date)
def to_xml(self, schedules):
for schedule in schedules:
program = etree.Element('programme', channel=self.channel,
start=schedule['start'].strftime(datetime_format),
stop=schedule['stop'].strftime(datetime_format))
title = etree.SubElement(program, 'title')
title.text = schedule['title']
if schedule.get('episode_num'):
episode_num = etree.SubElement(program, 'episode-num')
episode_num.set('system', schedule.get('episode_system'))
episode_num.text = schedule['episode_num']
for field in ['sub_title', 'desc']:
if schedule.get(field):
elem = etree.SubElement(program, field.replace('_',
'-'))
elem.text = schedule[field]
yield program
def grab(self, date, **kwargs):
html = self.get_html(date, **kwargs)
if html:
return self.to_xml(self.parse_html(date, html))
class Astro(Grabber):
    """Grabber for astro.com.my per-channel schedule pages."""

    base_url = 'http://www.astro.com.my/channels/%(channel)s/Default.asp'
    # The site serves each day's listing in two batches.
    params_dicts = [dict(batch=1), dict(batch=2)]
    # Placeholder rows that must not become programme entries.
    ignores = ['No Transmission', 'Transmission Ends']

    def __init__(self, channel):
        self.channel = channel
        self.url = self.base_url % dict(channel=channel)

    def qs_params(self, date, **kwargs):
        # The site expects dates like '23-Feb-2010'.
        kwargs['sDate'] = date.strftime('%d-%b-%Y')
        return kwargs

    def _parse_html(self, date, html):
        # Schedule rows follow the dark-blue header row; each row holds
        # [start 'HH:MM', title link, duration 'HH:MM'].
        header = html.find('tr', bgcolor='#29487F')
        for row in header.fetchNextSiblings('tr'):
            cells = row.findChildren('td')
            title = strclean(cells[1].find('a').string)
            if title in self.ignores:
                continue
            # Start time, '21:00' -> 9 PM local time.
            hour, minute = map(int, cells[0].string.split(':'))
            start = datetime.combine(
                date, time(hour, minute, tzinfo=tzlocal()))
            # Duration, '00:30' -> 30 minutes past start.
            dur_hours, dur_minutes = map(int, cells[2].string.split(':'))
            yield dict(title=title, start=start,
                       stop=start + timedelta(hours=dur_hours,
                                              minutes=dur_minutes))
class TheStar(Grabber):
    """Grabber for star-ecentral.com schedule pages."""

    base_url = 'http://star-ecentral.com/tvnradio/tvguide/guide.asp'
    params_dicts = [dict(db='live')]

    def qs_params(self, date, **kwargs):
        kwargs['pdate'] = date.strftime('%m/%d/%Y')
        # The site names the RTM channels 'tv1'/'tv2'.
        kwargs['chn'] = self.channel.replace('rtm', 'tv')
        return kwargs

    def _parse_html(self, date, html):
        # Rows follow the grey header row; columns are
        # [time '9.00pm', title, description, episode number].
        last_ampm = None
        header = html.find('tr', bgcolor='#5e789c')
        for row in header.fetchNextSiblings('tr'):
            cells = row.findChildren('td')
            schedule = {
                'title': strclean(cells[1].find('b').find('font').string),
                'desc': strclean(cells[2].find('font').string),
            }
            episode_num = strclean(cells[3].find('font').string)
            if episode_num:
                try:
                    # Numeric on-screen episodes are 1-based; xmltv_ns
                    # wants 0-based and dot-delimited.
                    episode_num = '.%d.' % (int(episode_num) - 1)
                    episode_system = 'xmltv_ns'
                except ValueError:
                    # Non-numeric: keep the raw text as 'onscreen'.
                    episode_system = 'onscreen'
                schedule['episode_num'] = episode_num
                schedule['episode_system'] = episode_system
            # Start time, '9.00pm' -> hour=21, minute=0.
            time_str = cells[0].find('font').string
            ampm = time_str[-2:]
            hour, minute = map(int, time_str[:-2].split('.'))
            if ampm == 'pm' and hour < 12:
                hour += 12
            elif ampm == 'am' and hour == 12:
                hour = 0
            # A pm row followed by an am row means we crossed midnight.
            if last_ampm == 'pm' and ampm == 'am':
                date = date + timedelta(1)
            schedule['start'] = datetime.combine(
                date, time(hour, minute, tzinfo=tzlocal()))
            last_ampm = ampm
            yield schedule
def main():
    """Command-line entry point.

    Grabs schedules for every channel over the requested date range and
    emits an XMLTV document to stdout or to a file.
    """
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option('-s', '--source', dest='source',
        help='SOURCE to grab from: Astro, TheStar. Default: TheStar')
    parser.add_option('-d', '--date', dest='date',
        help='Start DATE to grab schedules for (YYYY-MM-DD). Default: today')
    parser.add_option('-n', '--days', dest='days',
        help='Number of DAYS to grab schedules for. Default: 1')
    parser.add_option('-f', '--file', dest='filename', metavar='FILE',
        help='Output FILE to write to. Default: stdout')
    options, args = parser.parse_args()

    # Resolve the grabber class by name, e.g. 'Astro' -> class Astro.
    # NOTE(review): an unknown --source raises KeyError here.
    if options.source is None:
        cls = TheStar
    else:
        cls = globals()[options.source]
    if options.date is None:
        date = dt.today()
    else:
        date = dt(*[int(x) for x in options.date.split('-')])
    if options.days is None:
        days = 1
    else:
        days = int(options.days)

    root = etree.Element('tv')
    for channel in channels:
        grabber = cls(channel)
        for i in range(days):
            for params_dict in cls.params_dicts:
                # grab() returns None when the fetch fails; the original
                # crashed with "'NoneType' is not iterable" in that case.
                for elem in grabber.grab(date + timedelta(i),
                                         **params_dict) or []:
                    root.append(elem)
    xml = etree.tostring(root, encoding='UTF-8', xml_declaration=True,
                         pretty_print=True)
    if options.filename is None:
        # Parenthesized form prints identically under Python 2 for a
        # single value and stays parseable under Python 3.
        print(xml)
    else:
        # `with` closes the handle even on a write error; the original
        # leaked the file object.
        with open(options.filename, 'w') as f:
            f.write(xml)


if __name__ == '__main__':
    main()
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.python.org/pipermail/xml-sig/attachments/20100223/d3aa2fd4/attachment-0001.html>
More information about the XML-SIG
mailing list