How to do this in python with regular expressions

snorble at hotmail.com snorble at hotmail.com
Sun May 27 18:11:44 EDT 2007


On May 25, 6:51 am, Jia Lu <Roka... at gmail.com> wrote:
> Hi all
>
>  I'm trying to parsing html with re module.
>
>  html = """
>  <TABLE BORDER=1 cellspacing=0 cellpadding=2>
> <TR>
>
> <TH nowrap>DATA1</TH><TH nowrap>DATA2</HT><TH nowrap>DATA3</
> HT><TH>DATA4</TH>
> </TR>
>
> <TR><TD>DATA5</TD><TD>DATA6</TD><TD>DATA7</TD><TD>DATA8</TD></TR>
>
> </TABLE>
> """
>
> I want to get DATA1-8 from that string.(DATA maybe not english words.)
> Can anyone tell me how to do it with regular expression in python?
>
> Thank you very much.



# example1.py
# This example will print out more than what's in the HTML table. It would
# also print out text between <body></body> tags, and so on.

import HTMLParser

class DataParser(HTMLParser.HTMLParser):
    """HTML parser that echoes each non-blank run of text on its own line."""

    def handle_data(self, data):
        """Print ``data`` without surrounding whitespace; skip blank runs."""
        text = data.strip()
        if not text:
            # Whitespace-only runs (newlines/indentation between tags).
            return
        print(text)

# Markup taken from the quoted question.  The </HT> end tags (where </TH>
# would be expected) and the line break inside the second one are kept
# exactly as they appear there.
html = '''
<TABLE BORDER=1 cellspacing=0 cellpadding=2>
<TR>

<TH nowrap>DATA1</TH><TH nowrap>DATA2</HT><TH nowrap>DATA3</
HT><TH>DATA4</TH>
</TR>

<TR><TD>DATA5</TD><TD>DATA6</TD><TD>DATA7</TD><TD>DATA8</TD></TR>

</TABLE>
'''

# Feed the whole document to the parser in one call, then close it.
parser = DataParser()
parser.feed(html)
parser.close()



example1.py output:

$ python example1.py
DATA1
DATA2
DATA3
DATA4
DATA5
DATA6
DATA7
DATA8



# example2.py
# This example uses the re module to pull out only the table portions of
# HTML. This should only print out data between <table></table> tags. Notice
# that there is some data between the <body></body> tags that is not present
# in the output.

import HTMLParser
import re

class DataParser(HTMLParser.HTMLParser):
    """Parser whose only job is to print the text found between tags."""

    def handle_data(self, data):
        """Print one stripped text run per line; drop whitespace-only runs."""
        stripped = data.strip()
        if stripped:
            print(stripped)

# Test document: two tables, plus text placed directly inside <body> before
# and after them.  Only the table contents are expected in the output.
html = '''
<html>
  <head></head>
  <body>
    body data 1
    <table>
          <tr><td>table 1 data 1</td></tr>
          <tr><td>table 1 data 2</td></tr>
    </table>

    <table>
          <tr><td>table 2 data 1</td></tr>
          <tr><td>table 2 data 2</td></tr>
    </table>
    body data 2
  </body>
</html>
'''

# Collect every <table>...</table> span.  The opening tag may carry
# attributes (e.g. <TABLE BORDER=1 cellspacing=0 cellpadding=2>), so accept
# anything up to its closing '>' instead of matching the bare literal
# '<table>'.  The non-greedy '.*?' stops at the first closing tag, so nested
# tables are not handled by this approach.
tables_list = re.findall(r'<table\b[^>]*>.*?</table\s*>', html,
                         re.DOTALL | re.IGNORECASE)
tables_html = ' '.join(tables_list)

# Only the extracted table markup is parsed, so body text never reaches
# handle_data().
parser = DataParser()
parser.feed(tables_html)
parser.close()



example2.py output:

$ python example2.py
table 1 data 1
table 1 data 2
table 2 data 1
table 2 data 2



# example3.py
# This example does basically the same thing as example2.py, but it uses
# HTMLParser to keep track of whether the data is between <table></table> tags.

import HTMLParser

class DataParser(HTMLParser.HTMLParser):
    """Print only the text that appears inside <table>...</table> elements.

    ``table_count`` holds the number of <table> tags currently open, so text
    inside nested tables is printed as well.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        # Number of <table> elements open at the current parse position.
        self.table_count = 0

    def handle_starttag(self, tag, attrs):
        """Record that one more <table> is open."""
        if tag == 'table':
            self.table_count += 1

    def handle_endtag(self, tag):
        """Record that a <table> was closed, ignoring unmatched </table>."""
        # Clamp at zero: a stray </table> must not push the counter negative,
        # otherwise the text of the next real table would be suppressed.
        if tag == 'table' and self.table_count > 0:
            self.table_count -= 1

    def handle_data(self, data):
        """Print ``data`` (whitespace-stripped) when inside a table."""
        text = data.strip()
        if text and self.table_count > 0:
            # Single-argument parenthesised print behaves the same as the
            # print statement on Python 2 and is also valid on Python 3.
            print(text)

# Test document: two tables, plus text placed directly inside <body> before
# and after them.
html = '''
<html>
  <head></head>
  <body>
    body data 1
    <table>
          <tr><td>table 1 data 1</td></tr>
          <tr><td>table 1 data 2</td></tr>
    </table>

    <table>
          <tr><td>table 2 data 1</td></tr>
          <tr><td>table 2 data 2</td></tr>
    </table>
    body data 2
  </body>
</html>
'''

# The full document is fed in; DataParser itself decides which text to print.
parser = DataParser()
parser.feed(html)
parser.close()



example3.py output:

$ python example3.py
table 1 data 1
table 1 data 2
table 2 data 1
table 2 data 2




More information about the Python-list mailing list