[Tutor] extract uri from beautiful soup string

Norman Khine norman at khine.net
Sun Oct 14 20:53:16 CEST 2012


Please ignore my earlier question; I got it working:

get_url = re.compile(r"""window.open\('(.*)','','toolbar=0,""",
re.DOTALL).findall


...

			get_onclick = str(soup('a')[0]['onclick']) # get the 'onclick' attribute
			urls = get_url(get_onclick)
			print assoc_name, urls, assoc_theme

returns

Amiral ['http://DOMAIN.TLD/extranet/associations/detail-assos.php?id=3815']
Culture


On Sun, Oct 14, 2012 at 7:05 PM, Norman Khine <norman at khine.net> wrote:
> hello, i have this code:
>
>
> #!/usr/local/bin/python
> # -*- coding: utf-8 -*-
>
> import re
> import urllib2
> import BeautifulSoup
>
> origin_site = 'http://DOMAIN.TLD/index.php?id=annuaire_assos&theme=0&rech=&num_page='
>
> pages = range(1,3)
>
> for page_no in pages:
>         print '====== %s' % page_no
>         req = ('%s%s' % (origin_site, page_no))
>         user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
>         headers = { 'User-Agent' : user_agent }
>         items = []
>         try:
>                 urllib2.urlopen(req)
>         except urllib2.URLError, e:
>                 pass
>         else:
>                 # do something with the page
>                 doc = urllib2.urlopen(req)
>                 soup = BeautifulSoup.BeautifulSoup(doc)
>                 infoblock = soup.findAll('tr', { "class" : "menu2" })
>                 for item in infoblock:
>                         soup = BeautifulSoup.BeautifulSoup(str(item))
>                         for tag in soup.recursiveChildGenerator():
>                                 if isinstance(tag,BeautifulSoup.Tag) and tag.name in ('td'):
>                                         if tag.string is not None:
>                                                 assoc_name = (tag.string)
>                                 if isinstance(tag,BeautifulSoup.Tag) and tag.name in ('u'):
>                                         if tag.string is not None:
>                                                 assoc_theme = (tag.string)
>
>                         get_onclick = soup('a')[0]['onclick'] # get the 'onclick' attribute
>                         print assoc_name, get_onclick, assoc_theme
>
>
> this returns the following:
>
> Amiral window.open('http://DOMAIN.TLD/extranet/associations/detail-assos.php?id=3815','','toolbar=0,menubar=0,location=0,scrollbars=1,top=80,left=400,width=500,height=400');return
> false Culture
>
> How do I correctly extract the URL
> 'http://DOMAIN.TLD/extranet/associations/detail-assos.php?id=3815'
> from the get_onclick string?
>
> Any advice much appreciated.
>
>
>
> --
> %>>> "".join( [ {'*':'@','^':'.'}.get(c,None) or
> chr(97+(ord(c)-83)%26) for c in ",adym,*)&uzq^zqf" ] )



-- 
%>>> "".join( [ {'*':'@','^':'.'}.get(c,None) or
chr(97+(ord(c)-83)%26) for c in ",adym,*)&uzq^zqf" ] )


More information about the Tutor mailing list