Pls help me...I want to save data to my database but I am unable to
Max Cuban
edzeame at gmail.com
Sat Jan 25 14:00:41 EST 2014
This is my first programming pet project. I have the following script that
extracts links from specific sites and display them on the web(via django).
The script work fine but I'm unable to save any stuff in my database.
Hence if I run the code, I get the output I want but then it always
extracts only new content. I will rather want to have the content scrapped
earlier saved to the database so that on subsequent run, it only scrap and
append ONLY new links to the list.
[ ]
Any help will be appreciated.
[]
# Create your views here.
from django.template.loader import get_template
from django.core.paginator import Paginator, EmptyPage, PageNotAnInteger
from django.shortcuts import render_to_response
from django.template import Context
from bs4 import BeautifulSoup
import urllib2, sys
import urlparse
import re
from datetime import date, datetime
from listing.models import jobLinks
def businessghana():
site = "http://www.businessghana.com/portal/jobs"
hdr = {'User-Agent' : 'Mozilla/5.0'}
req = urllib2.Request(site, headers=hdr)
jobpass = urllib2.urlopen(req)
soup = BeautifulSoup(jobpass)
for tag in soup.find_all('a', href = True):
tag['href'] = urlparse.urljoin('
http://www.businessghana.com/portal/', tag['href'])
return map(str, soup.find_all('a', href =
re.compile('.getJobInfo')))
def tonaton():
site = "http://tonaton.com/en/job-vacancies-in-ghana"
hdr = {'User-Agent' : 'Mozilla/5.0'}
req = urllib2.Request(site, headers=hdr)
jobpass = urllib2.urlopen(req)
soup = BeautifulSoup(jobpass)
result = []
# next two lines make all the links in the soup absolute
for tag in soup.find_all('a', href=True):
tag['href'] = urlparse.urljoin('http://www.tonaton.com',
tag['href'])
# assign all 'h2' tags to 'jobs'. The 'h2'tag contains the required
links
jobs = soup.find_all('h2')
# Loop through the 'h2' tags and extract all the links
for h2 in soup.find_all('h2'):
n = h2.next_element
if n.name == 'a': result.append(str(n))
return result
def jobscomgh():
site = "http://jobs.com.gh"
hdr = {'User-Agent' : 'Mozilla/5.0'}
req = urllib2.Request(site, headers=hdr)
jobpass = urllib2.urlopen(req)
soup = BeautifulSoup(jobpass)
return map(str, soup.find_all('a', href =
re.compile('.display-job')))
businessghana_links = businessghana()
tonaton_links = tonaton()
jobscomgh_links = jobscomgh()
def all_links():
return (businessghana_links + tonaton_links + jobscomgh_links)
def save_new_links(all_links):
current_links = jobLinks.objects.all()
for i in all_links:
if i not in current_links:
jobLinks.objects.create(url=i)
def this_week_links(all_links):
return jobLinks.objects.filter(date__gte =
datetime.timedelta(days=-7))
save_new_links(all_links)
this_week_links(all_links)
def display_links(request):
name = all_links()
paginator = Paginator(name, 25)
page = request.GET.get('page')
try:
name = paginator.page(page)
except PageNotAnInteger:
name = paginator.page(1)
except EmptyPage:
name = paginator.page(paginator.num_pages)
return render_to_response('jobs.html', {'name' : name})
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.python.org/pipermail/python-list/attachments/20140125/172e5279/attachment.html>
More information about the Python-list
mailing list