[Tutor] customizing dark_harvest problems

Jason Willis chaoticslacker at gmail.com
Thu Apr 7 20:51:29 EDT 2016


Hi,

I am a complete noob when it comes to Python and programming in general.
Though I do know some things and can figure a little bit out when looking at
source code, I'm usually at a loss when it comes to understanding the entire
workings of a program. Any and all help provided here would be greatly
appreciated and will further my journey into learning how to code in Python,
which will be the first language I learn.

Recently I found a program that does a marvelous job at what I'm trying to
do. That program is called grey_harvest. It harvests proxies from one website
and spits them out, filtered by country and other command-line arguments.

What I understand from the code so far is that the website being queried is
http://freeproxylists.com . This address is put into a variable (DOC_ROOT).
The proxies are then read from the "elite.html" page on that website, and
this page name is also put into a variable (ELITE_PAGE). What I would like to
do is change that second variable from "elite.html" to "standard.html" and
have the program work exactly the same way. Even though, as far as I can
tell, the two web pages are structured identically, the program does not work
when I change the variable. What am I missing here? I haven't the slightest
idea. Hopefully someone here is willing to help me along.
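
To be concrete, the change I am making is (as far as I can tell) just to the
second of those config lines near the top of the script, so the configs
section ends up looking like this:

''' configs '''
DOC_ROOT          = 'http://freeproxylists.com'
ELITE_PAGE        = 'standard.html'      # was 'elite.html'

Everything else is left exactly as it appears in the full listing below.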

Thanks!


The code is, I think, short and sweet. It is as follows:


"""
''' File:        grey_harvest.py
''' Author:      s0lst1c3
''' Created:     Tue May 26 2015
''' Source:      https://github.com/s0lst1c3/grey_harvest
''' License:     MIT (see attached)
''' Description: Scrapes the web for reliable http or https proxies and
prints
'''              them  to  stdout.  Can  also be  used as a  python library
to
'''              easily  generate  reliable  proxies  for  use  within
Python
'''              application (see README.md).
"""

__version__   = '0.1.1'
__author__    = 'John "s0lst1c3" Ryan'
__license__   = 'MIT'
__copyright__ = 'Copyright (c) 2015 John Ryan'

import requests
import socket
import sys
import argparse
from time import sleep
from bs4  import BeautifulSoup
from lxml import etree

''' configs '''
DOC_ROOT          = 'http://freeproxylists.com'
ELITE_PAGE        = 'elite.html'
HTTPS_ONLY        = True
ALLOWED_COUNTRIES = None
DENIED_COUNTRIES  = ['China']
MAX_TIMEOUT       = 1
TEST_SLEEPTIME    = 1
TEST_DOMAIN       = 'example.com'

class Proxy(dict):

    def __init__(self, ip, port, country=None,
            latency=None, https=False, last_checked=None):

        dict.__init__(self)
        self.ip = ip
        self.port = int(port)
        self.country = country
        self.latency = int(latency)
        self.https = https
        self['ip'] = ip
        self['port'] = port
        self['country'] = country
        self['latency'] = latency
        self['https'] = https

    def test(self,
            test_domain=TEST_DOMAIN,
            test_sleeptime=TEST_SLEEPTIME,
            max_timeout=MAX_TIMEOUT):

                ''' get ready for test '''
                protocol = 'https' if self['https'] else 'http'
                test_url = '%s://%s' % (protocol, test_domain)
                proxies = {
                    'https://%s' : str(self),
                    'http://%s' : str(self),
                }

                ''' make a brief HEAD request to test_domain and see if it times out '''
                try:
                    response = requests.head(test_url, timeout=max_timeout, proxies=proxies)
                    if test_sleeptime > 0:
                        sleep(test_sleeptime)
                    return True
                except requests.exceptions.ConnectionError:
                    if test_sleeptime > 0:
                        sleep(test_sleeptime)
                    return False

    def __str__(self):
        return '%s:%s' % (self.ip, self.port)

class GreyHarvester(object):

    def __init__(self,
            test_domain=TEST_DOMAIN,
            test_sleeptime=TEST_SLEEPTIME,
            https_only=HTTPS_ONLY,
            allowed_countries=ALLOWED_COUNTRIES,
            denied_countries=DENIED_COUNTRIES,
            max_timeout=MAX_TIMEOUT):

                self.allowed_countries = allowed_countries
                self.denied_countries = denied_countries
                self.max_timeout = max_timeout
                self.test_sleeptime = test_sleeptime
                self.test_domain = test_domain
                self.https_only = https_only

    def run(self):
        for endpoint in self._extract_ajax_endpoints():
            for proxy in self._extract_proxies(endpoint):
                if self._passes_filter(proxy) and proxy.test(
                        test_domain=self.test_domain,
                        test_sleeptime=self.test_sleeptime,
                        max_timeout = self.max_timeout,
                    ) == True: yield proxy

    def _extract_proxies(self, ajax_endpoint):

        ''' request the xml object '''
        proxy_xml = requests.get(ajax_endpoint)

        root = etree.XML(str(proxy_xml.text))
        quote = root.xpath('quote')[0]

        ''' extract the raw text from the body of the quote tag '''
        raw_text = quote.text

        ''' eliminate the stuff we don't need '''
        proxy_data = raw_text.split('You will definitely love it! Give it a try!</td></tr>')[1]

        ''' get rid of the </table> at the end of proxy_data '''
        proxy_data = proxy_data[:-len('</table>')]

        ''' split proxy_data into rows '''
        table_rows = proxy_data.split('<tr>')

        ''' convert each row into a Proxy object '''
        for row in table_rows:

            ''' get rid of the </tr> at the end of each row '''
            row = row[:-len('</tr>')]

            ''' split each row into a list of items '''
            items = row.split('<td>')

            ''' sometimes we get weird lists containing only an empty string '''
            if len(items) != 7:
                continue

            ''' we'll use this to remove the </td> from the end of each item '''
            tdlen = len('</td>')

            ''' create proxy dict '''
            proxy = Proxy(
                ip=items[1][:-tdlen],
                port=int(items[2][:-tdlen]),
                https=bool(items[3][:-tdlen]),
                latency=int(items[4][:-tdlen]),
                last_checked=items[5][:-tdlen],
                country=items[6][:-tdlen],
            )
            yield proxy

    def _passes_filter(self, proxy):

        ''' avoid redundant and space consuming calls to 'self' '''

        ''' validate proxy based on provided filters '''
        if self.allowed_countries is not None and proxy['country'] not in self.allowed_countries:
            return False
        if self.denied_countries is not None and proxy['country'] in self.denied_countries:
            return False
        if self.https_only and proxy['https'] == False:
            return False
        return True

    def _extract_ajax_endpoints(self):

        ''' make a GET request to freeproxylists.com/elite.html '''
        url = '/'.join([DOC_ROOT, ELITE_PAGE])
        response = requests.get(url)

        ''' extract the raw HTML doc from the response '''
        raw_html = response.text

        ''' convert raw html into BeautifulSoup object '''
        soup = BeautifulSoup(raw_html)

        for url in soup.select('table tr td table tr td a'):
            if 'elite #' in url.text:
                yield '%s/load_elite_d%s' % (DOC_ROOT, url['href'].lstrip('elite/'))

def setup(parser):

    parser.add_argument('-a', '--allowed-countries',
                    dest='allowed_countries',
                    nargs='*',
                    metavar='<country>',
                    required=False,
                    default=ALLOWED_COUNTRIES,
                    help='''Only use proxies physically located in the specified countries.'''
    )
    parser.add_argument('-d', '--denied-countries',
                    dest='denied_countries',
                    nargs='*',
                    metavar='<country_1>',
                    default=DENIED_COUNTRIES,
                    required=False,
                    help='''Do not use proxies physically located in these countries. This flag takes precedence over --allowed-countries.'''
    )
    parser.add_argument('-t', '--max-timeout',
                    dest='max_timeout',
                    nargs=1,
                    type=int,
                    metavar='<N>',
                    default=MAX_TIMEOUT,
                    required=False,
                    help='Discard proxies that do not respond within <N> seconds of HEAD request.'
    )
    parser.add_argument('-H', '--https-only',
                    action='store_true',
                    dest='https_only',
                    default=HTTPS_ONLY,
                    help='Only keep proxies with https support.',
    )
    parser.add_argument('-D', '--test-domain',
                    dest='test_domain',
                    nargs=1,
                    metavar='<test_domain>',
                    default=TEST_DOMAIN,
                    required=False,
                    help='Test proxies by making HEAD request to <test domain>',
    )
    parser.add_argument('-n', '--num-proxies',
                    dest='num_proxies',
                    nargs=1,
                    type=int,
                    metavar='<N>',
                    required=True,
                    help='Harvest <N> working and free proxies from teh interwebz',
    )

    args = parser.parse_args()

    return {
        'num_proxies' : args.num_proxies[0],
        'test_domain' : args.test_domain,
        'https_only' : args.https_only,
        'max_timeout' : args.max_timeout,
        'allowed_countries' : args.allowed_countries,
        'denied_countries' : args.denied_countries,
    }

def main():

    ''' set things up '''
    configs =  setup(argparse.ArgumentParser())
    harvester = GreyHarvester(
        test_domain=configs['test_domain'],
        test_sleeptime=TEST_SLEEPTIME,
        https_only=configs['https_only'],
        allowed_countries=configs['allowed_countries'],
        denied_countries=configs['denied_countries'],
        max_timeout=configs['max_timeout']
    )

    ''' harvest free and working proxies from teh interwebz '''
    count = 0
    for proxy in harvester.run():
        if count >= configs['num_proxies']:
            break
        print proxy
        count += 1

if __name__ == '__main__':
    main()

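One thing I noticed while re-reading the code to write this email: the word
'elite' does not only appear in the ELITE_PAGE config at the top. The
_extract_ajax_endpoints method also builds the AJAX URLs out of hardcoded
'elite' strings, in these lines:

        for url in soup.select('table tr td table tr td a'):
            if 'elite #' in url.text:
                yield '%s/load_elite_d%s' % (DOC_ROOT, url['href'].lstrip('elite/'))

Could that be part of my problem? I am only guessing that the standard.html
page would need different strings there (maybe 'standard' instead of
'elite'), so I have not tried changing them, and I would appreciate being
pointed in the right direction.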


