Comments on my first script?
Chris
cwitts at gmail.com
Thu Jun 12 11:29:28 EDT 2008
On Jun 12, 4:27 pm, Phillip B Oldham <phillip.old... at gmail.com> wrote:
> I'm keen on learning python, with a heavy lean on doing things the
> "pythonic" way, so threw the following script together in a few hours
> as a first-attempt in programming python.
>
> I'd like the community's thoughts/comments on what I've done;
> improvements I can make, "don'ts" I should be avoiding, etc. I'm not
> so much bothered about the resulting data - for the moment it meets my
> needs. But any comment is welcome!
>
> #!/usr/bin/env python
> ## Open a file containing a list of domains (1 per line),
> ## request and parse its whois record and push to a csv
> ## file.
>
> import subprocess
> import re
>
> src = open('./domains.txt')
>
> dest = open('./whois.csv', 'w');
>
> sep = "|"
> headers = ["Domain","Registrant","Registrant's Address","Registrar",
> "Registrant Type","Date Registered","Renewal Date","Last Updated",
> "Name Servers"]
>
> dest.write(sep.join(headers)+"\n")
>
> def trim( txt ):
> x = []
> for line in txt.split("\n"):
> if line.strip() == "":
> continue
> if line.strip().startswith('WHOIS'):
> continue
> if line.strip().startswith('>>>'):
> continue
> if line.strip().startswith('%'):
> continue
> if line.startswith("--"):
> return ''.join(x)
> x.append(" "+line)
> return "\n".join(x)
>
> def clean( txt ):
> x = []
> isok = re.compile("^\s?([^:]+): ").match
> for line in txt.split("\n"):
> match = isok(line)
> if not match:
> continue
> x.append(line)
> return "\n".join(x);
>
> def clean_co_uk( rec ):
> rec = rec.replace('Company number:', 'Company number -')
> rec = rec.replace("\n\n", "\n")
> rec = rec.replace("\n", "")
> rec = rec.replace(": ", ":\n")
> rec = re.sub("([^(][a-zA-Z']+\s?[a-zA-Z]*:\n)", "\n\g<0>", rec)
> rec = rec.replace(":\n", ": ")
> rec = re.sub("^[ ]+\n", "", rec)
> return rec
>
> def clean_net( rec ):
> rec = rec.replace("\n\n", "\n")
> rec = rec.replace("\n", "")
> rec = rec.replace(": ", ":\n")
> rec = re.sub("([a-zA-Z']+\s?[a-zA-Z]*:\n)", "\n\g<0>", rec)
> rec = rec.replace(":\n", ": ")
> return rec
>
> def clean_info( rec ):
> x = []
> for line in rec.split("\n"):
> x.append(re.sub("^([^:]+):", "\g<0> ", line))
> return "\n".join(x)
>
> def record(domain, record):
> details = ['','','','','','','','','']
> for k, v in record.items():
> try:
> details[0] = domain.lower()
> result = {
> "registrant": lambda: 1,
> "registrant name": lambda: 1,
> "registrant type": lambda: 4,
> "registrant's address": lambda: 2,
> "registrant address1": lambda: 2,
> "registrar": lambda: 3,
> "sponsoring registrar": lambda: 3,
> "registered on": lambda: 5,
> "registered": lambda: 5,
> "domain registeration date": lambda: 5,
> "renewal date": lambda: 6,
> "last updated": lambda: 7,
> "domain last updated date": lambda: 7,
> "name servers": lambda: 8,
> "name server": lambda: 8,
> "nameservers": lambda: 8,
> "updated date": lambda: 7,
> "creation date": lambda: 5,
> "expiration date": lambda: 6,
> "domain expiration date": lambda: 6,
> "administrative contact": lambda: 2
> }[k.lower()]()
> if v != '':
> details[result] = v
> except:
> continue
>
> dest.write(sep.join(details)+"\n")
>
> ## Loop through domains
> for domain in src:
>
> domain = domain.strip()
>
> if domain == '':
> continue
>
> rec = subprocess.Popen(["whois",domain],
> stdout=subprocess.PIPE).communicate()[0]
>
> if rec.startswith("No whois server") == True:
> continue
>
> if rec.startswith("This TLD has no whois server") == True:
> continue
>
> rec = trim(rec)
>
> if domain.endswith(".net"):
> rec = clean_net(rec)
>
> if domain.endswith(".com"):
> rec = clean_net(rec)
>
> if domain.endswith(".tv"):
> rec = clean_net(rec)
>
> if domain.endswith(".co.uk"):
> rec = clean_co_uk(rec)
>
> if domain.endswith(".info"):
> rec = clean_info(rec)
>
> rec = clean(rec)
>
> details = {}
>
> try:
> for line in rec.split("\n"):
> bits = line.split(': ')
> a = bits.pop(0)
> b = bits.pop(0)
> details[a.strip()] = b.strip().replace("\t", ", ")
> except:
> continue
>
> record(domain, details)
>
> ## Cleanup
> src.close()
> dest.close()
Just a few quick things before I leave work.
#!/usr/bin/env python
"""Open a file containing a list of domains (1 per line),
request and parse its whois record and push to a csv
file.
""" # Rather use docstrings than multiline commenting like that.
def trim(txt):
    """Strip boilerplate lines from raw whois output.

    Blank lines, and lines whose stripped text starts with 'WHOIS', '>>>'
    or '%', are dropped. Every kept line is prefixed with a single space.
    Processing stops at the first line that starts with '--'.

    Returns the kept lines joined with newlines, or, when a '--' line ends
    processing early, the kept lines concatenated with no separator.
    """
    x = []
    for line in txt.splitlines():  # Strings have a built in function
        stripped = line.strip()
        # startswith() accepts a tuple, so you can do them in one if
        # statement. Test the stripped text so indented marker lines are
        # skipped too, as the quoted script does with line.strip().
        if not stripped or stripped.startswith(('WHOIS', '>>>', '%')):
            continue
        if line.startswith('--'):
            # NOTE(review): this early return joins without newlines while
            # the normal return uses '\n'; kept as in the quoted script.
            # Confirm that is intended.
            return ''.join(x)
        x.append(' ' + line)
    return '\n'.join(x)
# Main loop: look up each domain listed in `src`, normalise the whois text
# and collect "key: value" pairs into `details`.
for domain in src:
    # Strip once up front. Lines read from a file keep their trailing
    # newline, which would otherwise make every endswith() test below fail.
    domain = domain.strip()
    if not domain:
        continue  # A line with nothing is False
    rec = subprocess.Popen(["whois", domain],
                           stdout=subprocess.PIPE).communicate()[0]
    if rec.startswith('No whois server') \
            or rec.startswith('This TLD has no whois server'):
        continue  # Startswith will return True/False so it is enough
    rec = trim(rec)
    if domain.endswith('.net'):
        rec = clean_net(rec)
    elif domain.endswith('.com'):
        # Rather use if/elif statements unless somehow you think you
        # will match more than one.
        rec = clean_net(rec)
    # .... (remaining TLD branches, the clean() call and `details = {}`
    # are elided here; they are unchanged from the quoted script)
    for line in rec.splitlines():
        try:
            a, b = line.split(': ')[:2]
        except ValueError:  # No ': ' in the line, so nothing to unpack
            continue
        details[a.strip()] = b.strip().replace('\t', ', ')
Hope that's a start.
More information about the Python-list
mailing list