URL parsing for the hard cases

Mon Jul 23 04:10:37 EDT 2007

On 7/22/07, John Nagle wrote:
> Is there any library function that correctly tests for an IP address vs. a
> domain name based on syntax, i.e. without looking it up in DNS?

import re, string

# Splits a URL network location of the form "[userinfo@]host[:port]".
# Capture groups:
#   1 - userinfo (everything before the "@"), if present
#   2 - bracketed IPv6 literal, without the surrounding brackets
#   3 - IPv4 address or registered name (mutually exclusive with group 2)
#   4 - port digits, if present
NETLOC_RE = re.compile(r'''^ #    start of string
    (?:([^@]+)@)?            # 1: optional userinfo
    (?:\[([0-9a-fA-F:]+)\]|  # 2: IPv6 addr
    ([^\[\]:]+))             # 3: IPv4 addr or reg-name
    (?::(\d+))?              # 4: optional port
$''', re.VERBOSE)            #    end of string

def _parse_ipv4_number(text):
    """Parse one dot-separated address part; return an int, or None if malformed.

    Three notations are accepted: ``0x``/``0X``-prefixed hexadecimal, octal
    written with a leading ``0``, and plain decimal.  Signs, whitespace,
    empty strings and any other characters are rejected.
    """
    if text[:2] in ('0x', '0X'):
        digits, base, allowed = text[2:], 16, string.hexdigits
    elif len(text) > 1 and text[0] == '0':
        digits, base, allowed = text[1:], 8, string.octdigits
    else:
        digits, base, allowed = text, 10, string.digits
    if not digits or any(ch not in allowed for ch in digits):
        return None
    return int(digits, base)


def normalize_IPv4(netloc):
    """Return the host of *netloc* as a dotted-quad string, or None.

    *netloc* is matched against NETLOC_RE and its group 3 (the non-bracketed
    host) is interpreted as an IPv4 address.  One to four dot-separated
    numbers are accepted; every number except the last must fit in a single
    octet, and the last number fills all remaining octets (so ``'127.1'``
    becomes ``'127.0.0.1'`` and ``'0x7f000001'`` becomes ``'127.0.0.1'``).

    Returns None when *netloc* does not match NETLOC_RE, when the host is a
    bracketed literal, or when the host is not a valid address in the
    notation described above.
    """
    match = NETLOC_RE.match(netloc)
    if match is None:
        return None
    host = match.group(3)
    if host is None:  # a bracketed literal matched instead (group 2)
        return None

    parts = host.split('.')
    if len(parts) > 4:
        return None

    numbers = []
    for part in parts:
        value = _parse_ipv4_number(part)
        if value is None:
            return None
        numbers.append(value)

    # The final number covers every octet not claimed by an earlier part.
    spare = 5 - len(numbers)
    if any(n > 255 for n in numbers[:-1]) or numbers[-1] >= 256 ** spare:
        return None

    # Expand the final number into its big-endian octets.
    last = numbers.pop()
    for shift in range(8 * (spare - 1), -8, -8):
        numbers.append((last >> shift) & 0xFF)
    return '.'.join(str(n) for n in numbers)

def is_ip(netloc):
    """Return True if the host in *netloc* is syntactically an IP address.

    The host counts as an IP address when normalize_IPv4() accepts it, or
    when NETLOC_RE matches it as a bracketed literal (group 2).  No DNS
    lookup is performed.
    """
    if normalize_IPv4(netloc) is not None:
        return True
    # IPv6 validation could be stricter: any bracketed run of hex digits
    # and colons is accepted here.
    parsed = NETLOC_RE.match(netloc)
    return bool(parsed and parsed.group(2))

The first function, I'd imagine, is the more interesting of the two.

-Miles