#!/usr/bin/env python

import io
import os.path
import sys

# Download and unzip http://www.gutenberg.org/files/3200/old/mtent12.zip
# The extracted text file is opened from the directory containing this script.
TESTFILE = 'mtent12.txt'

# Tested engines are:
# * re, standard regular expression module
# * regex, alternative regular expression module: https://pypi.python.org/pypi/regex/
# * re2, Python wrapper for Google's RE2: https://pypi.python.org/pypi/re2/
# * pcre, Python PCRE bindings: https://pypi.python.org/pypi/python-pcre/
# Engines that fail to import are skipped, so only the installed ones are timed.
engines = 're', 'regex', 're2', 'pcre'

# Patterns to benchmark.  Each one is compiled by every available engine and
# run with findall() over the same slice of the test file; all engines must
# report the same number of matches.  The plain literal 'Twain' is additionally
# timed against a str.find()-based scanner for comparison.
tests = (
    r'Twain',
    r'(?i)Twain',
    r'[a-z]shing',
    r'Huck[a-zA-Z]+|Saw[a-zA-Z]+',
    r'\b\w+nn\b',
    r'[a-q][^u-z]{13}x',
    r'Tom|Sawyer|Huckleberry|Finn',
    r'(?i)Tom|Sawyer|Huckleberry|Finn',
    r'.{0,2}(Tom|Sawyer|Huckleberry|Finn)',
    r'.{2,4}(Tom|Sawyer|Huckleberry|Finn)',
    r'Tom.{10,25}river|river.{10,25}Tom',
    r'[a-zA-Z]+ing',
    r'\s[a-zA-Z]{0,12}ing\s',
    r'([A-Za-z]awyer|[A-Za-z]inn)\s',
    r'''["'][^"']{0,30}[?!\.]["']''',
)


class strfinder:
    """Literal-substring matcher exposing the ``findall`` method of a pattern.

    It lets a plain ``str.find`` scan be timed through the same interface as
    the compiled regular expressions.
    """

    def __init__(self, s):
        """Remember *s*, the literal substring to search for."""
        self.sub = s

    def findall(self, text):
        """Return a list with one copy of the substring per occurrence.

        Occurrences are found left to right and do not overlap.  An empty
        substring matches once at every position, including the end of
        *text*, which is what ``re.findall('', text)`` returns.
        """
        sub = self.sub
        # Step by at least one character: with an empty substring the match
        # has zero length and the scan would otherwise never move forward.
        step = len(sub) or 1
        # Bound methods are cached in locals to keep the timed loop tight.
        find = text.find
        res = []
        append = res.append
        i = find(sub)
        while i >= 0:
            append(sub)
            i = find(sub, i + step)
        return res


def test_regex_twain(p, text, timer, iterations, maxtime=10):
    # Warm up.
    count = len(p.findall(text))
    times = []
    ts = 0
    for i in range(iterations):
        t0 = timer()
        res = p.findall(text)
        t = timer() - t0
        assert len(res) == count
        times.append(t)
        ts += 1
        if ts > maxtime:
            break
    return count, min(times)

if __name__ == '__main__':
    # Use perf_counter where the time module provides it, else plain time().
    try:
        from time import perf_counter as timer
    except ImportError:
        from time import time as timer

    # A workaround for re2 bug: when the builtins module is importable,
    # give it a basestring name aliased to str.
    try:
        import builtins
    except ImportError:
        pass
    else:
        builtins.basestring = str

    # Collect (name, module) pairs for every engine that can be imported.
    available = []
    for engine_name in engines:
        try:
            engine = __import__(engine_name)
        except (ImportError, NameError):
            continue
        available.append((engine_name, engine))

    # Load the test file from the script's directory and keep a fixed slice.
    corpus_path = os.path.join(os.path.dirname(__file__), TESTFILE)
    with io.open(corpus_path, encoding='latin1') as corpus:
        text = corpus.read()
    assert len(text) == 19665221
    text = text[6000000:8000000]

    write = sys.stdout.write
    flush = sys.stdout.flush

    # Header row: blank pattern and count columns, then one column per engine.
    header = ['%-35s' % '', ' %5s ' % '']
    header.extend(' %6s' % engine_name for engine_name, _ in available)
    header.append(' %6s' % 'str.find')
    header.append('\n\n')
    write(''.join(header))

    for pattern in tests:
        write('%-35s' % pattern)
        flush()
        count = None
        for _, engine in available:
            c, t = test_regex_twain(engine.compile(pattern), text, timer, 5)
            if count is not None:
                # Every engine has to agree with the first one.
                assert c == count
            else:
                count = c
                write(' %5d ' % count)
            # Times are printed in milliseconds.
            write(' %6.4g' % (t * 1000))
            flush()
        if pattern == 'Twain':
            # The plain literal is also timed with the str.find-based scanner.
            c, t = test_regex_twain(strfinder(pattern), text, timer, 5)
            assert c == count
            write(' %6.4g' % (t * 1000))
            flush()
        write('\n')
        flush()
