sobering observation, python vs. perl
Peter Otten
__peter__ at web.de
Fri Mar 18 05:26:34 EDT 2016
Charles T. Smith wrote:
> On Thu, 17 Mar 2016 10:52:30 -0500, Tim Chase wrote:
>
>> Not saying this will make a great deal of difference, but these two
>> items jumped out at me. I'd even be tempted to just use string
>> manipulations for the isready aspect as well. Something like
>> (untested)
>
> well, I don't want to forgo REs in order to have python's numbers be
> better....
As has been said, for simple text processing tasks string methods are the
preferred approach in Python. I think this is more for clarity than
performance.
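For example (a made-up line, not one of the benchmark scripts below), both of
these extract the text in front of " is ready":

    import re

    line = "testcase_42 is ready\n"
    m = re.match(r"(.*) is ready", line)        # regex version
    tn_regex = m.group(1) if m else ""
    tn_plain = line.partition(" is ready")[0] if " is ready" in line else ""
    assert tn_regex == tn_plain == "testcase_42"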
If you need regular expressions, a simple way to boost performance may be to
use the external regex module.
(By the way, if you are looking for a simple way to iterate over multiple
files, use

    for line in fileinput.input():
        ...

)
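Applied to the no-regex script further down, that would also remove the
explicit loop over sys.argv[1:] (untested sketch):

    #!/usr/bin/env python
    import fileinput

    def main():
        tn = ""
        for line in fileinput.input():
            if " is ready" in line:
                tn = line.partition(" is ready")[0]
            elif "release_req" in line:
                print(tn)

    main()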
Some numbers:
$ time perl find.pl data/sample*.txt > r1.txt
real 0m0.504s
user 0m0.466s
sys 0m0.036s
$ time python find.py data/sample*.txt > r2.txt
real 0m2.403s
user 0m2.339s
sys 0m0.059s
$ time python find_regex.py data/sample*.txt > r3.txt
real 0m0.693s
user 0m0.631s
sys 0m0.060s
$ time python find_no_re.py data/sample*.txt > r4.txt
real 0m0.319s
user 0m0.267s
sys 0m0.048s
Python 3 slows things down:
$ time python3 find_no_re.py data/sample*.txt > r5.txt
real 0m0.497s
user 0m0.444s
sys 0m0.051s
The scripts:
$ cat find.pl
#!/usr/bin/env perl
while (<>) {
    if (/(.*) is ready/) {
        $tn = $1;
    }
    elsif (/release_req/) {
        print "$tn\n";
    }
}
$ cat find.py
#!/usr/bin/env python
import sys
import re
def main():
    isready = re.compile("(.*) is ready").match
    relreq = re.compile(".*release_req").match
    tn = ""
    for fn in sys.argv[1:]:
        with open(fn) as fd:
            for line in fd:
                match = isready(line)
                if match:
                    tn = match.group(1)
                elif relreq(line):
                    print(tn)

main()
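Side note on find.py: it binds the compiled pattern's match method to a name
up front,

    isready = re.compile("(.*) is ready").match

so the inner loop can call isready(line) directly instead of looking up
pattern.match on every line.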
$ cat find_regex.py
#!/usr/bin/env python
import sys
import regex as re
[rest the same as find.py]
$ cat find_no_re.py
#!/usr/bin/env python
import sys
def main():
    tn = ""
    for fn in sys.argv[1:]:
        with open(fn) as fd:
            for line in fd:
                if " is ready" in line:
                    tn = line.partition(" is ready")[0]
                elif "release_req" in line:
                    print(tn)

main()
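str.partition splits on the first occurrence of the separator and always
returns a 3-tuple, so [0] is the text in front of " is ready" (interpreter
session with a made-up line):

    >>> "testcase_42 is ready now".partition(" is ready")
    ('testcase_42', ' is ready', ' now')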
The test data was generated with
$ cat make_test_data.py
#!/usr/bin/env python3
import os
import random
import shutil
from itertools import islice
def make_line_factory(words, line_length, isready):
    choice = random.choice

    def make_line():
        while True:
            line = [choice(words)]
            length = len(line[0])
            while length < line_length:
                word = choice(words)
                line.append(word)
                length += len(word) + 1
            if random.randrange(100) < isready:
                pos = random.randrange(len(line))
                line[pos:pos+1] = ["is", "ready"]
            elif random.randrange(100) < isready:
                pos = random.randrange(len(line))
                line[pos:pos] = ["release_req"]
            yield " ".join(line)
    return make_line

def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--words", default="/usr/share/dict/words")
    parser.add_argument("--line-length", type=int, default=80)
    parser.add_argument("--num-lines", type=eval, default=10**5)
    parser.add_argument("--num-files", type=int, default=4)
    parser.add_argument("--name-template", default="sample{:0{}}.txt")
    parser.add_argument("--data-folder", default="data")
    parser.add_argument("--remove-data-folder", action="store_true")
    parser.add_argument("--first-match-percent", type=int, default=10)
    try:
        import argcomplete
    except ImportError:
        pass
    else:
        argcomplete.autocomplete(parser)
    args = parser.parse_args()
    if args.remove_data_folder:
        shutil.rmtree(args.data_folder)
    os.mkdir(args.data_folder)
    with open(args.words) as f:
        words = [line.strip() for line in f]
    make_line = make_line_factory(
        words, args.line_length, args.first_match_percent)()
    width = len(str(args.num_files))
    for index in range(1, args.num_files + 1):
        filename = os.path.join(
            args.data_folder,
            args.name_template.format(index, width))
        print(filename)
        with open(filename, "w") as f:
            for line in islice(make_line, args.num_lines):
                print(line, file=f)

if __name__ == "__main__":
    main()
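Run with the defaults, that gives four files of 10**5 lines with roughly 80
characters each, e.g.

    $ python3 make_test_data.py
    data/sample1.txt
    data/sample2.txt
    data/sample3.txt
    data/sample4.txt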