[CentralOH] Pipeness _Within_ Python
Neil Ludban
nludban at columbus.rr.com
Fri Aug 17 04:40:40 CEST 2012
On Thu, 16 Aug 2012 15:26:34 -0400
jep200404 at columbus.rr.com wrote:
> How would one do the following[1] entirely within Python[2]?
>
> python mogrify.py <(cat "$filename" \
> | tee >(sha1sum >&3) >(wc -c >&4) /dev/null \
> | gunzip | tee >(sha1sum >&5) >(wc -l -c >&6) /dev/null)
>
>
>
> [1] Calculate sha1sums and byte count of compressed and
> uncompressed file, reading file only once (without using
> temporary files and without saving file in memory).
$ cat moogrify.py | gzip > moogrify.py.gz
$ ls -l moogrify.py*
-rwxr-xr-x 1 neil wheel 1116 Aug 16 22:19 moogrify.py*
-rw-r--r-- 1 neil wheel 435 Aug 16 22:22 moogrify.py.gz
$ wc -l moogrify.py
50 moogrify.py
$ sha1 moogrify.py moogrify.py.gz
SHA1 (moogrify.py) = f8abef74e7d7de1592ea9ba888886e21befa762f
SHA1 (moogrify.py.gz) = b6e747168a6725c55b9e5abbe58d5fcb845df9e6
$ ./moogrify.py < moogrify.py.gz
Input: nBytes=435 sha1sum=aa92f0df4752c4ab2a61a1e54fa7fb3c6ff0d961
Output: nBytes=1116 nLines=50 sha1sum=f8abef74e7d7de1592ea9ba888886e21befa762f
Note the Input has wrong byte count and sha1sum, probably because the
gzip library is pre-reading the file header to determine the format.
Correctly supporting tell() and seek() on class ShaInputFile is left
as an exercise for the reader...
#!/usr/local/bin/python2.7
import gzip
import hashlib
import shutil
import sys
class ShaInputFile(object):
    """Read-only file wrapper that SHA-1 digests the bytes read through it.

    Consumers such as gzip.GzipFile may seek backwards on the wrapped file
    and read the same bytes again.  To keep the digest equal to the digest
    of the underlying data, every stream offset is hashed at most once: the
    wrapper remembers the highest offset already digested and ignores any
    re-read data below it.

    Limitation: bytes that are skipped with a forward seek and only read
    later (after data beyond them has been hashed) are not digested.

    fin must be a file-like object providing read(), seek() and tell();
    passing None raises AttributeError.
    """

    def __init__(self, fin=None):
        self._fin = fin
        self._sha = hashlib.sha1()
        try:
            start = fin.tell()
        except (IOError, OSError):
            # Unseekable stream (e.g. a pipe): count offsets from zero.
            start = 0
        self._pos = start      # current offset in the wrapped stream
        self._hashed = start   # offsets below this are already digested
        self._nbytes = 0       # number of bytes fed to the digest

    def tell(self):
        """Return the current position of the wrapped file."""
        return self._fin.tell()

    def seek(self, offset, whence=0):
        """Reposition the wrapped file and resynchronise the offset tracker.

        Returns whatever the wrapped file's seek() returns.
        """
        result = self._fin.seek(offset, whence)
        self._pos = self._fin.tell()
        return result

    def read(self, *nbytes):
        """Read from the wrapped file, digesting only not-yet-seen bytes."""
        buf = self._fin.read(*nbytes)
        if buf:
            end = self._pos + len(buf)
            if end > self._hashed:
                # Drop the prefix that was already digested before a
                # backward seek; after a forward seek nothing is dropped.
                fresh = buf[max(self._hashed - self._pos, 0):]
                self._sha.update(fresh)
                self._nbytes += len(fresh)
                self._hashed = end
            self._pos = end
        return buf

    def done(self):
        """Print the number of digested input bytes and their SHA-1."""
        # Single parenthesised argument: same output on Python 2 and 3.
        print('Input: nBytes=%i sha1sum=%s' % (
            self._nbytes, self._sha.hexdigest()))
class ShaOutputFile(object):
    """Write-only sink that discards data while keeping statistics on it.

    Every buffer passed to write() is fed to a SHA-1 digest and counted
    (bytes and newline characters); nothing is stored.  done() prints the
    accumulated totals.
    """

    def __init__(self):
        self._sha = hashlib.sha1()
        self._nbytes = 0
        self._nlines = 0

    def write(self, buf):
        """Account for one chunk of output bytes."""
        self._sha.update(buf)
        self._nbytes += len(buf)
        # b'\n' is a plain str on Python 2 and bytes on Python 3, so the
        # count works for the byte strings produced on either version.
        self._nlines += buf.count(b'\n')

    def done(self):
        """Print the byte count, line count and SHA-1 of everything written."""
        # Single parenthesised argument: same output on Python 2 and 3.
        print('Output: nBytes=%i nLines=%i sha1sum=%s' % (
            self._nbytes, self._nlines, self._sha.hexdigest()))
def main():
    """Gunzip stdin once, reporting statistics for both sides of the pipe.

    The compressed stream is read through ShaInputFile and the decompressed
    data is written to ShaOutputFile, so neither a temporary file nor a
    full in-memory copy is needed.
    """
    # Python 3 exposes the byte stream as sys.stdin.buffer; on Python 2
    # sys.stdin has no such attribute and is used directly.
    raw_stdin = getattr(sys.stdin, 'buffer', sys.stdin)
    infile = ShaInputFile(raw_stdin)
    xfile = gzip.GzipFile(fileobj=infile)
    outfile = ShaOutputFile()
    shutil.copyfileobj(xfile, outfile)
    infile.done()
    outfile.done()


# Guarded so that importing this module does not consume stdin.
if __name__ == '__main__':
    main()
#--#
More information about the CentralOH
mailing list