[pypy-issue] Issue #2271: Decompressing takes significantly longer than on CPython (pypy/pypy)
John Longinotto
issues-reply at bitbucket.org
Thu Apr 7 08:04:53 EDT 2016
New issue 2271: Decompressing takes significantly longer than on CPython
https://bitbucket.org/pypy/pypy/issues/2271/decompressing-takes-significantly-longer
John Longinotto:
The following code takes about 9.5 minutes on PyPy, but only 42 seconds on CPython:
```
#!python
import sys
import zlib
import struct
def bgzip(data,blocks_at_a_time=1):
if type(data) == str: d = open(data,'rb')
else: d = data
cache = ''
bytes_read = 0
magic = d.read(4)
blocks_left_to_grab = blocks_at_a_time
while magic:
if not magic: break # a child's heart
bytes_read += 4
if magic != "\x1f\x8b\x08\x04": print "ERROR: The input file is not in a format I understand :("; exit()
header_data = magic + d.read(8)
header_size = 12
extra_len = struct.unpack("<H", header_data[-2:])[0]
while header_size-12 < extra_len:
header_data += d.read(4)
bytes_read += 4
subfield_id = header_data[-4:-2]
subfield_len = struct.unpack("<H", header_data[-2:])[0]
subfield_data = d.read(subfield_len); bytes_read += subfield_len
header_data += subfield_data
header_size += subfield_len + 4
if subfield_id == 'BC':
block_size = struct.unpack("<H", subfield_data)[0]
raw_data = d.read(block_size - extra_len - 19); bytes_read += (block_size-extra_len-19)
crc_data = d.read(8); bytes_read += 8
zipped_data = header_data + raw_data + crc_data
unzipped_data = zlib.decompress(zipped_data,31) # Could parallize this in a worker poolchen
expected_crc = crc_data[:4]
expected_size = struct.unpack("<I", crc_data[4:])[0]
if len(unzipped_data) != expected_size: print 'ERROR: Failed to unpack due to a Type 1 CRC error. Could the BAM be corrupted?'; exit()
crc = zlib.crc32(unzipped_data)
if crc < 0: crc = struct.pack("<i", crc)
else: crc = struct.pack("<I", crc)
if expected_crc != crc: print 'ERROR: Failed to unpack due to a Type 2 CRC error. Could the BAM be corrupted?'; exit()
magic = d.read(4)
if len(unzipped_data) > 0:
cache += unzipped_data
blocks_left_to_grab -= 1
if blocks_left_to_grab == 0:
yield cache
cache = ''
blocks_left_to_grab = blocks_at_a_time
if cache != '': yield cache
d.close()
data_generator = bgzip(sys.argv[-1],blocks_at_a_time=300)
for block in data_generator: pass
```
Run via: the_code.py ./ENCFF001LCU.bam
The input file ENCFF001LCU.bam can be downloaded from https://www.encodeproject.org/files/ENCFF001LCU/@@download/ENCFF001LCU.bam
More information about the pypy-issue
mailing list