[pypy-issue] Issue #2271: Decompressing takes significantly longer than on CPython (pypy/pypy)

John Longinotto issues-reply at bitbucket.org
Thu Apr 7 08:04:53 EDT 2016


New issue 2271: Decompressing takes significantly longer than on CPython
https://bitbucket.org/pypy/pypy/issues/2271/decompressing-takes-significantly-longer

John Longinotto:

The following code takes about 9.5 minutes on PyPy, but only 42 seconds on CPython:


```
#!python
import sys
import zlib
import struct

def bgzip(data, blocks_at_a_time=1):
    # Walk a BGZF (blocked-gzip) file such as a BAM: parse each gzip block's header,
    # decompress the block, and yield the concatenated output of `blocks_at_a_time`
    # blocks at a time.
    if type(data) == str: d = open(data,'rb')
    else: d = data
    cache = ''
    bytes_read = 0
    magic = d.read(4)
    blocks_left_to_grab = blocks_at_a_time
    while magic:
        if not magic: break # a child's heart
        bytes_read += 4
        # BGZF blocks start with the gzip magic bytes, the deflate method and the FEXTRA flag
        if magic != "\x1f\x8b\x08\x04": print "ERROR: The input file is not in a format I understand :("; exit()
        header_data = magic + d.read(8)
        header_size = 12
        extra_len = struct.unpack("<H", header_data[-2:])[0]
        # The FEXTRA field is a series of subfields: 2-byte ID, 2-byte length, then the data
        while header_size-12 < extra_len:
            header_data += d.read(4)
            bytes_read += 4
            subfield_id = header_data[-4:-2]
            subfield_len = struct.unpack("<H", header_data[-2:])[0]
            subfield_data = d.read(subfield_len); bytes_read += subfield_len
            header_data += subfield_data
            header_size += subfield_len + 4
            if subfield_id == 'BC':
                block_size = struct.unpack("<H", subfield_data)[0] # BSIZE: total block size minus one
        # Compressed payload = (BSIZE + 1) - 12-byte base header - extra field - 8-byte trailer
        raw_data = d.read(block_size - extra_len - 19); bytes_read += (block_size-extra_len-19)
        crc_data = d.read(8); bytes_read += 8 # trailer: CRC32 + ISIZE (uncompressed length)
        zipped_data = header_data + raw_data + crc_data
        unzipped_data = zlib.decompress(zipped_data,31) # Could parallelize this in a worker pool
        expected_crc = crc_data[:4]
        expected_size = struct.unpack("<I", crc_data[4:])[0]
        if len(unzipped_data) != expected_size: print 'ERROR: Failed to unpack due to a Type 1 CRC error. Could the BAM be corrupted?'; exit()
        crc = zlib.crc32(unzipped_data)
        if crc < 0: crc = struct.pack("<i", crc) # Python 2's crc32 can return a signed value
        else:       crc = struct.pack("<I", crc)
        if expected_crc != crc: print 'ERROR: Failed to unpack due to a Type 2 CRC error. Could the BAM be corrupted?'; exit()
        magic = d.read(4)

        if len(unzipped_data) > 0:
            cache += unzipped_data
            blocks_left_to_grab -= 1
        if blocks_left_to_grab == 0:
            yield cache
            cache = ''
            blocks_left_to_grab = blocks_at_a_time

    if cache != '': yield cache
    d.close()

data_generator = bgzip(sys.argv[-1], blocks_at_a_time=300)
for block in data_generator: pass
```

Run via: `pypy the_code.py ./ENCFF001LCU.bam` (and `python the_code.py ./ENCFF001LCU.bam` for the CPython comparison)
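
A minimal sketch of how the run can be timed from inside the script (the `time.time()` wrapper is added here for illustration, not necessarily how the numbers above were measured, and it assumes the `bgzip()` function above is defined in the same file):

```
#!python
# Timing sketch: with bgzip() from above in the same file, time the generator
# loop under CPython and under PyPy and print the elapsed wall-clock time.
import sys
import time

start = time.time()
for block in bgzip(sys.argv[-1], blocks_at_a_time=300):
    pass
print("Finished %s in %.1f seconds" % (sys.argv[-1], time.time() - start))
```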

The input file ENCFF001LCU.bam can be downloaded from https://www.encodeproject.org/files/ENCFF001LCU/@@download/ENCFF001LCU.bam
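
Since the slowdown being reported is in decompression, the following micro-benchmark sketch times `zlib.decompress` on its own, with no file I/O or header parsing. It is independent of the BAM file and of the script above; the ~64 KiB payload and the iteration count are arbitrary choices, and it uses the plain zlib wrapper rather than the gzip wrapper (wbits=31) that BGZF uses:

```
#!python
# Micro-benchmark sketch: repeatedly decompress one block-sized payload so the
# cost of zlib.decompress can be compared between CPython and PyPy in isolation.
import os
import time
import zlib

payload = os.urandom(1024) * 64      # ~64 KiB, roughly one BGZF block's worth of data
compressed = zlib.compress(payload)  # plain zlib wrapper, not the gzip wrapper BGZF uses

start = time.time()
for _ in range(10000):
    unzipped = zlib.decompress(compressed)
assert unzipped == payload
print("10000 calls to zlib.decompress took %.2f seconds" % (time.time() - start))
```

Running the same snippet under both interpreters gives a decompression-only baseline to compare against the full-script timings.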



