[Tutor] Huge list comprehension

syed zaidi syedzaidi85 at hotmail.co.uk
Mon Jun 5 16:06:01 EDT 2017


hi,

I would appreciate if you can help me suggesting a quick and efficient strategy for comparing multiple lists with one principal list

I have about 125 lists containing about 100,000 numerical entries in each

my principal list contains about 6 million entries.

I want to compare each small list with main list and append yes/no or 0/1 in each new list corresponding to each of 125 lists


The program is working but it takes ages to process huge files,
Can someone pleases tell me how can I make this process fast. Right now it takes arounf 2 weeks to complete this task


the code I have written and is working is as under:


sample_name = []

main_op_list,principal_list = [],[]
dictionary = {}

with open("C:/Users/INVINCIBLE/Desktop/T2D_ALL_blastout_batch.txt", 'r') as f:
    reader = csv.reader(f, dialect = 'excel', delimiter='\t')
    list2 = filter(None, reader)
    for i in range(len(list2)):
        col1 = list2[i][0]
        operon = list2[i][1]
        main_op_list.append(operon)
        col1 = col1.strip().split("_")
        sample_name = col1[0]
        if dictionary.get(sample_name):
            dictionary[sample_name].append(operon)
        else:
            dictionary[sample_name] = []
            dictionary[sample_name].append(operon)
locals().update(dictionary) ## converts dictionary keys to variables
##print DLF004
dict_values = dictionary.values()
dict_keys = dictionary.keys()
print dict_keys
print len(dict_keys)
main_op_list_np = np.array(main_op_list)

DLF002_1,DLF004_1,DLF005_1,DLF006_1,DLF007_1,DLF008_1,DLF009_1,DLF010_1,DLF012_1,DLF013_1,DLF014_1,DLM001_1,DLM002_1,DLM003_1,DLM004_1,DLM005_1,DLM006_1,DLM009_1,DLM011_1,DLM012_1,DLM018_1,DOF002_1,DOF003_1 =[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]
DOF004_1,DOF006_1,DOF007_1,DOF008_1,DOF009_1,DOF010_1,DOF011_1,DOF012_1,DOF013_1,DOF014_1,DOM001_1,DOM003_1,DOM005_1,DOM008_1,DOM010_1,DOM012_1,DOM013_1,DOM014_1,DOM015_1,DOM016_1,DOM017_1,DOM018_1,DOM019_1 =[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]
DOM020_1,DOM021_1,DOM022_1,DOM023_1,DOM024_1,DOM025_1,DOM026_1 = [],[],[],[],[],[],[]
NLF001_1,NLF002_1,NLF005_1,NLF006_1,NLF007_1,NLF008_1,NLF009_1,NLF010_1,NLF011_1,NLF012_1,NLF013_1,NLF014_1,NLF015_1,NLM001_1,NLM002_1,NLM003_1,NLM004_1,NLM005_1,NLM006_1,NLM007_1,NLM008_1,NLM009_1,NLM010_1 =[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]
NLM015_1,NLM016_1,NLM017_1,NLM021_1,NLM022_1,NLM023_1,NLM024_1,NLM025_1,NLM026_1,NLM027_1,NLM028_1,NLM029_1,NLM031_1,NLM032_1,NOF001_1,NOF002_1,NOF004_1,NOF005_1,NOF006_1,NOF007_1,NOF008_1,NOF009_1,NOF010_1 =[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]
NOF011_1,NOF012_1,NOF013_1,NOF014_1,NOM001_1,NOM002_1,NOM004_1,NOM005_1,NOM007_1,NOM008_1,NOM009_1,NOM010_1,NOM012_1,NOM013_1,NOM015_1,NOM016_1,NOM017_1,NOM018_1,NOM019_1,NOM020_1,NOM022_1,NOM023_1,NOM025_1 =[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]
NOM026_1,NOM027_1,NOM028_1,NOM029_1 = [],[],[],[]


for i in main_op_list_np:
    if i in DLF002: DLF002_1.append('1')
    else:DLF002_1.append('0')
    if i in DLF004: DLF004_1.append('1')
    else:DLF004_1.append('0')
    if i in DLF005: DLF005_1.append('1')
    else:DLF005_1.append('0')
    if i in DLF006: DLF006_1.append('1')
    else:DLF006_1.append('0')
    if i in DLF007: DLF007_1.append('1')
    else:DLF007_1.append('0')
    if i in DLF008: DLF008_1.append('1')
    else:DLF008_1.append('0')
##   if main_op_list[i] in DLF009: DLF009_1.append('1')
 ##   else:DLF009_1.append('0')
    if i in DLF010: DLF010_1.append('1')
    else:DLF010_1.append('0')
    if i in DLF012: DLF012_1.append('1')
    else:DLF012_1.append('0')
    if i in DLF013: DLF013_1.append('1')
    else:DLF013_1.append('0')
    if i in DLF014: DLF014_1.append('1')
    else:DLF014_1.append('0')
    if i in DLM001: DLM001_1.append('1')
    else:DLM001_1.append('0')
    if i in DLM002: DLM002_1.append('1')
    else:DLM002_1.append('0')
    if i in DLM003: DLM003_1.append('1')
    else:DLM003_1.append('0')
    if i in DLM004: DLM004_1.append('1')
    else:DLM004_1.append('0')
    if i in DLM005: DLM005_1.append('1')
    else:DLM005_1.append('0')
    if i in DLM006: DLM006_1.append('1')
    else:DLM006_1.append('0')
    if i in DLM009: DLM009_1.append('1')
    else:DLM009_1.append('0')
    if i in DLM011: DLM011_1.append('1')
    else:DLM011_1.append('0')
    if i in DLM012: DLM012_1.append('1')
    else:DLM012_1.append('0')
    if i in DLM018: DLM018_1.append('1')
    else:DLM018_1.append('0')
    if i in DOF002: DOF002_1.append('1')
    else:DOF002_1.append('0')
    if i in DOF003: DOF003_1.append('1')
    else:DOF003_1.append('0')
    if i in DOF004: DOF004_1.append('1')
    else:DOF004_1.append('0')
    if i in DOF006: DOF006_1.append('1')
    else:DOF006_1.append('0')
    if i in DOF007: DOF007_1.append('1')
    else:DOF007_1.append('0')
    if i in DOF008: DOF008_1.append('1')
    else:DOF008_1.append('0')
    if i in DOF009: DOF009_1.append('1')
    else:DOF009_1.append('0')
    if i in DOF010: DOF010_1.append('1')
    else:DOF010_1.append('0')
    if i in DOF011: DOF011_1.append('1')
    else:DOF011_1.append('0')
    if i in DOF012: DOF012_1.append('1')
    else:DOF012_1.append('0')
    if i in DOF013: DOF013_1.append('1')
    else:DOF013_1.append('0')
    if i in DOF014: DOF014_1.append('1')
    else:DOF014_1.append('0')
    if i in DOM001: DOM001_1.append('1')
    else:DOM001_1.append('0')
    if i in DOM003: DOM003_1.append('1')
    else:DOM003_1.append('0')
    if i in DOM005: DOM005_1.append('1')
    else:DOM005_1.append('0')
    if i in DOM008: DOM008_1.append('1')
    else:DOM008_1.append('0')
    if i in DOM010: DOM010_1.append('1')
    else:DOM010_1.append('0')
    if i in DOM012: DOM012_1.append('1')
    else:DOM012_1.append('0')
    if i in DOM013: DOM013_1.append('1')
    else:DOM013_1.append('0')
    if i in DOM014: DOM014_1.append('1')
    else:DOM014_1.append('0')
    if i in DOM015: DOM015_1.append('1')
    else:DOM015_1.append('0')
    if i in DOM016: DOM016_1.append('1')
    else:DOM016_1.append('0')
    if i in DOM017: DOM017_1.append('1')
    else:DOM017_1.append('0')
    if i in DOM018: DOM018_1.append('1')
    else:DOM018_1.append('0')
    if i in DOM019: DOM019_1.append('1')
    else:DOM019_1.append('0')
    if i in DOM020: DOM020_1.append('1')
    else:DOM020_1.append('0')
    if i in DOM021: DOM021_1.append('1')
    else:DOM021_1.append('0')
    if i in DOM022: DOM022_1.append('1')
    else:DOM022_1.append('0')
    if i in DOM023: DOM023_1.append('1')
    else:DOM023_1.append('0')
    if i in DOM024: DOM024_1.append('1')
    else:DOM024_1.append('0')
    if i in DOM025: DOM025_1.append('1')
    else:DOM025_1.append('0')
    if i in DOM026: DOM026_1.append('1')
    else:DOM026_1.append('0')
    if i in NLF001: NLF001_1.append('        |        1')
    else:NLF001_1.append('        |        0')
    if i in NLF002: NLF002_1.append('1')
    else:NLF002_1.append('0')
    if i in NLF005: NLF005_1.append('1')
    else:NLF005_1.append('0')
    if i in NLF006: NLF006_1.append('1')
    else:NLF006_1.append('0')
    if i in NLF007: NLF007_1.append('1')
    else:NLF007_1.append('0')
    if i in NLF008: NLF008_1.append('1')
    else:NLF008_1.append('0')
    if i in NLF009: NLF009_1.append('1')
    else:NLF009_1.append('0')
    if i in NLF010: NLF010_1.append('1')
    else:NLF010_1.append('0')
    if i in NLF011: NLF011_1.append('1')
    else:NLF011_1.append('0')
    if i in NLF012: NLF012_1.append('1')
    else:NLF012_1.append('0')
    if i in NLF013: NLF013_1.append('1')
    else:NLF013_1.append('0')
    if i in NLF014: NLF014_1.append('1')
    else:NLF014_1.append('0')
    if i in NLF015: NLF015_1.append('1')
    else:NLF015_1.append('0')
    if i in NLM001: NLM001_1.append('1')
    else:NLM001_1.append('0')
    if i in NLM002: NLM002_1.append('1')
    else:NLM002_1.append('0')
    if i in NLM003: NLM003_1.append('1')
    else:NLM003_1.append('0')
    if i in NLM004: NLM004_1.append('1')
    else:NLM004_1.append('0')
    if i in NLM005: NLM005_1.append('1')
    else:NLM005_1.append('0')
    if i in NLM006: NLM006_1.append('1')
    else:NLM006_1.append('0')
    if i in NLM007: NLM007_1.append('1')
    else:NLM007_1.append('0')
    if i in NLM008: NLM008_1.append('1')
    else:NLM008_1.append('0')
    if i in NLM009: NLM009_1.append('1')
    else:NLM009_1.append('0')
    if i in NLM010: NLM010_1.append('1')
    else:NLM010_1.append('0')
    if i in NLM015: NLM015_1.append('1')
    else:NLM015_1.append('0')
    if i in NLM016: NLM016_1.append('1')
    else:NLM016_1.append('0')
    if i in NLM017: NLM017_1.append('1')
    else:NLM017_1.append('0')
    if i in NLM021: NLM021_1.append('1')
    else:NLM021_1.append('0')
    if i in NLM022: NLM022_1.append('1')
    else:NLM022_1.append('0')
    if i in NLM023: NLM023_1.append('1')
    else:NLM023_1.append('0')
    if i in NLM024: NLM024_1.append('1')
    else:NLM024_1.append('0')
    if i in NLM025: NLM025_1.append('1')
    else:NLM025_1.append('0')
    if i in NLM026: NLM026_1.append('1')
    else:NLM026_1.append('0')
    if i in NLM027: NLM027_1.append('1')
    else:NLM027_1.append('0')
    if i in NLM028: NLM028_1.append('1')
    else:NLM028_1.append('0')
    if i in NLM029: NLM029_1.append('1')
    else:NLM029_1.append('0')
    if i in NLM031: NLM031_1.append('1')
    else:NLM031_1.append('0')
    if i in NLM032: NLM032_1.append('1')
    else:NLM032_1.append('0')
    if i in NOF001: NOF001_1.append('1')
    else:NOF001_1.append('0')
    if i in NOF002: NOF002_1.append('1')
    else:NOF002_1.append('0')
    if i in NOF004: NOF004_1.append('1')
    else:NOF004_1.append('0')
    if i in NOF005: NOF005_1.append('1')
    else:NOF005_1.append('0')
    if i in NOF006: NOF006_1.append('1')
    else:NOF006_1.append('0')
    if i in NOF007: NOF007_1.append('1')
    else:NOF007_1.append('0')
    if i in NOF008: NOF008_1.append('1')
    else:NOF008_1.append('0')
    if i in NOF009: NOF009_1.append('1')
    else:NOF009_1.append('0')
    if i in NOF010: NOF010_1.append('1')
    else:NOF010_1.append('0')
    if i in NOF011: NOF011_1.append('1')
    else:NOF011_1.append('0')
    if i in NOF012: NOF012_1.append('1')
    else:NOF012_1.append('0')
    if i in NOF013: NOF013_1.append('1')
    else:NOF013_1.append('0')
    if i in NOF014: NOF014_1.append('1')
    else:NOF014_1.append('0')
    if i in NOM001: NOM001_1.append('1')
    else:NOM001_1.append('0')
    if i in NOM002: NOM002_1.append('1')
    else:NOM002_1.append('0')
    if i in NOM004: NOM004_1.append('1')
    else:NOM004_1.append('0')
    if i in NOM005: NOM005_1.append('1')
    else:NOM005_1.append('0')
    if i in NOM007: NOM007_1.append('1')
    else:NOM007_1.append('0')
    if i in NOM008: NOM008_1.append('1')
    else:NOM008_1.append('0')
    if i in NOM009: NOM009_1.append('1')
    else:NOM009_1.append('0')
    if i in NOM010: NOM010_1.append('1')
    else:NOM010_1.append('0')
    if i in NOM012: NOM012_1.append('1')
    else:NOM012_1.append('0')
    if i in NOM013: NOM013_1.append('1')
    else:NOM013_1.append('0')
    if i in NOM015: NOM015_1.append('1')
    else:NOM015_1.append('0')
    if i in NOM016: NOM016_1.append('1')
    else:NOM016_1.append('0')
    if i in NOM017: NOM017_1.append('1')
    else:NOM017_1.append('0')
    if i in NOM018: NOM018_1.append('1')
    else:NOM018_1.append('0')
    if i in NOM019: NOM019_1.append('1')
    else:NOM019_1.append('0')
    if i in NOM020: NOM020_1.append('1')
    else:NOM020_1.append('0')
    if i in NOM022: NOM022_1.append('1')
    else:NOM022_1.append('0')
    if i in NOM023: NOM023_1.append('1')
    else:NOM023_1.append('0')
    if i in NOM025: NOM025_1.append('1')
    else:NOM025_1.append('0')
    if i in NOM026: NOM026_1.append('1')
    else:NOM026_1.append('0')
    if i in NOM027: NOM027_1.append('1')
    else:NOM027_1.append('0')
    if i in NOM028: NOM028_1.append('1')
    else:NOM028_1.append('0')
    if i in NOM029: NOM029_1.append('1')
    else:NOM029_1.append('0')



##
print 'saving'
zoo = zip(main_op_list, DLF002_1,DLF004_1,DLF005_1,DLF006_1,DLF007_1,DLF008_1,DLF009_1,DLF010_1,DLF012_1,DLF013_1,DLF014_1,DLM001_1,DLM002_1,DLM003_1,DLM004_1,DLM005_1,DLM006_1,DLM009_1,DLM011_1,DLM012_1,DLM018_1,DOF002_1,DOF003_1,DOF004_1,DOF006_1,DOF007_1,DOF008_1,DOF009_1,DOF010_1,DOF011_1,DOF012_1,DOF013_1,DOF014_1,DOM001_1,DOM003_1,DOM005_1,DOM008_1,DOM010_1,DOM012_1,DOM013_1,DOM014_1,DOM015_1,DOM016_1,DOM017_1,DOM018_1,DOM019_1,DOM020_1,DOM021_1,DOM022_1,DOM023_1,DOM024_1,DOM025_1,DOM026_1,NLF001_1,NLF002_1,NLF005_1,NLF006_1,NLF007_1,NLF008_1,NLF009_1,NLF010_1,NLF011_1,NLF012_1,NLF013_1,NLF014_1,NLF015_1,NLM001_1,NLM002_1,NLM003_1,NLM004_1,NLM005_1,NLM006_1,NLM007_1,NLM008_1,NLM009_1,NLM010_1,NLM015_1,NLM016_1,NLM017_1,NLM021_1,NLM022_1,NLM023_1,NLM024_1,NLM025_1,NLM026_1,NLM027_1,NLM028_1,NLM029_1,NLM031_1,NLM032_1,NOF001_1,NOF002_1,NOF004_1,NOF005_1,NOF006_1,NOF007_1,NOF008_1,NOF009_1,NOF010_1,NOF011_1,NOF012_1,NOF013_1,NOF014_1,NOM001_1,NOM002_1,NOM004_1,NOM005_1,NOM007_1,NOM008_1,NOM009_1,NOM010_1,NOM012_1,NOM013_1,NOM015_1,NOM016_1,NOM017_1,NOM018_1,NOM019_1,NOM020_1,NOM022_1,NOM023_1,NOM025_1,NOM026_1,NOM027_1,NOM028_1,NOM029_1)
with open("test.tab", 'w+') as outfile:
    writer =csv.writer(outfile, delimiter = '\t', lineterminator = '\n')
    writer.writerow(['           ','DLF2','DLF4','DLF5','DLF6','DLF7','DLF8','DLF9','DLF10','DLF12','DLF13','DLF14','DLM1','DLM2','DLM3','DLM4','DLM5','DLM6','DLM9','DLM11','DLM12','DLM18','DOF2','DOF3','DOF4','DOF6','DOF7','DOF8','DOF9','DOF10','DOF11','DOF12','DOF13','DOF04','DOM1','DOM3','DOM5','DOM8','DOM10','DOM12','DOM13','DOM14','DOM15','DOM16','DOM17','DOM18','DOM19','DOM20','DOM21','DOM22','DOM23','DOM24','DOM25','DOM26','NLF1','NLF2','NLF5','NLF6','NLF7','NLF8','NLF9','NLF10','NLF11','NLF12','NLF13','NLF14','NLF15','NLM1','NLM2','NLM3','NLM4','NLM5','NLM6','NLM7','NLM8','NLM9','NLM10','NLM15','NLM16','NLM17','NLM21','NLM22','NLM23','NLM24','NLM25','NLM26','NLM27','NLM28','NLM29','NLM31','NLM32','NOF1','NOF2','NOF4','NOF5','NOF6','NOF7','NOF8','NOF9','NOF10','NOF11','NOF12','NOF13','NOF14','NOM1','NOM2','NOM4','NOM5','NOM7','NOM8','NOM9','NOM10','NOM12','NOM13','NOM15','NOM16','NOM17','NOM18','NOM19','NOM20','NOM22','NOM23','NOM25','NOM26','NOM27','NOM28','NOM29'])
    writer.writerows(zoo)
outfile.close()
print 'done'
end_time = time.time()
elapsed = end_time-start_time
print "Time elapsed.", elapsed


Thanks

Best Regards


Syed Shujaat Ali Zaidi
PhD Scholar (Bioinformatics)
MOE Key Laboratory of Bioinformatics
Bioinformatics Division, TNLIST & Department of Automation
FIT 1-107, Tsinghua University, Beijing 100084, China

Lecturer (Bioinformatics)
Department of Bio Sciences
COMSATS Institute of Information Technology
Islamabad, Pakistan


More information about the Tutor mailing list