-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmake_PSSM.py
executable file
·115 lines (81 loc) · 3.48 KB
/
make_PSSM.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import os
import sys
import gzip
import argparse
import pickle
import pandas as pd
from matrix2 import buildPSSM_alphabet
dir_output_sPSSMs = 'outputs/sPSSMs'
def mapAligns(gapped_fam_dict, alphamap):
"""convert alignments in the new alphabet"""
for RF in gapped_fam_dict:
# print(RF)
for ID in gapped_fam_dict[RF]:
if gapped_fam_dict[RF][ID].get('bear'):
gapped_fam_dict[RF][ID]['alpha'] = "".join(
[alphamap[ch] for ch in gapped_fam_dict.get(RF).get(ID).get('bear')]
)
mbrVersion = sys.argv[1]
MBR = sys.argv[2]
ALPHAMAP = sys.argv[3]
GAPFAMDICT = sys.argv[4]
ignore_these_families = [
#'RF00210', 'RF01879', #RF is missing
#'RF02767', 'RF02768', 'RF02770', 'RF02773', 'RF02775', 'RF02781', 'RF02783'
]
parser = argparse.ArgumentParser()
parser.add_argument("-m", "--mbr", help="the substitution matrix to test")
parser.add_argument("-a", "--alpha", help="pickle of alphabet dictionary (must correspond to the alphabet \
used with the substitution matrix). With respect to standard BEAR")
parser.add_argument("-gfd", "--gapfamdict", help="the rfam alignments with gaps and bear pickle (gzipped)")
parser.add_argument("-v", "--verbose", help="increase output verbosity",
action="store_true")
args = parser.parse_args(args=['--alpha', ALPHAMAP, '-v', '-m', MBR, '-gfd', GAPFAMDICT])
with gzip.open(args.gapfamdict, 'rb') as afile:
gapped_fam_dict = pickle.load(afile)
# Prepare the alphabets, adding the gap symbol
f = open(args.alpha).readlines()
alph_bear = {x.split()[0]: x.split()[1] for x in f}
alph_bear['-'] = '-'
alphamaps = [{key: alph_bear[KEY] for key in KEY} for KEY in alph_bear]
alphamap = {}
for ap in alphamaps:
alphamap.update(ap)
mbr = pd.read_table(args.mbr, index_col=0)
mapAligns(gapped_fam_dict, alphamap)
rfams = {}
num_families_div_10 = len(gapped_fam_dict) // 10
for num_fam, RF in enumerate(gapped_fam_dict):
# if num_fam % num_families_div_10 == 0:
# print('{}% ({}/{})'.format((num_fam // num_families_div_10) * 10, num_fam, len(gapped_fam_dict)))
if RF not in ignore_these_families:
rfams[RF] = [
[gapped_fam_dict[RF][ID]['sequence'] for ID in gapped_fam_dict[RF]],
[gapped_fam_dict[RF][ID]['alpha'] for ID in gapped_fam_dict[RF]]
]
PSSMs_alpha = []
rfam_list = []
num_families_div_20 = len(rfams) // 20
# use RFAMS
print('Build PSSM: STARTED')
for num_fam, rfam in enumerate(rfams):
if num_fam % num_families_div_20 == 0:
print('{:.2f}% ({}/{})'.format((num_fam / len(rfams)) * 100.0, num_fam, len(rfams)))
rfam_list.append(rfam)
PSSM_b = buildPSSM_alphabet(rfams[rfam], mbr)
PSSMs_alpha.append(PSSM_b)
print('Build PSSM: DONE!')
dir_output_sPSSMs_name_id = os.path.join(dir_output_sPSSMs, mbrVersion)
if not os.path.exists(dir_output_sPSSMs_name_id):
os.makedirs(dir_output_sPSSMs_name_id)
pickle_path = os.path.join(dir_output_sPSSMs_name_id, 'rfam_PSSM_{}.pickle.gz'.format(mbrVersion))
with gzip.open(pickle_path, 'wb') as handle:
pickle.dump(PSSMs_alpha, handle)
print('File {} created.'.format(pickle_path))
dic_PSSM = {}
for i in range(0, len(rfam_list)):
dic_PSSM[rfam_list[i]] = PSSMs_alpha[i]
dic_pickle_path = os.path.join(dir_output_sPSSMs_name_id, 'rfam_PSSM_dic_{}.pickle.gz'.format(mbrVersion))
with gzip.open(dic_pickle_path, 'wb') as handle:
pickle.dump(dic_PSSM, handle)
print('File {} created.'.format(dic_pickle_path))