-
Notifications
You must be signed in to change notification settings - Fork 9
/
ab_haddock_format.py
200 lines (156 loc) · 6.21 KB
/
ab_haddock_format.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2020:
# Francesco Ambrosetti
#
"""
Formats the antibody to fit the HADDOCK requirements with the
specified chain id and returns the list of residues belonging
to the HV loops defined according to the HADDOCK friendly format.
*** The antibody has to be numbered according to the Chothia scheme ***
Usage:
python haddock-format.py <chothia numbered antibody> <output .pdb file> <chain_id>
Example:
python 4G6K_ch.pdb 4G6K-HADDOCK.pdb A
Author: {0}
Email: {1}
"""
import argparse
import biopandas.pdb as bp
import copy as cp
import os
import sys
from pathlib import Path
__author__ = "Francesco Ambrosetti"
__email__ = "[email protected]"
USAGE = __doc__.format(__author__, __email__)
def check_input():
"""
Check and collect the script inputs
Returns:
args.pdb (str): path to the pdb-file
args.chain (str): chain id to use for the HADDOCK-formatted structure
"""
# Parse command line arguments
parser = argparse.ArgumentParser(
description=USAGE,
formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument('pdb', help='Path to the Chothia numbered antibody PDB structure', type=str)
parser.add_argument('out', help='Path to the output PDB file', type=str)
parser.add_argument('chain', help='Chain id to use for the HADDOCK-formatted PDB structure', type=str)
args = parser.parse_args()
if not os.path.isfile(args.pdb):
emsg = 'ERROR!! File {0} not found or not readable\n'.format(args.pdb)
sys.stderr.write(emsg)
sys.exit(1)
if not args.pdb.endswith(".pdb"):
emsg = 'ERROR!! File {0} not recognize as a PDB file\n'.format(args.pdb)
sys.stderr.write(emsg)
sys.exit(1)
return args.pdb, args.out, args.chain
def unique(sequence):
seen = set()
return [x for x in sequence if not (x in seen or seen.add(x))]
class AbHaddockFormat:
"""Class to renumber a Chothia antibody to make it HADDOCK-ready"""
# Loops
# L chain loops
l1 = ['26_L', '27_L', '28_L', '29_L', '30_L', '30A_L', '30B_L', '30C_L', '30D_L', '30E_L', '30F_L', '31_L', '32_L']
l2 = ['50_L', '50A_L', '50B_L', '50C_L', '50D_L', '50E_L', '50F_L', '51_L', '52_L']
l3 = ['91_L', '92_L', '93_L', '94_L', '95_L', '95A_L', '95B_L', '95C_L', '95D_L', '95E_L', '95F_L', '96_L']
loops_l = l1 + l2 + l3
# H chain loops
h1 = ['26_H', '27_H', '28_H', '29_H', '30_H', '31_H', '31A_H', '31B_H', '31C_H', '31D_H', '31E_H', '31F_H', '32_H']
h2 = ['52A_H', '52B_H', '52C_H', '52D_H', '52E_H', '52F_H', '53_H', '54_H', '55_H']
h3 = ['96_H', '97_H', '98_H', '99_H', '100_H', '100A_H', '100B_H', '100C_H', '100D_H', '100E_H', '100F_H', '100G_H',
'100H_H', '100I_H', '100J_H', '100K_H', '101_H']
loops_h = h1 + h2 + h3
def __init__(self, pdbfile, chain):
"""
Constructor for the AbHaddockFormat class
Args:
pdbfile (str): path to the antibody .pdb file
chain (str): chain id to use for the HADDOCK-ready structure
"""
self.file = pdbfile
self.pdb = bp.PandasPdb().read_pdb(self.file)
self.chain = chain
def check_chain(self):
"""
Check if the antibody contains the light and heavy chain
Returns:
0
"""
chain_ids = self.pdb.df['ATOM']['chain_id'].values
if 'H' not in chain_ids:
emsg = 'ERROR!! File {0} does not contain the heavy chain\n'.format(self.file)
sys.stderr.write(emsg)
sys.exit(1)
elif 'L' not in chain_ids:
emsg = 'ERROR!! File {0} does not contain the light chain\n'.format(self.file)
sys.stderr.write(emsg)
sys.exit(1)
return 0
def ab_format(self):
"""
Renumbers the antibody and extract the HV residues
Returns:
hv_list (list): list of the HV residue numbers
new_pdb (biopandas.pdb.pandas_pdb.PandasPdb): HADDOCK-ready pdb
"""
# Check antibody chain ids
self.check_chain()
# Modify resno to include insertions and chain id
resno = self.pdb.df['ATOM']['residue_number'].values
ins = self.pdb.df['ATOM']['insertion'].values
chain = self.pdb.df['ATOM']['chain_id'].values
ch_resno = ['{0}{1}_{2}'.format(i, j, c) for i, j, c in zip(resno, ins, chain)]
# Create new resno
count = 0
prev_resid = None
new_resno = []
# Renumber
for r in ch_resno:
if r != prev_resid:
count += 1
new_resno.append(count)
prev_resid = r
elif r == prev_resid:
new_resno.append(count)
prev_resid = r
# Update pdb
new_pdb = cp.deepcopy(self.pdb)
new_pdb.df['ATOM']['chain_id'] = self.chain
new_pdb.df['ATOM']['residue_number'] = new_resno
new_pdb.df['ATOM']['insertion'] = '' # Remove insertions
# Create dictionary with old and new numbering
resno_dict = dict(zip(unique(ch_resno), unique(new_resno)))
# Collect HV residues with the new numbering
hv_list = []
# Heavy chain
for hv_heavy in self.loops_h:
if hv_heavy in resno_dict.keys():
hv_list.append(resno_dict[hv_heavy])
# Light chain
for hv_light in self.loops_l:
if hv_light in resno_dict.keys():
hv_list.append(resno_dict[hv_light])
hv_list.sort()
return hv_list, new_pdb
def main(pdb_file: str, out_file: str, chain_id: str, active_sites_file: str = None) -> list:
# Renumber pdb file and get HV residues
pdb_format = AbHaddockFormat(pdb_file, chain_id)
hv_resno, pdb_ren = pdb_format.ab_format()
# Write pdb into a file
pdb_ren.to_pdb(path=out_file, records=['ATOM'], append_newline=True)
# Print HV residues
active_residues = ','.join(map(str, hv_resno))
print(active_residues)
if active_sites_file is not None:
Path(active_sites_file).write_text(active_residues)
return hv_resno
if __name__ == '__main__':
# Get inputs
pdb_file, out_file, chain_id = check_input()
main(pdb_file, out_file, chain_id)