-
Notifications
You must be signed in to change notification settings - Fork 0
/
ParseDADA2Tab
166 lines (154 loc) · 6.01 KB
/
ParseDADA2Tab
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#!/usr/bin/env python3
# This is the fourth script in the PopMLST pipeline. Run it
# without arguments to get the usage. Please provide
# proper attribution if you use this code.
#
# https://github.com/marade/PopMLST
#
# author: M Radey (email: marad_at_uw.edu)
import sys, os, argparse, re, tempfile, glob
import pandas as pd
from shutil import which
from subprocess import call, Popen, PIPE, STDOUT
from operator import itemgetter
from Bio import SeqIO
from colorama import init as cinit
from colorama import Fore, Back, Style
cinit(autoreset=True)
version = "1.0.0"
def do_args():
desc = "A tool for parsing ASV files from DADA2"
parser = argparse.ArgumentParser(prog=os.path.basename(__file__),\
description=desc)
parser.add_argument("infile", help="specifies the file with " +\
"with the DADA2 ASV output")
parser.add_argument("refdir", help="specifies the directory with " +\
"the reference alleles to match with the ASVs")
parser.add_argument("outfile", help="specifies the file where " +\
"you want the output to go")
parser.add_argument("blastout", help="specifies the file where " +\
"you want the Blast output to go")
parser.add_argument("-f", "--filter", action="store_true",\
help="filter no_match columns in the output")
return parser.parse_args()
def make_ref_file(refdir):
print(Fore.CYAN + Style.BRIGHT + "Scanning reference...")
rfile = tempfile.NamedTemporaryFile('w+t', delete=False)
reffas = glob.glob(refdir + "/*.fa*")
with open(rfile.name, 'w') as o:
for reffa in reffas:
with open(reffa) as n:
for line in n: o.write(line)
reclen = {}
for rec in SeqIO.parse(rfile, 'fasta'): reclen[rec.id] = len(str(rec.seq))
return reclen, rfile.name
def type_asvs(infile, ofile, reclen, rfile):
print(Fore.CYAN + Style.BRIGHT + "Typing ASVs...")
blastn = which("blastn")
if blastn == None:
print(Fore.RED + "ERROR: Could not find blastn program.")
sys.exit()
n = open(infile)
asvdict = {}
asvorder = []
blastres = []
# get the order of ASVs from DADA2 results
for asvseq in n.readline().rstrip().split('\t'):
if asvseq == "": continue
asvseq = re.sub('"', '', asvseq)
#print asvseq
asvdict[asvseq] = {}
asvorder.append(asvseq)
n.close()
# Blast each ASV against all alleles
for asvseq in asvdict.keys():
tfile = tempfile.NamedTemporaryFile('w+t', delete=False)
tfile.write(">test\n" + asvseq + "\n")
tfile.close()
args1 = [blastn, '-subject', rfile, '-query', tfile.name, '-outfmt',\
'6 sseqid pident length']
blastout = Popen(args1, stdout=PIPE, shell=False).communicate()[0][:-1]
lines = blastout.decode().split('\n')
# add each Blast result to blastres list and label with ASV
for line in lines:
fields = line.split('\t')
fields.append(asvseq)
blastres.append(fields)
os.remove(tfile.name)
os.remove(rfile)
# go through the Blast results and keep the best one for each ASV
best = {}
# initialize best dict with worst possible result
for asv in asvorder: best[asv] = ["no_match", '0.0', '0']
o = open(ofile, 'w')
for fields in blastres:
if len(fields) < 4: continue
o.write("\t".join(fields) + "\t" + str(reclen[fields[0]]) + "\n")
if not fields[3] in best: best[fields[3]] = ["", "0.0", "0"]
tident = float(re.sub("\*", "", best[fields[3]][1]))
# if the identity and length is better than anything we've
# seen so far for this ASV
if float(fields[1]) >= tident and \
int(fields[2]) >= int(best[fields[3]][2]) and \
int(fields[2]) <= reclen[fields[0]]:
best[fields[3]][1] = fields[1]
best[fields[3]][2] = fields[2]
best[fields[3]][0] = fields[0]
# mark imperfect matches
if not int(fields[2]) == reclen[fields[0]] or not \
float(fields[1]) == 100.0:
best[fields[3]][0] += "*"
o.close()
#print best
asvres = []
for asv in asvorder:
myres = best[asv]
myres.append(asv)
asvres.append(myres)
return asvres
def asv_samples(infile, asvres, outfile):
print(Fore.CYAN + Style.BRIGHT + "Writing output...")
firstwrite = [""]
with open(outfile, 'w') as o:
# ['acs_53', '96.000', '180']
for fields in asvres: firstwrite.append(fields[0])
o.write("\t".join(firstwrite) + "\n")
with open(infile) as n:
firstline = True
savefirst = ""
for line in n:
if firstline == True:
firstline = False
savefirst = line.rstrip()
savefirst = re.sub('"', '', savefirst)
continue
elif line.startswith('"Kingdom"'): break
line = "\t".join(line.split('\t'))
line = re.sub('"', '', line)
o.write(line)
o.write("ASV" + savefirst + "\n")
def filter_columns(outfile, filt):
print(Fore.CYAN + Style.BRIGHT + "Filtering columns...")
if filt == False: return
bname, ext = os.path.splitext(outfile)
newname = bname + ".filt.tab"
df = pd.read_csv(outfile, sep='\t', index_col=0)
keepcols = []
for (colname, coldat) in df.iteritems():
if not "no_match" in colname: keepcols.append(colname)
newdf = df.loc[:, keepcols]
newdf.to_csv(newname, sep='\t')
def main():
# setup
args = do_args()
args.infile = os.path.abspath(args.infile)
args.refdir = os.path.abspath(args.refdir)
args.outfile = os.path.abspath(args.outfile)
rlens, reftmp = make_ref_file(args.refdir)
asvresults = type_asvs(args.infile, args.blastout, rlens, reftmp)
asv_samples(args.infile, asvresults, args.outfile)
filter_columns(args.outfile, args.filter)
print(Fore.CYAN + Style.BRIGHT + "Done.")
return 0
if __name__ == "__main__":
sys.exit(main())