-
Notifications
You must be signed in to change notification settings - Fork 0
/
FilterDADA2Tab
124 lines (115 loc) · 4.65 KB
/
FilterDADA2Tab
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env python3
# This is the fifth script in the PopMLST pipeline. Run it
# without arguments to get the usage. Please provide
# proper attribution if you use this code.
#
# https://github.com/marade/PopMLST
#
# author: M Radey (email: marad_at_uw.edu)
import os, sys, argparse, re
import pandas as pd
from colorama import init as cinit
from colorama import Fore, Back, Style
cinit(autoreset=True)
version = "1.0.0"
def do_args():
desc = "Filter our custom DADA2 ASV table format such that putative " +\
"novel alleles that closely resemble another allele signal in a " +\
"sample are discarded"
parser = argparse.ArgumentParser(prog=os.path.basename(__file__),\
description=desc)
parser.add_argument("infile", help="specifies the input file")
parser.add_argument("outfile", help="specifies the output file")
return parser.parse_args()
def filter_columns(infile, outfile):
print(Fore.CYAN + Style.BRIGHT + "Filtering columns...")
df = pd.read_csv(infile, sep='\t', index_col=0)
#print(df.columns)
#create a dictionary of column names for the first round of filtering
coldict = {}
for col in df.columns:
tcol1 = re.sub('\*', '', col)
tcol2 = re.sub('\.\d+$', '', tcol1)
if not tcol2 in coldict:
coldict[tcol2] = [col]
else: coldict[tcol2].append(col)
dropcols = []
# do the comparisons to determine which columns are dropped in
# the first round
for xcol in coldict.keys():
for thiscol in coldict[xcol]:
if not '*' in thiscol:
for thatcol in coldict[xcol]:
if thiscol == thatcol: continue
if '*' in thatcol:
# now we're ready to answer, do we want to
# keep thatcol?
killthatcol = True
for thisval, thatval in df[[thiscol,\
thatcol]].values.tolist():
if not thisval.isdigit() or not thatval.isdigit():
continue
#print(thiscol, thatcol, thisval, thatval)
if (int(thatval) > int(thisval)) and \
int(thatval) > 5:
killthatcol = False
break
if killthatcol == True:
#print("Kill: " + thatcol)
dropcols.append(thatcol)
#create a dictionary of column names for the second round of filtering
coldict = {}
for col in df.columns:
if not '*' in col: continue
if col in dropcols: continue
tcol1 = re.sub('\.\d+$', '', col)
if not tcol1 in coldict:
coldict[tcol1] = [col]
else: coldict[tcol1].append(col)
# do the comparisons to determine which columns are dropped in
# the second round
for xcol in coldict.keys():
if len(coldict[xcol]) < 2: continue
#print(coldict[xcol])
for thiscol in coldict[xcol]:
for thatcol in coldict[xcol]:
if thiscol == thatcol: continue
# now we're ready to answer, do we want to
# keep thatcol?
killthatcol = True
for thisval, thatval in df[[thiscol,\
thatcol]].values.tolist():
if not thisval.isdigit() or not thatval.isdigit():
continue
#print(thiscol, thatcol, thisval, thatval)
if (int(thatval) > int(thisval)) and \
int(thatval) > 5:
killthatcol = False
break
if killthatcol == True:
#print("Kill: " + thatcol)
dropcols.append(thatcol)
# for the third round, filter columns where no value is >5
for col in df.columns:
#print(col)
highnum = 0
for thisval in df[col].values.tolist():
#if not isinstance(thisval, float): continue
if not thisval.isdigit(): continue
if int(thisval) > highnum: highnum = int(thisval)
#print(thisval, str(highnum))
if highnum < 6: dropcols.append(col)
#print("Dropping:")
#print(dropcols)
df = df.drop(dropcols, axis=1)
df.to_csv(outfile, sep='\t')
def main():
# setup
args = do_args()
args.outfile = os.path.abspath(args.outfile)
args.infile = os.path.abspath(args.infile)
filter_columns(args.infile, args.outfile)
print(Fore.CYAN + Style.BRIGHT + "Done.")
return 0
if __name__ == "__main__":
sys.exit(main())