forked from DISCASM/DISCASM
-
Notifications
You must be signed in to change notification settings - Fork 0
/
DISCASM
executable file
·280 lines (202 loc) · 12 KB
/
DISCASM
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
#!/usr/bin/env python
import os, re, sys
import argparse
import subprocess
sys.path.insert(0, os.path.sep.join([os.path.dirname(os.path.realpath(__file__)), "PyLib"]))
from Pipeliner import Pipeliner, Command
import logging
FORMAT = "%(asctime)-15s %(levelname)s %(module)s.%(name)s.%(funcName)s at %(lineno)d :\n\t%(message)s\n"
global logger
logger = logging.getLogger()
logging.basicConfig(filename='FusionInspector.log', format=FORMAT, filemode='w', level=logging.DEBUG)
# add a new Handler to print all INFO and above messages to stdout
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.INFO)
logger.addHandler(ch)
"""
______ _____ _____ _____ ___ ________ ___
| _ \_ _/ ___/ __ \ / _ \ / ___| \/ |
| | | | | | \ `--.| / \// /_\ \\ `--.| . . |
| | | | | | `--. \ | | _ | `--. \ |\/| |
| |/ / _| |_/\__/ / \__/\| | | |/\__/ / | | |
|___/ \___/\____/ \____/\_| |_/\____/\_| |_/
"""
UTILDIR = os.path.abspath(os.sep.join([os.path.dirname(__file__), "util"]))
MAX_PCT_ALIGNED_OK = 80
# 2017-10-23
# Cicada Dennis added code which looks for the location of the Trinity program using the Unix "which" utility.
# Previous code which is replaced:
#if 'TRINITY_HOME' not in os.environ:
# raise RuntimeError("must set TRINITY_HOME env var")
#TRINITY_HOME = os.environ['TRINITY_HOME']
TRINITY_HOME = ""
TRINITY_HOME_error_msg = "Before running {0}, you must set the environment variable TRINITY_HOME\n".format(sys.argv[0]) + \
"\tto the base installation directory of Trinity,\n\tor that directory needs to be in the PATH.\n"
if 'TRINITY_HOME' in os.environ:
TRINITY_HOME = os.environ['TRINITY_HOME']
else:
# if hasattr(os, 'symlink'): # symlink was implemented to always return false when it was not implemented in early python.
# Not using symlink. Using os.path.islink() and os.readlink().
try:
# I tried using "command -v Trinity" but for some reason, I was getting an OS permission error with that.
# distutils.spawn.find_executable() also might work, I but already implemented the below.
pipe1 = subprocess.Popen(["which", "Trinity"], stdout=subprocess.PIPE)
except:
sys.stderr.write(TRINITY_HOME_error_msg)
# t, v, tb = sys.exc_info()
# raise t, v, tb
# For some reason the above was giving a syntax error.
# A simple raise should reraise the existing exception.
raise
else:
TrinityPath, err_info = pipe1.communicate()
# FIX - probably should be checking err_info for errors...
#print "err_info is:"
#print err_info
# Determine TRINITY_HOME from the TrinityPath returned.
# If TrinityPath is a link, we need to dereference the link.
TrinityPath = TrinityPath.rstrip() # Need to strip off a newline.
if len(TrinityPath) > 0:
# print "Trinity that was found is: {:s}".format(repr(TrinityPath))
# print os.path.islink(TrinityPath)
TrinityPath = os.path.abspath(TrinityPath)
# msg = "The Absolute Trinity path that was found is: {:s}".format(TrinityPath)
# print msg
# print os.path.islink(TrinityPath)
while os.path.islink(TrinityPath):
# print "That path is a link."
TrinityPath = os.path.join(os.path.dirname(TrinityPath),os.readlink(TrinityPath))
# print "The new path is: {:s}".format(TrinityPath)
# Take off the last part of the path (which is the Trinity command)
TRINITY_HOME = "/".join(TrinityPath.split("/")[0:-1])
os.environ['TRINITY_HOME'] = TRINITY_HOME
sys.stdout.write("TRINITY_HOME has been set to: {:s}.\n".format(TRINITY_HOME))
# else: # There was no value returned by the which command. So Trinity is not in the PATH.
# Doing nothing leaves TRINITY_HOME as an empty string.
# end of else no TRINITY_HOME environment variable.
# If TRINITY_HOME didn't get set, it will still be an empty string.
if TRINITY_HOME == "":
sys.stderr.write(TRINITY_HOME_error_msg)
raise RuntimeError("must set TRINITY_HOME env var")
class DISCASM:
def run(self):
arg_parser = argparse.ArgumentParser(
description = "Performs de novo transcriptome assembly on discordant and unmapped reads"
)
arg_parser.add_argument("--chimeric_junctions", dest="chimeric_junctions",
required=True, help="STAR Chimeric.out.junction file")
arg_parser.add_argument("--aligned_bam", dest="aligned_bam_filename",
required=False, help="aligned bam file from your favorite rna-seq alignment tool")
arg_parser.add_argument("--left_fq", dest="left_fq_filename", required=True, help="left fastq file")
arg_parser.add_argument("--right_fq", dest="right_fq_filename", required=True, help="right fastq file")
arg_parser.add_argument("--out_dir", dest="str_out_dir", required=True, help="output directory")
arg_parser.add_argument("--denovo_assembler", dest="denovo_assembler", required=True,
help="de novo assembly method: Trinity|Oases|OasesMultiK")
arg_parser.add_argument("--add_trinity_params", dest="add_trinity_params", required=False,
help="any additional parameters to pass on to Trinity if Trinity is the chosen assembler.")
arg_parser.add_argument("--normalize_reads", default=False, action='store_true',
help='perform in silico normalization prior to de novo assembly (not needed if using Trinity, since Trinity performs normalization internally')
args_parsed = arg_parser.parse_args()
aligned_bam_filename = None
if args_parsed.aligned_bam_filename:
aligned_bam_filename = os.path.abspath(args_parsed.aligned_bam_filename)
chimeric_junctions_filename = os.path.abspath(args_parsed.chimeric_junctions)
left_fq_filename = os.path.abspath(args_parsed.left_fq_filename)
right_fq_filename = os.path.abspath(args_parsed.right_fq_filename)
denovo_assembler = args_parsed.denovo_assembler
if not re.search("(trinity|oases)", denovo_assembler, re.I):
raise Exception("Error, assembler: " + denovo_assembler +
" is not recognized. Only 'Trinity' and 'Oases' are currently supported.")
ensure_locate_progs(denovo_assembler)
args_parsed.str_out_dir = os.path.abspath(args_parsed.str_out_dir)
str_out_dir = args_parsed.str_out_dir
if not os.path.isdir(str_out_dir):
os.makedirs(str_out_dir)
os.chdir(str_out_dir)
check_files_list = [ chimeric_junctions_filename, left_fq_filename, right_fq_filename]
if aligned_bam_filename:
check_files_list.append(aligned_bam_filename)
## Extract the discordant and unmapped reads into fastq files
checkpoints_dir = args_parsed.str_out_dir + "/chckpts_dir"
checkpoints_dir = os.path.abspath(checkpoints_dir)
if not os.path.exists(checkpoints_dir):
os.makedirs(checkpoints_dir)
## Construct pipeline
pipeliner = Pipeliner(checkpoints_dir)
if aligned_bam_filename:
## Using both the Chimeric reads and those reads that failed to map to the genome.
cmdstr = str( os.sep.join([UTILDIR, "retrieve_SF_chimeric_and_unmapped_reads.py"]) +
" " + aligned_bam_filename +
" " + chimeric_junctions_filename +
" " + left_fq_filename + " " + right_fq_filename )
discordant_left_fq_filename = os.path.basename(left_fq_filename) + ".extracted.fq"
discordant_right_fq_filename = os.path.basename(right_fq_filename) + ".extracted.fq"
pipeliner.add_commands([Command(cmdstr, "extract_chimeric_unmapped.ok")])
else:
## Just the chimeric reads as per STAR
cmdstr = str( os.sep.join([UTILDIR, "retrieve_SF_chimeric_reads.py"]) +
" " + chimeric_junctions_filename +
" " + left_fq_filename + " " + right_fq_filename )
discordant_left_fq_filename = os.path.basename(left_fq_filename) + ".extracted.fq"
discordant_right_fq_filename = os.path.basename(right_fq_filename) + ".extracted.fq"
pipeliner.add_commands([Command(cmdstr, "extract_chimeric_only.ok")])
# in silico normalization
if args_parsed.normalize_reads and not re.match("trinity", denovo_assembler, re.I):
# Trinity normalizes by default
cmdstr = str(os.path.sep.join([TRINITY_HOME, "util", "insilico_read_normalization.pl"]) +
" --left " + discordant_left_fq_filename +
" --right " + discordant_right_fq_filename +
" --seqType fq --JM 20G --max_cov 50 ")
normalized_left_fq_filename = discordant_left_fq_filename + ".normalized_K25_C50_pctSD200.fq"
normalized_right_fq_filename = discordant_right_fq_filename + ".normalized_K25_C50_pctSD200.fq"
pipeliner.add_commands([Command(cmdstr, "insilinorm.ok")])
discordant_left_fq_filename = normalized_left_fq_filename
discordant_right_fq_filename = normalized_right_fq_filename
#######################################
## De novo assemble the extracted reads
assembled_transcripts_filename = None
if re.match("trinity", denovo_assembler, re.I):
cmdstr = str("Trinity --seqType fq --max_memory 10G " +
" --left " + discordant_left_fq_filename +
" --right " + discordant_right_fq_filename +
" --CPU 4 --output trinity_out_dir " +
" --min_contig_length 100")
if (args_parsed.add_trinity_params):
cmdstr += " " + args_parsed.add_trinity_params
assembled_transcripts_filename = "trinity_out_dir.Trinity.fasta"
pipeliner.add_commands([Command(cmdstr, "trinity_assembly.ok")])
elif re.match("oasesmultik", denovo_assembler, re.I):
# run Oases multi-K (Jaffa-style)
cmdstr = str(UTILDIR + "/run_OASES_multiK.pl --left_fq " + discordant_left_fq_filename +
" --right_fq " + discordant_right_fq_filename +
" -K 19,36,4 -M 27 -L 100 " +
" -O oasesMultiK_out_dir ")
assembled_transcripts_filename = "oasesMultiK_out_dir/oases.transcripts.fa"
pipeliner.add_commands([Command(cmdstr, "oases_multiK_assembly.ok")])
else:
# oases
cmdstr = str(UTILDIR + "/run_OASES.pl " + discordant_left_fq_filename +
" " + discordant_right_fq_filename + " oases_out_dir ")
assembled_transcripts_filename = "oases_out_dir/oases.transcripts.fa"
pipeliner.add_commands([Command(cmdstr, "oases_asm.ok")])
pipeliner.run()
def ensure_locate_progs(assembler_name):
progs = []
if (re.search("trinity", assembler_name, re.I)):
progs.append('Trinity')
elif (re.search("oases", assembler_name, re.I)):
progs += ['oases', 'velvetg', 'velveth']
missing_prog = False
for prog in progs:
try:
path = subprocess.check_output("which " + prog, shell=True)
print >>sys.stderr, "Found prog: " + prog + " at path: " + path
except Exception as e:
print >>sys.stderr, "Error, cannot locate required program: " + prog
missing_prog = True
if missing_prog:
raise Exception("Error, missing at least one required program available via PATH setting")
return
if __name__ == "__main__":
# Needed to run, calls the script
DISCASM().run()