#!/usr/bin/env nextflow
nextflow.enable.dsl=2
/*
Nextflow -- Analysis Pipeline to annotate hypothetical genes from PROKKA
Authors: Kaffee-Max, Eva, Martin
*/
/**************************
* META & HELP MESSAGES
**************************/
/**************************
* Help messages, user inputs & checks
**************************/
// help message
if (params.help) { exit 0, helpMSG() }
// log info based on user inputs
defaultMSG()
// error codes
if (params.profile) {
    exit 1, "--profile is wrong, use -profile instead" }
if (workflow.profile == 'standard') {
    println "NO EXECUTION PROFILE SELECTED, using [-profile local,docker]" }
if (!params.fasta) {
    exit 1, "input missing, use [--fasta]" }
/**************************
* INPUT CHANNELS
**************************/
// fasta input & --list support
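// with --list, --fasta points to a CSV with one sample per row: sample_id,/path/to/assembly.fasta
// without --list, the sample ID is derived from each FASTA file's base name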
if (params.fasta && params.list) { fasta_input_ch = Channel
    .fromPath( params.fasta, checkIfExists: true )
    .splitCsv()
    .map { row -> [row[0], file("${row[1]}", checkIfExists: true)] }
}
else if (params.fasta) { fasta_input_ch = Channel
    .fromPath( params.fasta, checkIfExists: true )
    .map { file -> tuple(file.baseName, file) }
}
/**************************
* PROCESSES
**************************/
/* include processes that are used outside of the sub-workflow logic */
include { rename } from './process/rename'
include { prokka_annotation } from './process/prokka_annotation'
include { restore } from './process/restore'
include { query_fasta } from './process/query_fasta'
include { download_db } from './process/download_db'
//include { create_query_db ; create_query_db as create_target_db } from './process/create_query_db'
include { create_query_db } from './process/create_query_db'
include { create_target_db } from './process/create_target_db'
include { index_target_db } from './process/index_target_db'
include { mmseqs2 } from './process/mmseqs2'
include { update_prokka } from './process/update_prokka'
include { summary } from './process/summary'
/**************************
* DATABASES
**************************/
/* Comment section:
The database section is designed to "auto-get" pre-prepared databases.
It is written for local use and HPC/cloud use via params.cloudProcess.
*/
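// a pre-downloaded DB is expected at <params.databases>/<database>/<database>.fasta and reused if present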
workflow get_db {
    main:
        if (!params.customdb) { db_preload = file("${params.databases}/${params.database}/${params.database}.fasta") }
        else { db_preload = file("${params.customdb}") }
        // local use (storeDir) and cloud use resolve identically:
        // reuse the preloaded DB if it exists, otherwise download it
        if ( db_preload.exists() ) { db = db_preload }
        else { download_db(); db = download_db.out.db }
    emit:
        db
}
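// target DB and its index are cached as tarballs under params.databases_indices and only rebuilt if missing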
workflow create_mmseqs2_targetdb {
    take:
        query_db
    main:
        targetdb = file("${params.databases_indices}/target_db.tar.gz")
        if ( !targetdb.exists() ) { create_target_db(query_db, "target_db"); targetdb_ch = create_target_db.out.output }
        else { targetdb_ch = channel.of(["target_db", targetdb]) }
        index = file("${params.databases_indices}/target_db_index.tar.gz")
        tmp = file("${params.databases_indices}/tmp.tar.gz")
        if ( !index.exists() || !tmp.exists() ) { index_target_db(targetdb_ch); targetdb_index_ch = index_target_db.out.output }
        else { targetdb_index_ch = channel.of(["target_db", index, tmp]) }
    emit:
        targetdb_ch
        targetdb_index_ch
}
workflow create_mmseqs2_querydb {
    take:
        query_fasta
    main:
        // reuse a cached query DB per sample if one exists under params.databases_indices, otherwise build it
        // (assumes create_query_db emits [sample_id, db] tuples, matching the cached tuple shape below)
        query_fasta.branch { it ->
            cached: file("${params.databases_indices}/${it[0]}_query_db.tar.gz").exists()
            build: true
        }.set { qf }
        create_query_db(qf.build, "query_db")
        querydb_ch = create_query_db.out.output.mix(
            qf.cached.map { id, fasta -> [id, file("${params.databases_indices}/${id}_query_db.tar.gz")] }
        )
    emit:
        querydb_ch
}
/**************************
* SUB-WORKFLOWS
**************************/
/**************************
* MAIN WORKFLOW ENTRY POINT
**************************/
workflow {
    // rename contig IDs for prokka annotation
    rename(fasta_input_ch)
    renamed_contigs = rename.out.renamed_contigs
    rename_map = rename.out.contig_map
    renamed_contigs.view()
    // run prokka annotation
    prokka_annotation(renamed_contigs)
    prokka_out_ch = prokka_annotation.out.output
    // restore original contig IDs
    restore(prokka_out_ch.join(rename_map))
    restored_prokka_contigs = restore.out.restored_contigs
    // create input fasta for mmseqs2
    query_fasta(restored_prokka_contigs)
    query_fasta_out_ch = query_fasta.out.queryfasta
    log1 = query_fasta.out.log
    hyprot_dicts_ch = query_fasta.out.hyprot_dicts
    // download query database for mmseqs2
    get_db()
    query_db = get_db.out.db
    // prepare databases for mmseqs2
    create_mmseqs2_targetdb(query_db)
    create_mmseqs2_querydb(query_fasta_out_ch)
    mmseqs2_querydb = create_mmseqs2_querydb.out.querydb_ch
    mmseqs2_targetdb = create_mmseqs2_targetdb.out.targetdb_ch
    mmseqs2_targetdb_index = create_mmseqs2_targetdb.out.targetdb_index_ch
    // run mmseqs2
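    // join pairs the target DB with its index via the shared "target_db" key;
    // combine then attaches that single target tuple to every per-sample query DB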
    mmseqs2_targetdb_ch = mmseqs2_targetdb.join(mmseqs2_targetdb_index)
    mmseqs2_in_ch = mmseqs2_querydb.combine(mmseqs2_targetdb_ch)
    mmseqs2(mmseqs2_in_ch)
    id_alninfo = mmseqs2.out.output
    // update prokka annotations
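    // the double join keys on the sample ID, so each sample's restored contigs,
    // hyprot dictionary, and alignment info travel through update_prokka together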
    update_ch = restored_prokka_contigs.join(hyprot_dicts_ch).join(id_alninfo)
    update_prokka(update_ch)
    log2 = update_prokka.out.log
    // produce hypro summary
    summary(log1, log2, id_alninfo)
    summary_ch = summary.out
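    // collectFile concatenates the per-sample summaries into a single parameter-stamped report in params.output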
    summary_ch.collectFile(name: "hypro_summary_db${params.database}_e${params.evalue}_a${params.minalnlen}_p${params.pident}.txt", storeDir: "${params.output}")
}
/*************
* OUTPUT
*************/
workflow.onComplete {
    summary = ''
    myFile = file("${params.output}/hypro_summary_db${params.database}_e${params.evalue}_a${params.minalnlen}_p${params.pident}.txt")
    myReader = myFile.newReader()
    String line
    while ( (line = myReader.readLine()) != null ) {
        if (line.startsWith('***')) { summary = summary + "\n" + line + "\n" }
        else { summary = summary + line + "\n" }
    }
    myReader.close()
    log.info """
    Execution status: ${ workflow.success ? 'OK' : 'failed' }
    ______________________________________
    \u001B[36mExecution summary\033[0m
    ______________________________________
    $summary
    Summary report: ${params.output}/hypro_summary_db${params.database}_e${params.evalue}_a${params.minalnlen}_p${params.pident}.txt
    Updated annotation files: ${params.output}/SAMPLE_ID/mmseqs2_run_db${params.database}_e${params.evalue}_a${params.minalnlen}_p${params.pident}/prokka_restored_updated/
    ______________________________________
    Thanks for using HYPRO!
    Please cite: https://github.com/hoelzer-lab/hypro
    """.stripIndent()
}
/*************
* --help
*************/
def helpMSG() {
    c_green = "\033[0;32m";
    c_reset = "\033[0m";
    c_yellow = "\033[0;33m";
    c_blue = "\033[0;34m";
    c_dim = "\033[2m";
    log.info """
    ____________________________________________________________________________________________
    Workflow: HyPro
    ${c_yellow}Usage example:${c_reset}
    nextflow run hoelzer-lab/hypro --fasta '*/*.fasta'
    ${c_yellow}Input:${c_reset}
    ${c_green} --fasta ${c_reset}       '*.fasta'   -> one sample per file
    ${c_dim}  ..change the input above to a CSV via:${c_reset} ${c_green}--list${c_reset} true
    ${c_yellow}Parameters:${c_reset}
    --output        Name of the result folder [default: $params.output].
    --database      Specify the target DB to search for annotation extension.
                    Currently available options: uniprotkb, uniref50, uniref90, uniref100, pdb.
                    Note that searching UniRef DBs will significantly extend the runtime of HyPro.
                    [default: $params.database]
    --customdb      Specify a path to an existing DB. If no DB is found there, HyPro will build it.
                    Requires a matching --database configuration.
    --modus         Choose the mode of HyPro: search all hypothetical proteins (full) or leave out
                    those that already gained a partial annotation (restricted). The distinction
                    between fully unannotated and partially annotated hypothetical proteins was
                    observed for UniProt annotations. Options: [full, restricted] [default: $params.modus]
    --threads       Define the number of threads used by mmseqs2 indexdb, search and convertalis.
    --evalue        Include sequence matches with an e-value below this threshold in the profile.
                    Requires a FLOAT >= 0.0. [default: $params.evalue]
    --minalnlen     Specify the minimum alignment length as an INT between 0 and the maximum
                    alignment length. [default: $params.minalnlen]
    --pident        List only matches above this sequence identity for clustering.
                    Enter a FLOAT between 0 and 1.0. [default: $params.pident]
    --prokka        Control parameters for Prokka, e.g. when running HyPro on a bacterial genome
                    that does not follow the standard code.
    ${c_yellow}Options:${c_reset}
    --cores         max cores per process for local use [default: $params.cores]
    --max_cores     max cores per machine for local use [default: $params.max_cores]
    --memory        max memory for local use [default: $params.memory]
    ${c_dim}Nextflow options:
    -with-report rep.html       cpu / ram usage (may cause errors)
    -with-dag chart.html        generates a flowchart of the process tree
    -with-timeline time.html    timeline (may cause errors)
    ${c_yellow}LSF computing:${c_reset}
    For execution of the workflow on an HPC with LSF, adjust the following parameters:
    --databases     defines the path where databases are stored [default: $params.databases]
    --workdir       defines the path where nextflow writes tmp files [default: $params.workdir]
    --cachedir      defines the path where conda environments are cached [default: $params.cachedir]
    ${c_yellow}Profiles:${c_reset}
    -profile        local,conda
                    local,docker
                    local,singularity
    ${c_reset}
    """.stripIndent()
}
def defaultMSG() {
    c_blue = "\u001B[36m"
    c_reset = "\033[0m"
    log.info """
    ______________________________________
    ${c_blue}Workflow: HYPRO${c_reset}
    ______________________________________
    Profile:            $workflow.profile
    Current User:       $workflow.userName
    Nextflow-version:   $nextflow.version
    Starting time:      $nextflow.timestamp
    Workflow hash:      $workflow.commitId
    --workdir           $params.workdir
    --databases         $params.databases
    --output            $params.output
    --cores             $params.cores
    --max_cores         $params.max_cores
    --memory            $params.memory
    --cachedir          $params.cachedir
    ______________________________________
    """.stripIndent()
}