This repository has been archived by the owner on Mar 10, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.nf
261 lines (176 loc) · 4.57 KB
/
main.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2
// WORKFLOW SPECIFICATION
// --------------------------------------------------------------- //
workflow {

    // ----------------------------------------------------------- //
    // Input channels
    // ----------------------------------------------------------- //

    // GenBank exports found in the input directory (matches *.gb, *.gbk, ...)
    ch_gbk = Channel
        .fromPath( "${params.input_data}/*.gb*" )

    // FASTA files found in the same input directory
    ch_fasta = Channel
        .fromPath( "${params.input_data}/*.fasta" )

    // Previously curated EMBL file(s) that are split and bundled alongside
    // the newly formatted records. When the parameter is unset or empty we
    // fall back to an empty channel instead of handing a null/blank path to
    // the channel factory, so the rest of the pipeline can still run.
    ch_curated_embl = params.curated_embl ? Channel.fromPath( params.curated_embl ) : Channel.empty()

    // ----------------------------------------------------------- //
    // Workflow steps
    // ----------------------------------------------------------- //

    FIX_GENBANK_DEFLINES (
        ch_gbk
    )

    FIX_FASTA_DEFLINES (
        ch_fasta
    )

    CONVERT_GENBANK_TO_EMBL (
        FIX_GENBANK_DEFLINES.out
    )

    // Use the FASTA produced by FIX_FASTA_DEFLINES rather than the raw input
    // FASTA; previously the corrected file was computed and then discarded.
    // NOTE(review): the two inputs are paired by arrival order, which is only
    // well-defined when a single GenBank/FASTA pair is supplied -- confirm
    // before running with several input files at once.
    ADJUST_TEXT (
        CONVERT_GENBANK_TO_EMBL.out,
        FIX_FASTA_DEFLINES.out
    )

    REMOVE_OC_FEATURES (
        ADJUST_TEXT.out.oc
    )

    OBSCURE_ID (
        REMOVE_OC_FEATURES.out
    )

    // Split both the freshly formatted EMBL file and any curated EMBL file
    // into one file per sequence.
    SPLIT_EMBL_FILE (
        OBSCURE_ID.out.ipd
            .mix (
                ch_curated_embl
            )
    )

    // Gather every per-sequence file into a single task so that exactly one
    // tarball is produced.
    TAR_EMBL_FILES (
        SPLIT_EMBL_FILE.out.collect()
    )

}
// --------------------------------------------------------------- //
// DERIVATIVE PARAMETER SPECIFICATION
// --------------------------------------------------------------- //
// Parameters computed from values set in nextflow.config.
// indiv_embl has the form "<results>/<experiment_number>-ipd".
params.indiv_embl = "${params.results}/${params.experiment_number}-ipd"
// --------------------------------------------------------------- //
// PROCESS SPECIFICATION
// --------------------------------------------------------------- //
process FIX_GENBANK_DEFLINES {

    // Repairs definition sections in the GenBank file that are missing the
    // allele/locus name. The usual symptom is a list of candidate alleles
    // that is not preceded by the locus name and a pipe symbol. Most alleles
    // need no correction, but GenBank files exported from Geneious will
    // occasionally contain incomplete definition sections as an artifact.
    //
    // Input:  one GenBank file.
    // Output: file(s) matching "*_corrected.gb" written by genbank_fixer.R.

    // publishDir params.results, mode: 'copy'

    cpus 1

    input:
    path gbk

    output:
    path "*_corrected.gb"

    script:
    """
    genbank_fixer.R ${gbk}
    """

}
process FIX_FASTA_DEFLINES {

    // Strips the pipe symbol and everything after it from each FASTA
    // defline, so that only the allele/locus name remains.
    //
    // Input:  one FASTA file.
    // Output: file(s) matching "*.fasta" written by fasta_fixer.R.

    // publishDir params.results, mode: 'copy'

    cpus 1

    input:
    path fasta

    output:
    path "*.fasta"

    script:
    """
    fasta_fixer.R ${fasta}
    """

}
process CONVERT_GENBANK_TO_EMBL {

    // Converts the GenBank input file into an EMBL file. The resulting EMBL
    // file is adjusted further by the downstream steps.
    //
    // Input:  one GenBank file.
    // Output: file(s) matching "*.emb".
    // The species name (params.species) is passed to the converter as a
    // single quoted argument.

    // publishDir params.results, mode: 'copy'

    cpus 1

    input:
    path gbk

    output:
    path "*.emb"

    script:
    """
    convert_genbank_to_embl.py ${gbk} "${params.species}"
    """

}
process ADJUST_TEXT {

    // Adjusts the text of various EMBL fields so that they match IPD
    // requirements. The target format is based on an example IPD EMBL file
    // provided to Roger and John Caskey. The initial adjusted EMBL file is
    // saved as oc.emb, since it is the one that the O'Connor Host Genomics
    // Team and others will want to use for QC.
    //
    // Inputs:  the converted EMBL file and a FASTA file.
    // Outputs: oc.emb (emitted as `oc`) and error.emb (emitted as `error`),
    //          both copied into params.results.

    publishDir params.results, mode: 'copy'

    cpus 1

    input:
    path embl
    path fasta

    output:
    path "oc.emb", emit: oc
    path "error.emb", emit: error

    script:
    // The species name is quoted so that a multi-word value (for example a
    // binomial name containing a space) reaches the script as one argument,
    // matching how CONVERT_GENBANK_TO_EMBL passes the same parameter.
    """
    adjust_text.py ${embl} ${fasta} "${params.species}"
    """

}
process REMOVE_OC_FEATURES {

    // Removes Geneious and MES contig identifiers from the source
    // annotations, because IPD does not want them in submitted records.
    //
    // Input:  one EMBL file.
    // Output: file(s) matching "*.emb", copied into params.results.

    publishDir params.results, mode: 'copy'

    cpus 1

    input:
    path embl

    output:
    path "*.emb"

    script:
    """
    remove_oc_features.py ${embl}
    """

}
process OBSCURE_ID {

    // IPD has specific requirements for how the ID line is formatted, so
    // our own information is removed here and IPD can fill it in later.
    //
    // Input:  one EMBL file.
    // Output: ipd.emb (emitted as `ipd`), copied into params.results.

    publishDir params.results, mode: 'copy'

    cpus 1

    input:
    path embl

    output:
    path "ipd.emb", emit: ipd

    script:
    """
    obscure_id.py ${embl}
    """

}
process SPLIT_EMBL_FILE {

    // Splits the EMBL file holding all novel sequences into one file per
    // sequence, as the IPD submission process requires. The splitter is
    // formatting-agnostic (i.e., it does not rely on Biopython).
    //
    // Input:  one EMBL file.
    // Output: file(s) matching "*.embl", copied into params.indiv_embl
    //         (existing published files are overwritten).
    // The task only runs when the staged input file actually exists.

    publishDir params.indiv_embl, mode: 'copy', overwrite: true

    cpus 1

    input:
    path ipd_emb

    output:
    path "*.embl"

    when:
    ipd_emb.exists()

    script:
    """
    ipd_embl_splitter.R ${ipd_emb} ${params.experiment_number}
    """

}
process TAR_EMBL_FILES {

    // Bundles all individual EMBL files into a single gzip-compressed
    // tarball, as the IPD submission process requires.
    //
    // Input:  the collected per-sequence "*.embl" files.
    // Output: "<experiment_number>.tar.gz", copied into params.results
    //         (an existing published tarball is overwritten).

    publishDir params.results, mode: 'copy', overwrite: true

    cpus 1

    input:
    path embl_files

    output:
    path "*.tar.gz"

    script:
    // Nextflow stages input files into the task directory as symbolic links
    // by default. Without -h (dereference), tar would archive the links
    // themselves rather than the EMBL records they point to, producing a
    // tarball that is unusable outside this machine's work directory.
    """
    tar -czhf ${params.experiment_number}.tar.gz *.embl
    """

}
// --------------------------------------------------------------- //