-
Notifications
You must be signed in to change notification settings - Fork 0
/
Snakefile
127 lines (107 loc) · 3.46 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import os
import re
from scripts.import_suppfiles import drop
from snakemake.utils import min_version
min_version("6.0")
# Contact address passed as the password for anonymous FTP downloads from GEO.
EMAIL = "[email protected]"
# File listing supplementary-file names to exclude from processing.
BLACKLIST_FILE = "blacklist.txt"
# Matches a GEO series accession, e.g. "GSE12345".
p = re.compile(r"GSE\d+")
# Import every rule from the external query workflow, prefixed with
# "query_" so they can be referenced as e.g. rules.query_all below.
module query:
    snakefile:
        "query.smk"


use rule * from query as query_*
# Suppfilenames
def get_parsed_suppfiles(wildcards):
    """Input function for merge_parsed_suppfiles.

    Reads the file listing produced by the query_suppfilenames checkpoint,
    discards unwanted and blacklisted names, and expands the survivors into
    the per-file parsed-CSV targets.
    """
    # Checkpoint output: one supplementary file path per line.
    listing = checkpoints.query_suppfilenames.get().output[0]
    with open(listing, "r") as f:
        names = [os.path.basename(ln.rstrip()) for ln in f if ln.strip() != ""]
    with open(BLACKLIST_FILE) as h:
        blacklist = {os.path.basename(ln.rstrip()) for ln in h}
    # Drop alignment files, sequence files, binary files, README, etc,
    # then anything explicitly blacklisted.
    kept = [
        name
        for name in names
        if not drop.search(name.lower()) and name not in blacklist
    ]
    return expand(
        ["output/tmp/parsed_suppfiles__{suppfilename}__.csv"],
        suppfilename=kept,
    )
# Top-level target: everything from the query module, the merged
# supplementary-file table, and the final archive.
rule all:
    input:
        rules.query_all.input,
        "output/parsed_suppfiles.csv",
        "output/geo-htseq.tar.gz",
    default_target: True


# Lightweight rules executed on the submit node instead of the cluster.
localrules: all, merge_parsed_suppfiles
# Download filtered supplementary files
rule download_suppfiles:
    output:
        temp("suppl/{suppfilename}"),
    log:
        "log/download__{suppfilename}__.log",
    params:
        # GEO series accession (e.g. "GSE12345") parsed from the file name.
        # NOTE(review): p.search() returns None when the name contains no
        # GSE id, which would raise AttributeError here — presumably
        # upstream filtering guarantees a match; confirm.
        id=lambda wildcards: p.search(wildcards.suppfilename.upper()).group(0),
        # Accession with its last three digits dropped ("GSE12345" -> "GSE12"),
        # matching GEO's "GSE12nnn" FTP directory layout used in the URL below.
        stub=lambda wildcards: p.search(wildcards.suppfilename.upper()).group(0)[0:-3],
    resources:
        # Allow 24 h per attempt, growing linearly on retries.
        runtime=lambda wildcards, attempt: attempt * 1440,
    shell:
        """
        curl -sS -H 'Expect:' -o {output[0]} --user anonymous:{EMAIL} ftp://ftp.ncbi.nlm.nih.gov/geo/series/{params.stub}nnn/{params.id}/{output[0]} 2> {log}
        """
# Import supplementary data
rule import_suppfiles:
    input:
        "suppl/{suppfilename}",
    output:
        "output/tmp/parsed_suppfiles__{suppfilename}__.csv",
    log:
        "log/import__{suppfilename}__.log",
    params:
        # Command-line flags forwarded verbatim to the parser script:
        # expression-level cutoffs, histogram bins, FDR threshold,
        # pi0 estimation method, and the blacklist file.
        f"--var basemean=10 logcpm=1 rpkm=1 fpkm=1 aveexpr=3.32 --bins 40 --fdr 0.05 --pi0method lfdr --blacklist {BLACKLIST_FILE}",
    conda:
        "envs/environment.yaml"
    resources:
        # Memory scales with input size (~32 MB per MB of input),
        # clamped to the 4 GB .. 200 GB range.
        mem_mb=lambda wildcards, input: min(
            200000, max([int(32 * (input.size // 1e6)), 4000])
        ),
        # Runtime grows with input size and retry attempt, capped at 24 h.
        runtime=lambda wildcards, attempt, input: min(
            1440, attempt * (5 + int(360 * input.size // 1e9))
        ),
    shell:
        """
        python3 -u scripts/import_suppfiles.py --file {input} --out {output} {params} 2> {log}
        """
# Concatenate every per-file parsed CSV into one table.
# Script uses base R only, so any modern R version should be good to go.
# Run this script 'locally' in same node as main Snakemake, batch
# submission can cause too long argument error,
# as there are potentially 70K+ files to concatenate.
rule merge_parsed_suppfiles:
    input:
        # Input function: resolved only after the query_suppfilenames
        # checkpoint has produced the file listing.
        get_parsed_suppfiles,
    output:
        "output/parsed_suppfiles.csv",
    resources:
        mem_mb=4000,
        runtime=120,
    envmodules:
        "any/R/4.1.2-X"
    script:
        "scripts/concat_tabs.R"
# Bundle the query outputs and the merged table into a single tarball
# for distribution.
rule archive:
    input:
        rules.query_all.input,
        "output/parsed_suppfiles.csv",
    output:
        "output/geo-htseq.tar.gz",
    resources:
        mem_mb=16000,
        runtime=120,
    shell:
        """
        tar -czvf {output[0]} {input}
        """