Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RNAパイプラインでGenomonSVを実行可能にする #13

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions example_conf/param_rna_ecsub.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,33 @@
[general]
instance_option = --aws-log-group-name genomon

[bwa_alignment]
resource = --aws-ec2-instance-type t2.2xlarge --disk-size 80
image = genomon/bwa_alignment:0.2.0
bamtofastq_option = collate=1 combs=1 exclude=QCFAIL,SECONDARY,SUPPLEMENTARY tryoq=1
bwa_option = -t 8 -T 0
bwa_reference_dir = s3://genomon-bucket/_GRCh37/reference/GRCh37
bwa_reference_file = GRCh37.fa
bamsort_option = index=1 level=1 inputthreads=2 outputthreads=2 calmdnm=1 calmdnmrecompindentonly=1
bammarkduplicates_option = markthreads=2 rewritebam=1 rewritebamlevel=1 index=1 md5=1

[sv_parse]
resource = --aws-ec2-instance-type t2.large --disk-size 15
image = genomon/genomon_sv:0.1.0
genomon_sv_parse_option =

[sv_merge]
resource = --aws-ec2-instance-type t2.large --disk-size 15
image = genomon/genomon_sv:0.1.0
genomon_sv_merge_option =

[sv_filt]
resource = --aws-ec2-instance-type t2.large --disk-size 50
image = genomon/genomon_sv:0.1.0
reference = s3://genomon-bucket/_GRCh37/reference/GRCh37/GRCh37.fa
genomon_sv_filt_option =--grc --min_junc_num 2 --max_control_variant_read_pair 10 --min_overhang_size 30
sv_utils_filt_option = --min_tumor_allele_freq 0.07 --max_control_variant_read_pair 1 --control_depth_thres 10 --inversion_size_thres 1000

[star_alignment]
resource = --aws-ec2-instance-type t2.2xlarge --disk-size 128
image = genomon/star_alignment
Expand Down
27 changes: 27 additions & 0 deletions example_conf/param_rna_ecsub_spot.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,33 @@
[general]
instance_option = --spot --aws-log-group-name genomon

[bwa_alignment]
resource = --aws-ec2-instance-type t2.2xlarge --disk-size 80
image = genomon/bwa_alignment:0.2.0
bamtofastq_option = collate=1 combs=1 exclude=QCFAIL,SECONDARY,SUPPLEMENTARY tryoq=1
bwa_option = -t 8 -T 0
bwa_reference_dir = s3://genomon-bucket/_GRCh37/reference/GRCh37
bwa_reference_file = GRCh37.fa
bamsort_option = index=1 level=1 inputthreads=2 outputthreads=2 calmdnm=1 calmdnmrecompindentonly=1
bammarkduplicates_option = markthreads=2 rewritebam=1 rewritebamlevel=1 index=1 md5=1

[sv_parse]
resource = --aws-ec2-instance-type t2.large --disk-size 15
image = genomon/genomon_sv:0.1.0
genomon_sv_parse_option =

[sv_merge]
resource = --aws-ec2-instance-type t2.large --disk-size 15
image = genomon/genomon_sv:0.1.0
genomon_sv_merge_option =

[sv_filt]
resource = --aws-ec2-instance-type t2.large --disk-size 50
image = genomon/genomon_sv:0.1.0
reference = s3://genomon-bucket/_GRCh37/reference/GRCh37/GRCh37.fa
genomon_sv_filt_option =--grc --min_junc_num 2 --max_control_variant_read_pair 10 --min_overhang_size 30
sv_utils_filt_option = --min_tumor_allele_freq 0.07 --max_control_variant_read_pair 1 --control_depth_thres 10 --inversion_size_thres 1000

[star_alignment]
resource = --aws-ec2-instance-type-list t3.2xlarge,m4.2xlarge,t2.2xlarge --disk-size 128
image = genomon/star_alignment
Expand Down
3 changes: 3 additions & 0 deletions example_conf/sample_rna.csv
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
[fastq]
MCF-7,s3://genomon-bucket/sample/rna/tumor/tumor.sequence1.fastq,s3://genomon-bucket/sample/rna/tumor/tumor.sequence2.fastq

[sv_detection]
MCF-7,None,None

[fusion]
MCF-7,None

Expand Down
40 changes: 39 additions & 1 deletion genomon_pipeline_cloud/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,49 @@ def run(args):
# RNA
if args.analysis_type == "rna":

import genomon_pipeline_cloud.tasks.bwa_alignment as bwa_alignment
import genomon_pipeline_cloud.tasks.sv_parse as sv_parse
import genomon_pipeline_cloud.tasks.sv_merge as sv_merge
import genomon_pipeline_cloud.tasks.sv_filt as sv_filt
import genomon_pipeline_cloud.tasks.star_alignment as star_alignment
import genomon_pipeline_cloud.tasks.fusion_count as fusion_count
import genomon_pipeline_cloud.tasks.fusion_merge as fusion_merge
import genomon_pipeline_cloud.tasks.fusionfusion as fusionfusion
import genomon_pipeline_cloud.tasks.genomon_expression as genomon_expression
import genomon_pipeline_cloud.tasks.intron_retention as intron_retention

do_run_sv = sample_conf.sv_detection is not None and \
len(sample_conf.sv_detection) > 0

if do_run_sv:
bwa_alignment_task = \
bwa_alignment.Bwa_alignment(args.output_dir,
tmp_dir,
sample_conf,
param_conf,
run_conf)
sv_parse_task = \
sv_parse.SV_parse(args.output_dir,
tmp_dir,
sample_conf,
param_conf,
run_conf)
sv_merge_task = \
sv_merge.SV_merge(args.output_dir,
tmp_dir,
sample_conf,
param_conf,
run_conf)
sv_filt_task = \
sv_filt.SV_filt(args.output_dir, tmp_dir, sample_conf,
param_conf, run_conf)
p_sv = multiprocessing.Process(target=batch_engine.seq_execute,
args=([bwa_alignment_task,
sv_parse_task,
sv_merge_task,
sv_filt_task],))
p_sv.start()

star_alignment_task = star_alignment.Star_alignment(args.output_dir, tmp_dir, sample_conf, param_conf, run_conf)
fusion_count_task = fusion_count.Fusion_count(args.output_dir, tmp_dir, sample_conf, param_conf, run_conf)
fusion_merge_task = fusion_merge.Fusion_merge(args.output_dir, tmp_dir, sample_conf, param_conf, run_conf)
Expand All @@ -91,7 +127,9 @@ def run(args):
p_fusion.join()
p_expression.join()
p_ir.join()

if do_run_sv:
p_sv.join()


##########
# DNA
Expand Down
27 changes: 19 additions & 8 deletions genomon_pipeline_cloud/sample_conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ class Sample_conf(object):

def __init__(self):

self.bam_file = {}
self.bwa_bam_file = {}
self.star_bam_file = {}

self.fastq = {}
self.bam_tofastq = {}
Expand Down Expand Up @@ -196,9 +197,10 @@ def parse_data(self, _data, output_dir, analysis_type):
"""
self.fastq[sampleID] = [sequence1, sequence2]
if analysis_type == "rna":
self.bam_file[sampleID] = output_dir+"/star/"+sampleID+"/"+sampleID+".Aligned.sortedByCoord.out.bam"
self.star_bam_file[sampleID] = output_dir+"/star/"+sampleID+"/"+sampleID+".Aligned.sortedByCoord.out.bam"
self.bwa_bam_file[sampleID] = output_dir+"/bam/"+sampleID+"/"+sampleID+".markdup.bam"
elif analysis_type == "dna":
self.bam_file[sampleID] = output_dir+"/bam/"+sampleID+"/"+sampleID+".markdup.bam"
self.bwa_bam_file[sampleID] = output_dir+"/bam/"+sampleID+"/"+sampleID+".markdup.bam"

elif mode == 'bam_tofastq':

Expand Down Expand Up @@ -229,9 +231,10 @@ def parse_data(self, _data, output_dir, analysis_type):
"""
self.bam_tofastq[sampleID] = sequences
if analysis_type == "rna":
self.bam_file[sampleID] = output_dir+"/star/"+sampleID+"/"+sampleID+".Aligned.sortedByCoord.out.bam"
self.star_bam_file[sampleID] = output_dir+"/star/"+sampleID+"/"+sampleID+".Aligned.sortedByCoord.out.bam"
self.bwa_bam_file[sampleID] = output_dir+"/bam/"+sampleID+"/"+sampleID+".markdup.bam"
elif analysis_type == "dna":
self.bam_file[sampleID] = output_dir+"/bam/"+sampleID+"/"+sampleID+".markdup.bam"
self.bwa_bam_file[sampleID] = output_dir+"/bam/"+sampleID+"/"+sampleID+".markdup.bam"

elif mode == 'bam_import':

Expand All @@ -247,8 +250,12 @@ def parse_data(self, _data, output_dir, analysis_type):

sampleID_list.append(sampleID)

if len(row) != 2:
err_msg = sampleID + ": only one bam file is allowed"
# Validate the column count of a [bam_import] row before its fields are read.
#   dna: sampleID + one BAM                -> exactly 2 columns
#   rna: sampleID + STAR BAM + BWA BAM     -> exactly 3 columns
# The row is rejected when the count does NOT match. The previous check
# compared with `==`, which raised on every well-formed row and let
# malformed rows through to the row[1] / row[2] lookups that follow.
if analysis_type == 'dna' and len(row) != 2:
    err_msg = sampleID + ": only one bam file is allowed"
    raise ValueError(err_msg)
if analysis_type == 'rna' and len(row) != 3:
    err_msg = sampleID + ": STAR and BWA bam files are required"
    raise ValueError(err_msg)

sequence = row[1]
Expand All @@ -266,7 +273,11 @@ def parse_data(self, _data, output_dir, analysis_type):
"""

self.bam_import[sampleID] = sequence
self.bam_file[sampleID] = sequence
if analysis_type == "rna":
self.bwa_bam_file[sampleID] = row[2]
self.star_bam_file[sampleID] = sequence
elif analysis_type == "dna":
self.bwa_bam_file[sampleID] = sequence


elif mode == 'mutation_call':
Expand Down
2 changes: 1 addition & 1 deletion genomon_pipeline_cloud/tasks/fusion_count.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def task_file_generation(self, output_dir, task_dir, sample_conf, param_conf, ru

for sample in control_sample_li_uniq:

bam = sample_conf.bam_file[sample]
bam = sample_conf.star_bam_file[sample]
bam_dir = os.path.dirname(bam)

hout.write('\t'.join([sample,
Expand Down
2 changes: 1 addition & 1 deletion genomon_pipeline_cloud/tasks/fusionfusion.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def task_file_generation(self, output_dir, task_dir, sample_conf, param_conf, ru

for sample, panel_name in sample_conf.fusion:

bam = sample_conf.bam_file[sample]
bam = sample_conf.star_bam_file[sample]
bam_dir = os.path.dirname(bam)

record = [sample,
Expand Down
2 changes: 1 addition & 1 deletion genomon_pipeline_cloud/tasks/genomon_expression.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def task_file_generation(self, output_dir, task_dir, sample_conf, param_conf, ru
+ "\n")
for sample in sample_conf.expression:

bam = sample_conf.bam_file[sample]
bam = sample_conf.star_bam_file[sample]
bam_dir = os.path.dirname(bam)
bam_file = os.path.basename(bam)

Expand Down
2 changes: 1 addition & 1 deletion genomon_pipeline_cloud/tasks/genomon_qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def task_file_generation(self, output_dir, task_dir, sample_conf, param_conf, ru

for sample in sample_conf.qc:

bam = sample_conf.bam_file[sample]
bam = sample_conf.bwa_bam_file[sample]
bam_dir = os.path.dirname(bam)
bam_file = os.path.basename(bam)

Expand Down
2 changes: 1 addition & 1 deletion genomon_pipeline_cloud/tasks/intron_retention.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def task_file_generation(self, output_dir, task_dir, sample_conf, param_conf, ru

for sample in sample_conf.intron_retention:

bam = sample_conf.bam_file[sample]
bam = sample_conf.star_bam_file[sample]
bam_dir = os.path.dirname(bam)
bam_file = os.path.basename(bam)

Expand Down
4 changes: 2 additions & 2 deletions genomon_pipeline_cloud/tasks/mutation_call.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,14 +59,14 @@ def task_file_generation(self, output_dir, task_dir, sample_conf, param_conf, ru
sample_normal = sample[1] if sample[1] != None else "None"
control_panel = sample[2] if sample[2] != None else "None"

tumor_bam = sample_conf.bam_file[sample_tumor]
tumor_bam = sample_conf.bwa_bam_file[sample_tumor]
tumor_bam_dir = os.path.dirname(tumor_bam)
tumor_bam_file = os.path.basename(tumor_bam)

normal_bam_dir = ""
normal_bam_file = ""
if sample_normal != "None":
normal_bam = sample_conf.bam_file[sample_normal]
normal_bam = sample_conf.bwa_bam_file[sample_normal]
normal_bam_dir = os.path.dirname(normal_bam)
normal_bam_file = os.path.basename(normal_bam)

Expand Down
2 changes: 1 addition & 1 deletion genomon_pipeline_cloud/tasks/paplot.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def to_oneliner_starqc(tag, stage, bam_files, output_suffix):
items = {"star": ["", ""], "fusion": ["", ""], "qc": ["", ""], "sv": ["", ""], "mutation": ["", ""], "signature": ["", ""], "pmsignature": ["", ""]}

if run_conf.analysis_type == "rna":
items["star"] = to_oneliner_starqc("starqc", sample_conf.qc, sample_conf.bam_file, ".Log.final.out")
items["star"] = to_oneliner_starqc("starqc", sample_conf.qc, sample_conf.star_bam_file, ".Log.final.out")
items["fusion"] = to_oneliner("fusion", sample_conf.fusion, output_dir + "/fusion", ".genomonFusion.result.filt.txt")

elif run_conf.analysis_type == "dna":
Expand Down
4 changes: 2 additions & 2 deletions genomon_pipeline_cloud/tasks/sv_filt.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,14 +42,14 @@ def task_file_generation(self, output_dir, task_dir, sample_conf, param_conf, ru

for tumor_sample, normal_sample, control_panel_name in sample_conf.sv_detection:

tumor_bam = sample_conf.bam_file[tumor_sample]
tumor_bam = sample_conf.bwa_bam_file[tumor_sample]
tumor_bam_dir = os.path.dirname(tumor_bam)
tumor_bam_file = os.path.basename(tumor_bam)

normal_bam_dir = ""
normal_bam_file = ""
if normal_sample is not None:
normal_bam = sample_conf.bam_file[normal_sample]
normal_bam = sample_conf.bwa_bam_file[normal_sample]
normal_bam_dir = os.path.dirname(normal_bam)
normal_bam_file = os.path.basename(normal_bam)

Expand Down
2 changes: 1 addition & 1 deletion genomon_pipeline_cloud/tasks/sv_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def task_file_generation(self, output_dir, task_dir, sample_conf, param_conf, ru
for sample_name in sorted(sample_list_for_parse):
if sample_name in list(sample_conf.bam_tofastq.keys()) + list(sample_conf.fastq.keys()) + list(sample_conf.bam_import.keys()):

bam = sample_conf.bam_file[sample_name]
bam = sample_conf.bwa_bam_file[sample_name]
bam_dir = os.path.dirname(bam)
bam_file = os.path.basename(bam)

Expand Down