diff --git a/CMakeLists.txt b/CMakeLists.txt
deleted file mode 100644
index a37f01d..0000000
--- a/CMakeLists.txt
+++ /dev/null
@@ -1,37 +0,0 @@
-
-
-cmake_minimum_required (VERSION 2.6)
-project(TIDDIT)
-
-set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")
-
-add_definitions( -Wno-deprecated )
-
-# set our library and executable destination dirs
-set( EXECUTABLE_OUTPUT_PATH "${CMAKE_SOURCE_DIR}/bin" )
-
-# define compiler flags for all code
-set( CMAKE_BUILD_TYPE Release )
-
-include_directories("${PROJECT_SOURCE_DIR}/src")
-include_directories("${PROJECT_SOURCE_DIR}/lib/")
-include_directories("${PROJECT_SOURCE_DIR}/lib/bamtools/src")
-
-
-# source code
-file(GLOB TIDDIT_FILES
-	${PROJECT_SOURCE_DIR}/src/TIDDIT.cpp
-	${PROJECT_SOURCE_DIR}/src/data_structures/Translocation.cpp
-	${PROJECT_SOURCE_DIR}/src/data_structures/findTranslocationsOnTheFly.cpp
-	${PROJECT_SOURCE_DIR}/src/common.h
-	${PROJECT_SOURCE_DIR}/src/data_structures/CoverageModule.cpp
-)
-
-
-add_subdirectory(lib)
-
-
-# TIDDIT executable
-add_executable(TIDDIT ${TIDDIT_FILES})
-target_link_libraries(TIDDIT ${ZLIB_LIBRARIES})
-target_link_libraries(TIDDIT BamTools)
diff --git a/Dockerfile b/Dockerfile
index 7035d14..b1b5f2e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,39 +1,9 @@
-FROM python:3.8-slim
-
-RUN apt-get update && \
-    apt-get upgrade -y && \
-    apt-get install -y \
-    autoconf \
-    automake \
-    build-essential \
-    cmake \
-    libbz2-dev \
-    libcurl4-gnutls-dev \
-    liblzma-dev \
-    libncurses5-dev \
-    libssl-dev \
-    make \
-    unzip \
-    wget \
-    zlib1g-dev && \
-    apt-get clean && \
-    apt-get purge && \
-    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+FROM condaforge/mambaforge:24.9.2-0
 
 WORKDIR /app
 
-## Install samtools for cram processing
-RUN wget --no-verbose https://github.com/samtools/samtools/releases/download/1.10/samtools-1.10.tar.bz2 && \
-    bunzip2 samtools-1.10.tar.bz2 && \
-    tar -xf samtools-1.10.tar && \
-    cd samtools-1.10 && \
-    ./configure && \
-    make all all-htslib && \
-    make install install-htslib && \
-    rm /app/samtools-1.10.tar
-
 ## Set TIDDIT version
-ARG TIDDIT_VERSION=2.12.1
+ARG TIDDIT_VERSION=3.9.0
 
 ## Add some info
-LABEL base_image="python:3.8-slim"
+LABEL base_image="condaforge/mambaforge:24.9.2-0"
@@ -41,15 +11,17 @@ LABEL software="TIDDIT.py"
 LABEL software.version=${TIDDIT_VERSION}
 
 ## Download and extract
+RUN conda install conda-forge::unzip
+RUN conda install -c conda-forge pip gcc joblib
+RUN conda install -c bioconda numpy cython pysam bwa
+
 RUN wget https://github.com/SciLifeLab/TIDDIT/archive/TIDDIT-${TIDDIT_VERSION}.zip && \
     unzip TIDDIT-${TIDDIT_VERSION}.zip && \
     rm TIDDIT-${TIDDIT_VERSION}.zip
 
 ## Install
-RUN cd TIDDIT-TIDDIT-${TIDDIT_VERSION} && \
-    ./INSTALL.sh && \
-    chmod +x /app/TIDDIT-TIDDIT-${TIDDIT_VERSION}/TIDDIT.py && \
-    ln -s /app/TIDDIT-TIDDIT-${TIDDIT_VERSION}/TIDDIT.py /usr/local/bin
+RUN cd TIDDIT-TIDDIT-${TIDDIT_VERSION} && \
+    pip install -e .
 
-ENTRYPOINT ["TIDDIT.py"]
+ENTRYPOINT ["tiddit"]
 CMD ["--help"]
diff --git a/README.md b/README.md
index d669eab..c60a191 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 DESCRIPTION
 ==============
 TIDDIT: Is a tool to used to identify chromosomal rearrangements using Mate Pair or Paired End sequencing data. TIDDIT identifies intra and inter-chromosomal translocations, deletions, tandem-duplications and inversions, using supplementary alignments as well as discordant pairs.
-TIDDIT searches for discordant reads and splti reads (supplementary alignments). The supplementary alignments are assembled and aligned using a fermikit-like workflow.
+TIDDIT searches for discordant read pairs and split reads (supplementary alignments). TIDDIT also performs local assembly using a custom local de novo assembler. All signals (contigs, split reads, and discordant pairs) are then clustered using DBSCAN. The resulting clusters are filtered, annotated, and reported as SVs based on their statistics.
 TIDDIT has two analysis modules. The sv mode, which is used to search for structural variants. And the cov mode that analyse the read depth of a bam file and generates a coverage report.
 
 On a 30X human genome, the TIDDIT SV module typically completetes within 5 hours, and requires less than 10Gb ram.
@@ -27,11 +27,11 @@
 cd tiddit
 pip install -e .
 ```
-Next install fermi2, ropebwt2, and bwa, I recommend using conda:
+Next install bwa; I recommend using conda:
 
-conda install fermi2 ropebwt2 bwa
+conda install bwa
 
-You may also compile bwa, fermi2, and ropebwt2 yourself. Remember to add executables to path, or provide path through the command line parameters.
+You may also compile bwa yourself. Remember to add the executable to your PATH, or provide its path via the command-line parameters.
 ```
 tiddit --help
 tiddit --sv --help
diff --git a/Singularity b/Singularity
deleted file mode 100644
index bce6b42..0000000
--- a/Singularity
+++ /dev/null
@@ -1,36 +0,0 @@
-BootStrap: debootstrap
-OSVersion: trusty
-MirrorURL: http://us.archive.ubuntu.com/ubuntu/
-
-%environment
-    SHELL=/bin/bash
-    PATH=/opt/anaconda/bin:${PATH}
-
-
-%runscript
-    echo "This is what happens when you run the container..."
-    PATH=/opt/anaconda/bin:${PATH}
-    echo "This is what happens when you run the container..."
-
-%post
-    echo "Hello from inside the container"
-    sed -i 's/$/ universe/' /etc/apt/sources.list
-    apt-get update
-    apt-get upgrade
-    apt-get -y --force-yes install build-essential cmake make zlib1g-dev python python-dev python-setuptools git wget
-
-    cd /root/ && wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
-    cd /root/ && chmod 700 ./Miniconda3-latest-Linux-x86_64.sh
-    cd /root/ && bash ./Miniconda3-latest-Linux-x86_64.sh -b -p /opt/anaconda/
-
-    export PATH=/opt/anaconda/bin:${PATH}
-
-
-    pip install numpy cython
-
-    git clone https://github.com/SciLifeLab/TIDDIT.git
-    mv TIDDIT/* /bin/
-    cd /bin/ && ./INSTALL.sh
-    chmod +x /bin/TIDDIT.py
-
-
diff --git a/setup.py b/setup.py
index e6d23e3..4343255 100644
--- a/setup.py
+++ b/setup.py
@@ -18,6 +18,8 @@
 			"tiddit/tiddit_cluster.pyx",
 			"tiddit/tiddit_coverage_analysis.pyx",
 			"tiddit/tiddit_gc.pyx",
+			"tiddit/silverfish.pyx",
+			"tiddit/graphlib.pyx",
 			"tiddit/tiddit_variant.pyx",
 			"tiddit/tiddit_contig_analysis.pyx"])
 else:
@@ -25,7 +27,7 @@
 
 setup(
 	name = 'tiddit',
-	version = '3.8.0',
+	version = '3.9.0',
 	url = "https://github.com/SciLifeLab/TIDDIT",
diff --git a/tiddit/DBSCAN.py b/tiddit/DBSCAN.py
index 7ac5510..68411d1 100644
--- a/tiddit/DBSCAN.py
+++ b/tiddit/DBSCAN.py
@@ -41,7 +41,8 @@ def x_coordinate_clustering(data,epsilon,m):
 		distances=[]
 		current=data[i,:]
-		points=data[i+1:i+m,:]
+		points=data[i+1:i+m+1,:]
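+		#the m points following point i; the slice end is exclusive, so i+m+1 is needed to include m neighbours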
 		#print points
 		distances=[]
diff --git a/tiddit/__main__.py b/tiddit/__main__.py
index 5b0a5e9..c058686 100644
--- a/tiddit/__main__.py
+++ b/tiddit/__main__.py
@@ -18,7 +18,7 @@
 import tiddit.tiddit_gc as tiddit_gc
 
 def main():
-	version="3.8.0"
+	version="3.9.0"
 	parser = argparse.ArgumentParser("""tiddit-{}""".format(version),add_help=False)
 	parser.add_argument("--sv"  , help="call structural variation", required=False, action="store_true")
 	parser.add_argument("--cov" , help="generate a coverage bed file", required=False, action="store_true")
@@ -27,9 +27,13 @@ def main():
 	if args.sv == True:
 		parser = argparse.ArgumentParser("""tiddit --sv --bam inputfile [-o prefix] --ref ref.fasta""")
+		#required parameters
 		parser.add_argument('--sv' , help="call structural variation", required=False, action="store_true")
 		parser.add_argument('--force_overwrite' , help="force the analysis and overwrite any data in the output folder", required=False, action="store_true")
 		parser.add_argument('--bam', type=str,required=True, help="coordinate sorted bam file(required)")
+		parser.add_argument('--ref', type=str, help="reference fasta",required=True)
+
+		#related to sv calling
 		parser.add_argument('-o', type=str,default="output", help="output prefix(default=output)")
 		parser.add_argument('-i', type=int, help="paired reads maximum allowed insert size. Pairs aligning on the same chr at a distance higher than this are considered candidates for SV (default= 99.9th percentile of insert size)")
 		parser.add_argument('-d', type=str,help="expected reads orientations, possible values \"innie\" (-> <-) or \"outtie\" (<- ->). Default: major orientation within the dataset")
@@ -42,19 +46,28 @@ def main():
 		parser.add_argument('-c', type=float, help="average coverage, overwrites the estimated average coverage (useful for exome or panel data)")
 		parser.add_argument('-l', type=int,default=3, help="min-pts parameter (default=3),must be set >= 2")
 		parser.add_argument('-s', type=int,default=25000000, help="Number of reads to sample when computing library statistics(default=25000000)")
-		parser.add_argument('-z', type=int,default=50, help="minimum variant size (default=50), variants smaller than this will not be printed ( z < 10 is not recomended)")
 		parser.add_argument('--force_ploidy',action="store_true", help="force the ploidy to be set to -n across the entire genome (i.e skip coverage normalisation of chromosomes)")
 		parser.add_argument('--n_mask',type=float,default=0.5, help="exclude regions from coverage calculation if they contain more than this fraction of N (default = 0.5)")
-		parser.add_argument('--ref', type=str, help="reference fasta",required=True)
-		parser.add_argument('--bwa', type=str,default="bwa", help="path to bwa executable file(default=bwa)")
-		parser.add_argument('--fermi2', type=str,default="fermi2", help="path to fermi2 executable file (default=fermi2)")
-		parser.add_argument('--ropebwt2', type=str , default="ropebwt2", help="path to ropebwt2 executable file (default=ropebwt2)")
-		parser.add_argument('--skip_assembly', action="store_true", help="Skip running local assembly, tiddit will perform worse, but wont require fermi2, bwa, ropebwt and bwa indexed ref")
-		#parser.add_argument('--skip_index', action="store_true", help="Do not generate the csi index")
+
+		#stuff related to filtering
 		parser.add_argument('--p_ratio', type=float,default=0.1, help="minimum discordant pair/normal pair ratio at the breakpoint junction(default=0.1)")
 		parser.add_argument('--r_ratio', type=float,default=0.1, help="minimum split read/coverage ratio at the breakpoint junction(default=0.1)")
 		parser.add_argument('--max_coverage', type=float,default=4, help="filter call if X times higher than chromosome average coverage (default=4)")
 		parser.add_argument('--min_contig', type=int,default=10000, help="Skip calling on small contigs (default < 10000 bp)")
+		parser.add_argument('-z', type=int,default=50, help="minimum variant size (default=50), variants smaller than this will not be printed ( z < 10 is not recommended)")
+
+		#assembly related stuff
+		parser.add_argument('--skip_assembly', action="store_true", help="Skip running local assembly, tiddit will perform worse, but won't require bwa or a bwa indexed ref, and will complete quicker")
+		parser.add_argument('--bwa', type=str,default="bwa", help="path to bwa executable file(default=bwa)")
+		parser.add_argument('--min_clip', type=int,default=4, help="Minimum clip reads to initiate local assembly of a region(default=4)")
+		parser.add_argument('--padding', type=int,default=200, help="Extend the local assembly by this number of bases (default=200bp)")
+		parser.add_argument('--min_pts_clips', type=int,default=3, help="min-pts parameter for the clustering of candidates for local assembly (default=3)")
+		parser.add_argument('--max_assembly_reads', type=int,default=10000, help="Skip assembly of regions containing too many reads (default=10000 reads)")
+		parser.add_argument('--max_local_assembly_region', type=int,default=2000, help="maximum size of the clip read cluster for being considered a local assembly candidate (default=2000 bp)")
+		parser.add_argument('--min_anchor_len', type=int,default=60, help="minimum mapped bases to be considered a clip read (default=60 bp)")
+		parser.add_argument('--min_clip_len', type=int,default=25, help="minimum clipped bases to be considered a clip read (default=25 bp)")
+		parser.add_argument('--min_contig_len', type=int,default=200, help="minimum contig length for SV analysis (default=200 bp)")
+		parser.add_argument('-k', type=int,default=91, help="kmer length used by the local assembler (default=91 bp)")
 
 		args= parser.parse_args()
 		if args.l < 2:
@@ -63,15 +76,7 @@ def main():
 
 		if not args.skip_assembly:
 			if not os.path.isfile(args.bwa) and not shutil.which(args.bwa):
-				print("error, BWA executable missing, add BWA to path, or specify using --bwa")
-				quit()
-
-			if not os.path.isfile(args.fermi2) and not shutil.which(args.fermi2):
-				print("error, fermi2 executable missing, add fermi2 to path, or specify using --fermi2")
-				quit()
-
-			if not os.path.isfile(args.ropebwt2) and not shutil.which(args.ropebwt2):
-				print("error, ropebwt2 executable missing, add ropebwt2 to path, or specify using --ropebwt2")
+				print("error, BWA executable missing, add BWA to path, or specify using --bwa, or skip local assembly (--skip_assembly)")
 				quit()
 
 			if not glob.glob("{}*.bwt*".format(args.ref)):
@@ -150,7 +155,7 @@ def main():
 
 	t=time.time()
-	coverage_data=tiddit_signal.main(bam_file_name,args.ref,prefix,min_mapq,max_ins_len,sample_id,args.threads,args.min_contig,False)
+	coverage_data=tiddit_signal.main(bam_file_name,args.ref,prefix,min_mapq,max_ins_len,sample_id,args.threads,args.min_contig,False,args.min_anchor_len,args.min_clip_len)
 	print("extracted signals in:")
 	print(t-time.time())
diff --git a/tiddit/graphlib.pyx b/tiddit/graphlib.pyx
new file mode 100644
index 0000000..d97a66b
--- /dev/null
+++ b/tiddit/graphlib.pyx
@@ -0,0 +1,107 @@
+class graph:
+
+	def __init__(self):
+		self.predecessors={}
+		self.sucessors={}
+		self.kmers={}
+		self.vertices={}
+		self.vertice_set=set([])
+		self.in_branch_points=set([])
+		self.out_branch_points=set([])
+		self.starting_points=set([])
+		self.end_points=set([])
+
+	#add node to the graph
+	def add_kmer(self,kmer,read):
+
+		if not kmer in self.kmers:
+			self.kmers[kmer]=set([])
+			self.predecessors[kmer]=set([])
+			self.sucessors[kmer]=set([])
+			self.starting_points.add(kmer)
+			self.end_points.add(kmer)
+
+		self.kmers[kmer].add(read)
+
+	#add vertices between nodes
+	def add_vertice(self,kmer1,kmer2,read):
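+		#register both k-mers as nodes, then wire the directed edge kmer1->kmer2,
+		#keeping the starting/end point and branch point sets consistent below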
+		self.add_kmer(kmer1,read)
+		self.add_kmer(kmer2,read)
+
+		if not kmer1 in self.vertices:
+			self.vertices[kmer1]={}
+
+		if kmer1 in self.end_points:
+			self.end_points.remove(kmer1)
+
+		if not kmer2 in self.vertices[kmer1]:
+			self.vertices[kmer1][kmer2]=set([])
+
+		if kmer2 in self.starting_points:
+			self.starting_points.remove(kmer2)
+
+		self.vertices[kmer1][kmer2].add(read)
+
+		self.vertice_set.add((kmer1,kmer2))
+
+		self.sucessors[kmer1].add(kmer2)
+		if len(self.sucessors[kmer1]) > 1:
+			self.in_branch_points.add(kmer1)
+
+		self.predecessors[kmer2].add(kmer1)
+		if len(self.predecessors[kmer2]) > 1:
+			self.out_branch_points.add(kmer2)
+
+	def delete_vertice(self,kmer1,kmer2):
+
+		if kmer1 in self.vertices:
+			if kmer2 in self.vertices[kmer1]:
+				del self.vertices[kmer1][kmer2]
+
+			if len(self.vertices[kmer1]) == 0:
+				del self.vertices[kmer1]
+
+		if kmer1 in self.in_branch_points:
+			if len(self.sucessors[kmer1]) < 3:
+				self.in_branch_points.remove(kmer1)
+
+		if kmer1 in self.sucessors:
+			if kmer2 in self.sucessors[kmer1]:
+				self.sucessors[kmer1].remove(kmer2)
+
+			if not self.sucessors[kmer1]:
+				self.end_points.add(kmer1)
+
+		if kmer2 in self.out_branch_points:
+			if len(self.predecessors[kmer2]) < 3:
+				self.out_branch_points.remove(kmer2)
+
+		if kmer1 in self.predecessors[kmer2]:
+			self.predecessors[kmer2].remove(kmer1)
+
+		if not len(self.predecessors[kmer2]):
+			self.starting_points.add(kmer2)
+
+		if (kmer1,kmer2) in self.vertice_set:
+			self.vertice_set.remove((kmer1,kmer2))
+
+	def delete_kmer(self,kmer):
+		if kmer in self.kmers:
+			del self.kmers[kmer]
+
+		sucessors=set([])
+		for sucessor in self.sucessors[kmer]:
+			sucessors.add(sucessor)
+
+		predecessors=set([])
+		for predecessor in self.predecessors[kmer]:
+			predecessors.add(predecessor)
+
+		for predecessor in predecessors:
+			self.delete_vertice(predecessor,kmer)
+
+		for sucessor in sucessors:
+			self.delete_vertice(kmer,sucessor)
diff --git a/tiddit/silverfish.pyx b/tiddit/silverfish.pyx
new file mode 100644
index 0000000..7aab2fa
--- /dev/null
+++ b/tiddit/silverfish.pyx
@@ -0,0 +1,239 @@
+import time
+import sys
+import copy
+
+import tiddit.graphlib as graphlib
+
+def build_kmer_hist(seq,kmer_hist,k):
+	seq_len=len(seq)
+	kmers=[]
+	for i in range(0,seq_len-k+1):
+
+		kmer=seq[i:i+k]
+		if len(kmer) < k:
+			break
+		kmers.append(kmer)
+
+		if not kmer in kmer_hist:
+			kmer_hist[kmer]=0
+
+		kmer_hist[kmer]+=1
+
+	return(kmers,kmer_hist)
+
+def find_chain(graph,start,ends):
+	chain=[start]
+	current_node=start
+	if start in ends:
+		return(chain)
+
+	while True:
+		sucessors=graph.sucessors[current_node]
+
+		if not sucessors or len(sucessors) > 1:
+			return(chain)
+
+		current_node=list(sucessors)[0]
+		if current_node == start:
+			return(chain)
+
+		chain.append(current_node)
+		if current_node in ends:
+			return(chain)
+
+def drop_kmers(graph,min_support):
+	kmers=list(graph.kmers.keys())
+	for kmer in kmers:
+		if len(graph.kmers[kmer]) < min_support:
+			graph.delete_kmer(kmer)
+	return(graph)
+
+def trim_edges(graph,min_weight):
+	edge_list=list(graph.vertice_set)
+	for edge in edge_list:
+		if len(graph.vertices[edge[0]][edge[1]]) < min_weight:
+			graph.delete_vertice(edge[0],edge[1])
+	return(graph)
+
+def remove_tips(graph,min_tip_length):
+	branch_start=graph.in_branch_points
+	branch_end=graph.out_branch_points
+	starting_point=graph.starting_points
+
+	switches=branch_end.union(branch_start)
+	for start in starting_point.union(branch_start,branch_end):
+		chains=[]
+		for node in graph.sucessors[start]:
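+			#walk the unbranched chain from each start/branch node; short chains that
+			#dead-end (likely tips caused by sequencing errors) are deleted below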
+			chains.append([start]+find_chain(graph,node,switches))
+
+		for chain in chains:
+			if len(chain) < min_tip_length and chain[-1] in graph.end_points:
+				for node in chain:
+					graph.delete_kmer(node)
+
+	return(graph)
+
+
+def chain_typer(chain,graph):
+
+	if chain[0] in graph.starting_points:
+		return("starting_point")
+	elif chain[-1] in graph.end_points:
+		return("end_point")
+
+	elif chain[0] in graph.in_branch_points:
+		if chain[-1] in graph.out_branch_points:
+			return("in_out")
+		elif chain[-1] in graph.in_branch_points:
+			return("in_in")
+
+	elif chain[0] in graph.out_branch_points:
+		if chain[-1] in graph.out_branch_points:
+			return("out_out")
+
+		elif chain[-1] in graph.in_branch_points:
+			return("out_in")
+
+	return("unknown")
+
+def forward_scaffold(scaffold,chains,graph,chain_numbers):
+	results=[]
+
+	for i in range(0,len(chains)):
+		if i in chain_numbers:
+			continue
+
+		if chains[i][0][0] == chains[scaffold][0][-1]:
+			r=forward_scaffold(i,chains,graph,set([i]) | chain_numbers )
+
+			for j in range(0,len(r)):
+				results.append( [ chains[scaffold][0]+r[j][0][1:],r[j][1] | set([scaffold]) ] )
+
+	if not results:
+		results=[ [chains[scaffold][0], set([scaffold]) ] ]
+
+	return(results)
+
+def backward_scaffold(scaffold,chains,graph,chain_numbers):
+	results=[]
+
+	for i in range(0,len(chains)):
+		if i in chain_numbers:
+			continue
+
+		if chains[i][0][-1] == chains[scaffold][0][0]:
+			r=backward_scaffold(i,chains,graph,set([i]) | chain_numbers )
+
+			for j in range(0,len(r)):
+				results.append( [ r[j][0]+chains[scaffold][0][1:],r[j][1] | set([scaffold]) ] )
+
+	if not results:
+		results=[ [chains[scaffold][0], set([scaffold]) ] ]
+
+	return(results)
+
+def main(reads,k,min_support):
+
+	time_all=time.time()
+
+	kmers={}
+	time_kmerize=time.time()
+	graph = graphlib.graph()
+
+	kmer_hist={}
+	for read in reads:
+		if len(reads[read]) < k:
+			continue
+
+		read_kmers,kmer_hist=build_kmer_hist(reads[read],kmer_hist,k)
+		kmers[read]=read_kmers
+
+	for read in kmers:
+		if len(reads[read]) < k+1:
+			continue
+
+
+		for i in range(1,len(kmers[read])):
+
+			if kmer_hist[kmers[read][i-1]] < min_support and kmer_hist[kmers[read][i]] < min_support:
+				continue
+
+			if kmer_hist[kmers[read][i]] < min_support and kmer_hist[kmers[read][i-1]] >= min_support:
+				graph.add_kmer(kmers[read][i-1],read)
+
+			elif kmer_hist[kmers[read][i]] >= min_support and kmer_hist[kmers[read][i-1]] < min_support:
+				graph.add_kmer(kmers[read][i],read)
+
+			if kmer_hist[kmers[read][i]] >= min_support and kmer_hist[kmers[read][i-1]] >= min_support:
+				graph.add_vertice(kmers[read][i-1],kmers[read][i],read)
+
+	if not reads:
+		return([])
+
+	graph=drop_kmers(graph,min_support)
+	graph=trim_edges(graph,min_support)
+	graph=remove_tips(graph,10)
+
+	branch_start=graph.in_branch_points
+	branch_end=graph.out_branch_points
+	starting_point=graph.starting_points
+
+	chains=[]
+	switches=branch_end.union(branch_start)
+	for start in starting_point.union(branch_start,branch_end):
+		for node in graph.sucessors[start]:
+
+			chain=[start]+find_chain(graph,node,switches)
+			chain_type=chain_typer(chain,graph)
+			chains.append([chain,chain_type])
+
+	scaffolds=[]
+	for i in range(0,len(chains)):
+		chain=chains[i][0]
+		start=chain[0]
+		end=chain[-1]
+
+		scaffold=[]
+
+		if chains[i][1] == "end_point":
+			results=backward_scaffold(i,chains,graph,set([i]) )
+			scaffolds+=results
+
+		elif chains[i][1] == "starting_point":
+			results=forward_scaffold(i,chains,graph,set([i]) )
+			scaffolds+=results
+		else:
+			forward=forward_scaffold(i,chains,graph,set([i]) )
+			for forward_result in forward:
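+				#combine each forward extension with every backward extension into one scaffold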
+				backward_result=backward_scaffold(i,chains,graph,forward_result[1] )
+				for result in backward_result:
+					scaffolds.append([ result[0]+forward_result[0][len(chains[i][0])-1:],forward_result[1] | result[1]])
+
+	results=[]
+	for i in range(0,len(scaffolds)):
+
+		skip=False
+		for j in range(0,len(scaffolds)):
+			if j <= i:
+				continue
+			if not len(scaffolds[i][-1]-scaffolds[j][-1]):
+				skip=True
+
+		if skip:
+			continue
+
+		scaffolded_chains=list(map(str,scaffolds[i][-1]))
+
+		out=[]
+		for j in range(1,len(scaffolds[i][0])):
+			out.append(scaffolds[i][0][j][-1])
+
+		results.append(scaffolds[i][0][0]+"".join(out))
+	return(results)
+
+min_branch_length=2
+min_overlap=0.2
+max_overlap=0.8
diff --git a/tiddit/tiddit_cluster.pyx b/tiddit/tiddit_cluster.pyx
index aa80d40..e1dedda 100644
--- a/tiddit/tiddit_cluster.pyx
+++ b/tiddit/tiddit_cluster.pyx
@@ -150,14 +150,12 @@ def main(prefix,chromosomes,contig_length,samples,is_mp,epsilon,m,max_ins_len,mi
 			candidates[chrA][chrB]={}
 
 		positions[chrA][chrB]=numpy.array(sorted(positions[chrA][chrB],key=lambda l:l[0]))
-		#print(positions[:,[0,1]])
 		clusters=DBSCAN.main(positions[chrA][chrB],epsilon,m)
 		positions[chrA][chrB]=list(positions[chrA][chrB])
 
 		cluster_pos=[]
 		for i in range(0,len(positions[chrA][chrB])):
 			cluster_pos.append(list(positions[chrA][chrB][i])+[clusters[i]] )
-			#positions[i].append(clusters[i])
 
 		cluster_pos= sorted(cluster_pos,key=lambda l:l[2])
 		n_ctg_clusters=0
diff --git a/tiddit/tiddit_contig_analysis.pyx b/tiddit/tiddit_contig_analysis.pyx
index ae31441..de5815e 100644
--- a/tiddit/tiddit_contig_analysis.pyx
+++ b/tiddit/tiddit_contig_analysis.pyx
@@ -1,9 +1,17 @@
 import pysam
+from subprocess import Popen, PIPE, DEVNULL
+from joblib import Parallel, delayed
+
+from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment
+
 import numpy
 import math
 import os
+import time
+
 import tiddit.DBSCAN as DBSCAN
+import tiddit.silverfish as silverfish
 import tiddit.tiddit_signal as tiddit_signal
@@ -68,15 +76,77 @@ def read_contigs(aligned_contigs,prefix,sample_id,min_size):
 					current_bp+=read.cigartuples[i][1]
 
 	f=open("{}_tiddit/contigs_{}.tab".format(prefix,sample_id),"w")
+	positions=set([])
 	for chrA in split_contigs:
 		for chrB in split_contigs[chrA]:
 			for fragment in split_contigs[chrA][chrB]:
+				p=(chrA,chrB,split_contigs[chrA][chrB][fragment][0],split_contigs[chrA][chrB][fragment][2])
+				if p in positions:
+					continue
+
 				f.write("{}\t{}\t{}\t{}\n".format(fragment,chrA,chrB,"\t".join(map(str, split_contigs[chrA][chrB][fragment] ))) )
+				positions.add(p)
 	f.close()
 
+def local_assembly(args,sample_id,prefix,regions,chr):
+
+	if os.path.isfile("{}_tiddit/clips/clips.fa.assembly.{}.clean.mag".format(prefix,chr)):
+		os.remove("{}_tiddit/clips/clips.fa.assembly.{}.clean.mag".format(prefix,chr))
+
+	cdef AlignmentFile samfile = AlignmentFile(args.bam, "r",reference_filename=args.ref,index_filename="{}_tiddit/{}.csi".format(args.o,sample_id))
+	mag=open( "{}_tiddit/clips/clips.fa.assembly.{}.clean.mag".format(prefix,chr) ,"w")
+	contig=1
+	for region in regions[chr]:
+
+		n_reads=0
+		proper=0
+		low_mapq=0
+
+		if region[2]-region[1] > args.max_local_assembly_region:
+			continue
+
+		reads={}
+		for read in samfile.fetch(region[0], region[1], region[2]):
+			if read.is_supplementary or read.is_duplicate or read.is_secondary:
+				continue
+			n_reads+=1
+			if read.mapq < 10:
+				low_mapq+=1
+			if read.is_proper_pair:
+				proper+=1
+
+			reads[str(n_reads)]=read.query_sequence
+			if n_reads > 50000:
+				break
+
+		if n_reads == 0:
+			continue
+
+		if n_reads > args.max_assembly_reads:
+			continue
+
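+		#skip noisy regions: many low-mapq or improperly paired reads likely
+		#indicate a repeat rather than a clean SV breakpoint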
+		if low_mapq/n_reads > 0.25 or proper/n_reads < 0.75:
+			continue
+
+		results=silverfish.main(reads,args.k,args.min_clip)
+		del reads
+
+		for result in results:
+			if len(result) > args.min_contig_len:
+				mag.write(f">{chr}_{region[1]}_{region[2]}_{contig}\n")
+				mag.write(result+"\n")
+				contig+=1
+
+	mag.close()
+	return( "{}_tiddit/clips/clips.fa.assembly.{}.clean.mag".format(prefix,chr) )
+
 def main(prefix,sample_id,library,contigs,coverage_data,args):
+
 	clips={}
 	c=[]
@@ -95,10 +165,20 @@ def main(prefix,sample_id,library,contigs,coverage_data,args):
 			clips[chr][1].append([pos,0])
 			c=[]
 
-	f=open("{}_tiddit/clips.fa".format(prefix),"w")
+	regions={}
+
+
+	assembly_l=args.min_pts_clips
+
 	for chr in clips:
-
-		clusters,cluster_id = DBSCAN.x_coordinate_clustering(numpy.array(clips[chr][1]),50,args.l)
+		regions[chr]=[]
+
+
+		l=assembly_l
+		if library[ "avg_coverage_{}".format(chr) ]/library["avg_coverage"] > 5:
+			l=args.l*int(round(library[ "avg_coverage_{}".format(chr) ]/library["avg_coverage"]/2.0))
+
+		clusters,cluster_id = DBSCAN.x_coordinate_clustering(numpy.array(clips[chr][1]),50,l)
 
 		cluster_stats={}
 		for i in range(0,len(clusters)):
@@ -109,27 +189,31 @@ def main(prefix,sample_id,library,contigs,coverage_data,args):
 			cluster_stats[clusters[i]][0]+=1
 			cluster_stats[clusters[i]][1].append( clips[chr][1][i][0] )
 
-		for i in range(0,len(clusters)):
-			if clusters[i] == -1:
-				continue
-			if cluster_stats[clusters[i]][0] < args.r:
-				continue
+		for cluster in cluster_stats:
 
-			if cluster_stats[clusters[i]][0] > 2*library[ "avg_coverage_{}".format(chr) ]:
+			if cluster_stats[cluster][0] < args.min_clip:
 				continue
 
-			clip_coverage=coverage_data[chr][ int(math.floor(clips[chr][1][i][0]/50.0)) ]
-			if clip_coverage > args.max_coverage/2*library[ "avg_coverage_{}".format(chr) ]:
+			clip_coverage=max(coverage_data[chr][ int(math.floor(min(cluster_stats[cluster][1])/50.0)):int(math.floor(max(cluster_stats[cluster][1])/50.0))+1 ])
+
+			if clip_coverage/library[ "avg_coverage_{}".format(chr) ] > args.max_coverage:
 				continue
 
-			f.write( clips[chr][0][i].strip() +"\n")
+			regions[chr].append([chr,min(cluster_stats[cluster][1] )-args.padding,max(cluster_stats[cluster][1])+args.padding])
+
+			if regions[chr][-1][1] < 1:
+				regions[chr][-1][1]=1
 
-	f.close()
 	del clips
 
-	os.system("{} -dNCr {}_tiddit/clips.fa | {} assemble -t {} -l 81 - > {}_tiddit/clips.fa.assembly.mag".format(args.ropebwt2,prefix,args.fermi2,args.threads,prefix))
-	os.system("{} simplify -COS -d 0.8 {}_tiddit/clips.fa.assembly.mag 1> {}_tiddit/clips.fa.assembly.clean.mag 2> /dev/null".format(args.fermi2,prefix,prefix))
+	contigs=Parallel(n_jobs=args.threads,timeout=99999)( delayed(local_assembly)(args,sample_id,prefix,regions,chr) for chr in regions)
+
+	mag=open(f"{prefix}_tiddit/clips.fa.assembly.clean.mag","w")
+	for contig in contigs:
+		for line in open(contig):
+			mag.write(line.rstrip()+"\n")
+	mag.close()
 
 	os.system("{} mem -t {} -x intractg {} {}_tiddit/clips.fa.assembly.clean.mag 1> {}_tiddit/clips.sam 2> /dev/null".format(args.bwa,args.threads,args.ref,prefix,prefix))
 	read_contigs("{}_tiddit/clips.sam".format(prefix) , prefix, sample_id, args.z)
-
diff --git a/tiddit/tiddit_signal.pyx b/tiddit/tiddit_signal.pyx
index d6b82c1..fa631b4 100644
--- a/tiddit/tiddit_signal.pyx
+++ b/tiddit/tiddit_signal.pyx
@@ -144,7 +144,9 @@ def SA_analysis(read,min_q,tag,reference_name):
 
 	return(split)
 
-def worker(str chromosome, str bam_file_name,str ref,str prefix,int min_q,int max_ins,str sample_id, int bin_size,skip_index):
+def worker(str chromosome, str bam_file_name,str ref,str prefix,int min_q,int max_ins,str sample_id, int bin_size,skip_index,int min_anchor_len,int min_clip_len):
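+	#one worker per chromosome: collects discordant pairs, split reads (SA tag),
+	#soft-clipped reads (local assembly candidates) and binned coverage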
 	print("Collecting signals on contig: {}".format(chromosome))
 	bam_index="{}_tiddit/{}.csi".format(prefix,sample_id)
@@ -184,17 +186,18 @@ def worker(str chromosome, str bam_file_name,str ref,str prefix,int min_q,int ma
 		if read_supplementary or read.is_secondary:
 			continue
 
-		if read_mapq > 1:
+
+		if read_mapq < min_q:
+			continue
+
+		if ( abs(read.isize) < max_ins and mate_chromosome == read_chromosome ):
 			cigar_tuple=read.cigartuples
 
-			if (cigar_tuple[0][0] == 4 and cigar_tuple[0][1] > 10) and (cigar_tuple[-1][0] == 0 and cigar_tuple[-1][1] > 30) and len(cigar_tuple) < 7:
+			if (cigar_tuple[0][0] == 4 and cigar_tuple[0][1] > min_clip_len) and (cigar_tuple[-1][0] == 0 and cigar_tuple[-1][1] > min_anchor_len):
 				clips.append([">{}|{}|{}\n".format(read.query_name,read_chromosome,read_position+1),read.query_sequence+"\n"])
-			elif cigar_tuple[-1][0] == 4 and cigar_tuple[-1][1] > 10 and (cigar_tuple[0][0] == 0 and cigar_tuple[0][1] > 30) and len(cigar_tuple) < 7:
+			elif cigar_tuple[-1][0] == 4 and cigar_tuple[-1][1] > min_clip_len and (cigar_tuple[0][0] == 0 and cigar_tuple[0][1] > min_anchor_len):
 				clips.append([">{}|{}|{}\n".format(read.query_name,read_chromosome,read_position+1),read.query_sequence+"\n"])
 
-		if read_mapq < min_q:
-			continue
-
 		if read.has_tag("SA"):
 			split=SA_analysis(read,min_q,"SA",read_chromosome)
 			if split:
@@ -226,14 +229,14 @@ def worker(str chromosome, str bam_file_name,str ref,str prefix,int min_q,int ma
 
 	return(chromosome,data,splits,coverage_data, "{}_tiddit/clips/{}.fa".format(prefix,chromosome) )
 
-def main(str bam_file_name,str ref,str prefix,int min_q,int max_ins,str sample_id, int threads, int min_contig,skip_index):
+def main(str bam_file_name,str ref,str prefix,int min_q,int max_ins,str sample_id, int threads, int min_contig,skip_index,int min_anchor_len,int min_clip_len):
 	cdef AlignmentFile samfile = pysam.AlignmentFile(bam_file_name, "r",reference_filename=ref)
 	bam_header=samfile.header
 	samfile.close()
 
 	cdef int bin_size=50
-	cdef str file_type="wig"
-	cdef str outfile=prefix+".tiddit_coverage.wig"
+	file_type="wig"
+	outfile=prefix+".tiddit_coverage.wig"
 
 	cdef long t_tot=0
@@ -255,7 +258,7 @@ def main(str bam_file_name,str ref,str prefix,int min_q,int max_ins,str sample_i
 			splits[chrA["SN"]][chrB["SN"]]={}
 
 	t=time.time()
-	res=Parallel(n_jobs=threads,timeout=99999)( delayed(worker)(chromosome,bam_file_name,ref,prefix,min_q,max_ins,sample_id,bin_size,skip_index) for chromosome in chromosomes )
+	res=Parallel(n_jobs=threads)( delayed(worker)(chromosome,bam_file_name,ref,prefix,min_q,max_ins,sample_id,bin_size,skip_index,min_anchor_len,min_clip_len) for chromosome in chromosomes )
 
 	chromosomes=set(chromosomes)
 	for i in range(0,len(res)):
diff --git a/tiddit/tiddit_variant.pyx b/tiddit/tiddit_variant.pyx
index d70be7e..1f00bfc 100644
--- a/tiddit/tiddit_variant.pyx
+++ b/tiddit/tiddit_variant.pyx
@@ -367,8 +367,9 @@ def define_variant(str chrA, str bam_file_name,dict sv_clusters,args,dict librar
 
 		if n_contigs:
 			for c in sv_clusters[chrA][chrB][cluster]["contigs"]:
-				if "_" in c:
-					c=c.split("_")[0]
+				if "_d_" in c:
+					c=c.split("_d_")[0]
+
 				ctgs=[ contig_seqs[c] ]
 
 			info+=["CTG={}".format("|".join(ctgs) )]
@@ -451,9 +452,10 @@ def define_variant(str chrA, str bam_file_name,dict sv_clusters,args,dict librar
 
 		if n_contigs:
 			for c in sv_clusters[chrA][chrB][cluster]["contigs"]:
-				if "_" in c:
-					c=c.split("_")[0]
-
+				if "_d_" in c:
+					c=c.split("_d_")[0]
+
 				ctgs=[ contig_seqs[c] ]
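+			#report the supporting assembled contig sequence(s) in the CTG INFO field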
 
 			info+=["CTG={}".format("|".join(ctgs) )]
@@ -538,13 +540,11 @@ def main(str bam_file_name,dict sv_clusters,args,dict library,int min_mapq,sampl
 
 	if not args.skip_assembly:
 		for line in open("{}_tiddit/clips.fa.assembly.clean.mag".format(args.o)):
-			if not new_seq and line[0] == "@" and "\t" in line:
-				name=line.split("\t")[0][1:]
-				new_seq=True
-
-			elif new_seq:
+			if line[0] == ">":
+				name=line[1:].rstrip()
+			else:
 				contig_seqs[name]=line.strip("\n")
-				new_seq=False
+
 	variants={}
 	for chrA in sv_clusters:
 		variants[chrA]=[]