CPC2 init

gao-lab · Mar 15, 2018 · 481d361 · 481d361
1 parent 528a411
commit 481d361
Show file tree

Hide file tree

Showing 8 changed files with 3,758 additions and 0 deletions.
diff --git a/00README b/00README
@@ -0,0 +1,44 @@
+* 2017-03-6 21:00, Yu-jian Kang
+
+Here is a temporary package of CPC2.
+
+
+1) Pre-requisite:
+
+a. Biopython package: a local version could be downloaded from
+http://biopython.org/wiki/Download
+
+2) Install
+
+a. Unpack the tarball:
+
+tom@linux$ gzip -dc CPC2-beta.tar.gz | tar xf -
+
+b. Build third-part packages: 
+
+tom@linux$ cd CPC2-beta
+tom@linux$ export CPC_HOME="$PWD"
+tom@linux$ cd libs/libsvm
+tom@linux$ gzip -dc libsvm-3.18.tar.gz | tar xf -
+tom@linux$ cd libsvm-3.18
+tom@linux$ make clean && make
+
+3) Run the predict
+
+tom@linux$ cd $CPC_HOME
+tom@linux$ bin/CPC2.py -i (input_seq) -o (result_in_table)
+
+example:
+tom@linux$ bin/CPC2.py -i data/example.fa -o example_output
+
+4) Output result
+
+Result in table format (delimited by tab):
+#ID peptide_length Fickett_score isoelectric_point ORF_integrity coding_probability coding_label
+
+
+ See the website for tutorial and more details. (http://cpc2.cbi.pku.edu.cn)
+
+ This is a beta version of CPC2, if have any questions please report to us.
+
+ Contact: [email protected]
diff --git a/bin/CPC2.py b/bin/CPC2.py
@@ -0,0 +1,315 @@
+#!/usr/bin/env python
+
+import sys
+import os
+import re
+import commands
+import time
+from optparse import OptionParser,OptionGroup
+
+import numpy as np
+from Bio.Seq import Seq
+from Bio.SeqUtils import ProtParam
+
+import seqio
+
+def __main():
+ start_time = time.time()
+ usage = "usage: %prog [options] -i input.fasta -o output_file"
+ description = "Contact: Kang Yujian <[email protected]>"
+ parser = OptionParser(usage,version="%prog 0.1",description = description)
+ Common_group = OptionGroup(parser,"Common Options")
+ Common_group.add_option("-i",dest="fasta",help="input sequence in fasta format [Required]",metavar="FILE",type="string",default=None)
+ Common_group.add_option("-o",dest="outfile",help="output file [Default: cpc2output.txt]",metavar="FILE",type="string",default="cpc2output.txt")
+ Common_group.add_option("-r",dest="reverse",help="also check the reverse strand [Default: FALSE]",action="store_true")
+ parser.add_option_group(Common_group)
+ (options, args) = parser.parse_args()
+ if options.fasta == None:
+ parser.print_help()
+ return -1
+ else:
+ if not os.path.isfile(options.fasta):
+ sys.stderr.write("[ERROR] %s is not a file\n"%options.fasta)
+ return -1
+ if options.reverse:
+ strand = "-"
+ else:
+ strand = "+"
+ if calculate_potential(options.fasta,strand,options.outfile):
+ return 1
+ sys.stderr.write("[INFO] cost time: %ds\n"%(time.time()-start_time))
+ return 0
+
+class FindCDS:
+ '''
+ Find the most like CDS in a given sequence 
+ The most like CDS is the longest ORF found in the sequence
+ When having same length, the upstream ORF is printed
+ modified from source code of CPAT 1.2.1 downloaded from https://sourceforge.net/projects/rna-cpat/files/?source=navbar
+ '''
+ def __init__(self,seq):
+ self.seq = seq
+ self.result = (0,0,0,0,0)
+ self.longest = 0
+ self.basepair = {"A":"T","T":"A","U":"A","C":"G","G":"C","N":"N","X":"X"}
+
+ def _reversecompliment(self):
+ return "".join(self.basepair[base] for base in self.seq)[::-1]
+
+ def get_codons(self,frame_number):
+ '''
+ Record every nucleotide triplet and its coordinate position for input sequence in one frame
+ '''
+ coordinate = frame_number
+ while coordinate + 3 <= len(self.seq):
+ yield (self.seq[coordinate:coordinate+3], coordinate)
+ coordinate += 3 
+
+ def find_longest_in_one(self,myframe,direction,start_codon,stop_codon):
+ '''
+ find the longest ORF in one reading myframe
+ '''
+ triplet_got = self.get_codons(myframe) 
+ starts = start_codon
+ stops = stop_codon
+ '''
+ Extend sequence by triplet after start codon encountered
+ End ORF extension when stop codon encountered
+ '''
+ while True:
+ try: 
+ codon,index = triplet_got.next()
+ except StopIteration:
+ break 
+ if codon in starts and codon not in stops:
+ '''
+ find the ORF start
+ '''
+ orf_start = index
+ end_extension = False
+ while True:
+ try: 
+ codon,index = triplet_got.next()
+ except StopIteration:
+ end_extension = True
+ integrity = -1
+ if codon in stops:
+ integrity = 1
+ end_extension = True
+ if end_extension:
+ orf_end = index + 3
+ Length = (orf_end - orf_start)
+ if Length > self.longest:
+ self.longest = Length
+ self.result = [direction,orf_start,orf_end,Length,integrity]
+ if Length == self.longest and orf_start < self.result[1]:
+ '''
+ if ORFs have same length, return the one that if upstream
+ '''
+ self.result = [direction,orf_start,orf_end,Length,integrity]
+ break
+
+ def longest_orf(self,direction,start_codon={"ATG":None}, stop_codon={"TAG":None,"TAA":None,"TGA":None}):
+ return_orf = ""
+ for frame in range(3):
+ self.find_longest_in_one(frame,"+",start_codon,stop_codon)
+ return_orf = self.seq[self.result[1]:self.result[2]][:]
+ start_coordinate = self.result[1]
+ strand_direction = "+"
+ orf_integrity = self.result[4]
+ '''
+ Also check reverse chain if -r is chosen
+ '''
+ if direction == "-":
+ self.seq = self._reversecompliment()
+ for frame in range(3):
+ self.find_longest_in_one(frame,"-",start_codon,stop_codon)
+ if self.result[0] == "-":
+ return_orf = self.seq[self.result[1]:self.result[2]][:]
+ start_coordinate = self.result[1]
+ strand_direction = "-"
+ orf_integrity = self.result[4]
+ return return_orf,start_coordinate,strand_direction,orf_integrity
+
+
+class Fickett:
+ '''
+ calculate Fickett TESTCODE for full sequence
+ NAR 10(17) 5303-531
+ modified from source code of CPAT 1.2.1 downloaded from https://sourceforge.net/projects/rna-cpat/files/?source=navbar 
+ '''
+ def __init__(self):
+ '''new compiled Fickett look-up table'''
+ self.position_parameter = [1.9,1.8,1.7,1.6,1.5,1.4,1.3,1.2,1.1,0.0]
+ self.content_parameter = [0.33,0.31,0.29,0.27,0.25,0.23,0.21,0.19,0.17,0]
+ self.position_probability = {
+ "A":[0.51,0.55,0.57,0.52,0.48,0.58,0.57,0.54,0.50,0.36],
+ "C":[0.29,0.44,0.55,0.49,0.52,0.60,0.60,0.56,0.51,0.38],
+ "G":[0.62,0.67,0.74,0.65,0.61,0.62,0.52,0.41,0.31,0.17],
+ "T":[0.51,0.60,0.69,0.64,0.62,0.67,0.58,0.48,0.39,0.24],
+ }
+ self.position_weight = {"A":0.062,"C":0.093,"G":0.205,"T":0.154}
+ self.content_probability = {
+ "A":[0.40,0.55,0.58,0.58,0.52,0.48,0.45,0.45,0.38,0.19],
+ "C":[0.50,0.63,0.59,0.50,0.46,0.45,0.47,0.56,0.59,0.33],
+ "G":[0.21,0.40,0.47,0.50,0.52,0.56,0.57,0.52,0.44,0.23],
+ "T":[0.30,0.49,0.56,0.53,0.48,0.48,0.52,0.57,0.60,0.51]
+ }
+ self.content_weight = {"A":0.084,"C":0.076,"G":0.081,"T":0.055}
+
+
+ def look_up_position_probability(self,value, base):
+ '''
+ look up positional probability by base and value
+ '''
+ if float(value) < 0:
+ return None
+ for idx,val in enumerate (self.position_parameter):
+ if (float(value) >= val):
+ return float(self.position_probability[base][idx]) * float(self.position_weight[base])
+
+ def look_up_content_probability(self,value, base):
+ '''
+ look up content probability by base and value
+ '''
+ if float(value) < 0:
+ return None
+ for idx,val in enumerate (self.content_parameter):
+ if (float(value) >= val):
+ return float(self.content_probability[base][idx]) * float(self.content_weight[base])
+
+ def fickett_value(self,dna):
+ '''
+ calculate Fickett value from full RNA transcript sequence
+ '''
+ if len(dna) < 2:
+ return 0
+ fickett_score=0
+ dna=dna
+ total_base = len(dna)
+ A_content = float(dna.count("A"))/total_base
+ C_content = float(dna.count("C"))/total_base
+ G_content = float(dna.count("G"))/total_base
+ T_content = float(dna.count("T"))/total_base
+
+ phase_0 = dna[::3]
+ phase_1 = dna[1::3]
+ phase_2 = dna[2::3]
+
+ phase_0_A = phase_0.count("A")
+ phase_1_A = phase_1.count("A")
+ phase_2_A = phase_2.count("A")
+ phase_0_C = phase_0.count("C")
+ phase_1_C = phase_1.count("C")
+ phase_2_C = phase_2.count("C")
+ phase_0_G = phase_0.count("G")
+ phase_1_G = phase_1.count("G")
+ phase_2_G = phase_2.count("G")
+ phase_0_T = phase_0.count("T")
+ phase_1_T = phase_1.count("T")
+ phase_2_T = phase_2.count("T")
+
+ A_content = float(phase_0_A + phase_1_A + phase_2_A)/total_base
+ C_content = float(phase_0_C + phase_1_C + phase_2_C)/total_base
+ G_content = float(phase_0_G + phase_1_G + phase_2_G)/total_base
+ T_content = float(phase_0_T + phase_1_T + phase_2_T)/total_base
+ A_position= np.max([phase_0_A,phase_1_A,phase_2_A])/(np.min([phase_0_A,phase_1_A,phase_2_A]) +1.0)
+ C_position= np.max([phase_0_C,phase_1_C,phase_2_C])/(np.min([phase_0_C,phase_1_C,phase_2_C]) +1.0)
+ G_position= np.max([phase_0_G,phase_1_G,phase_2_G])/(np.min([phase_0_G,phase_1_G,phase_2_G]) +1.0)
+ T_position= np.max([phase_0_T,phase_1_T,phase_2_T])/(np.min([phase_0_T,phase_1_T,phase_2_T]) +1.0)
+
+ fickett_score += self.look_up_content_probability(A_content,"A")
+ fickett_score += self.look_up_content_probability(C_content,"C")
+ fickett_score += self.look_up_content_probability(G_content,"G")
+ fickett_score += self.look_up_content_probability(T_content,"T")
+
+ fickett_score += self.look_up_position_probability(A_position,"A")
+ fickett_score += self.look_up_position_probability(C_position,"C")
+ fickett_score += self.look_up_position_probability(G_position,"G")
+ fickett_score += self.look_up_position_probability(T_position,"T")
+
+ return fickett_score
+
+#===================
+
+def mRNA_translate(mRNA):
+ return Seq(mRNA).translate()
+
+def protein_param(putative_seqprot):
+ return putative_seqprot.isoelectric_point()
+
+def calculate_potential(fasta,strand,outfile):
+ '''
+ Calculate three features: putative peptide length,pI and Fickett
+ And assess coding potential based on SVM model
+ '''
+ strinfoAmbiguous = re.compile("X|B|Z|J|U",re.I)
+ ptU = re.compile("U",re.I)
+ ftmp_feat = file(outfile + ".feat","w")
+ ftmp_svm = file(outfile + ".tmp.1","w")
+ ftmp_result = file(outfile,"w")
+ ftmp_result.write("\t".join(map(str,["#ID","transcript_length","peptide_length","Fickett_score","pI","ORF_integrity","coding_probability","label"]))+"\n")
+ ftmp_result.close()
+ fickett_obj = Fickett()
+ for seq in seqio.fasta_read(fasta):
+ seqid = seq.id
+ seqRNA = ptU.sub("T",str(seq.seq).strip())
+ '''seqRNA:transcript full sequence'''
+ seqRNA = seqRNA.upper()
+ seqCDS,start_pos,orf_strand,orf_fullness = FindCDS(seqRNA).longest_orf(strand)
+ '''seqCDS:longest ORF'''
+ seqprot = mRNA_translate(seqCDS)
+ pep_len = len(seqprot) #pep_len = len(seqprot.strip("*"))
+ newseqprot = strinfoAmbiguous.sub("",str(seqprot))
+ '''exclude ambiguous amio acid X, B, Z, J, Y in peptide sequence'''
+ fickett_score = fickett_obj.fickett_value(seqRNA)
+ protparam_obj = ProtParam.ProteinAnalysis(str(newseqprot.strip("*")))
+ if pep_len > 0:
+ #fickett_score = fickett_obj.fickett_value(seqCDS)
+ isoelectric_point = protein_param(protparam_obj)
+ else:
+ #fickett_score = 0.0
+ orf_fullness = -1
+ isoelectric_point = 0.0
+ ftmp_feat.write("\t".join(map(str,[seqid,len(seqRNA),pep_len,fickett_score,isoelectric_point,orf_fullness]))+"\n")
+ ftmp_svm.write("".join(map(str,["999"," 1:",pep_len," 2:",fickett_score," 3:",isoelectric_point," 4:",orf_fullness]))+"\n")
+ ftmp_feat.close()
+ ftmp_svm.close()
+ #return 0
+
+ '''
+ calculate the coding probability using LIBSVM
+ '''
+ sys.stderr.write("[INFO] Predicting coding potential, please wait ...\n")
+
+ '''
+ set directories and check depending tools existance
+ '''
+ script_dir,filename = os.path.split(os.path.abspath(sys.argv[0]))
+ data_dir = script_dir + "/../data/"
+ lib_dir = script_dir + "/../libs/"
+ app_svm_scale = lib_dir + "libsvm/libsvm-3.18/svm-scale"
+ app_svm_predict = lib_dir + "libsvm/libsvm-3.18/svm-predict"
+ os.system('test -x '+ app_svm_scale + ' || echo \"[ERROR] No excutable svm-scale on CPC2 path!\" > /dev/stderr')
+ os.system('test -x '+ app_svm_predict + ' || echo \"[ERROR] No excutable svm-predict on CPC2 path!\" > /dev/stderr')
+
+ cmd = app_svm_scale + ' -r ' + data_dir + 'cpc2.range ' + outfile + '.tmp.1 > ' + outfile + '.tmp.2 &&'
+ cmd = cmd + app_svm_predict + ' -b 1 -q ' + outfile + '.tmp.2 ' + data_dir + 'cpc2.model ' + outfile + '.tmp.1 &&'
+ cmd = cmd + 'awk -vOFS="\\t" \'{if ($1 == 1){print $2,"coding"} else if ($1 == 0){print $2,"noncoding"}}\' ' + outfile + '.tmp.1 > ' + outfile + '.tmp.2 &&'
+ cmd = cmd + 'paste ' + outfile + '.feat ' + outfile + '.tmp.2 >>' + outfile
+ (exitstatus, outtext) = commands.getstatusoutput(cmd)
+ os.system('rm -f ' + outfile + '.tmp.1 ' + outfile + '.tmp.2')
+ # subprocess.call("Rscript " + outfile + '.r', shell=True)
+ #except:
+ # pass
+ if exitstatus == 0:
+ rm_cmd = "rm -f " + outfile + '.feat'
+ commands.getstatusoutput(rm_cmd)
+ sys.stderr.write("[INFO] Running Done!\n")
+ return 0
+ else:
+ sys.stderr.write("[ERROR] Prediction error!\n")
+ return -1
+
+if __name__ == "__main__":
+ sys.exit(__main())
diff --git a/bin/compress.py b/bin/compress.py
@@ -0,0 +1,29 @@
+'''
+This module deals with compressed file (.gz or .bz2)
+'''
+
+import gzip
+import bz2
+import sys
+def gz_file(fq_file,mode,level=6):
+ try:
+ if fq_file.endswith("gz"):
+ fq_fp = gzip.open(fq_file,mode+"b",level)
+ else:
+ sys.stderr.write("[INFO] read file '%s'\n"%fq_file)
+ fq_fp = file(fq_file,mode)
+ except:
+ sys.stderr.write("Error: Fail to IO file: %s\n"%(fq_file))
+ sys.exit(1)
+ return fq_fp
+
+
+def bz2file(f):
+ fz = None
+ if f.endswith("bz2"):
+ fz = bz2.BZ2File(f)
+ else:
+ sys.stderr.write("Error: Fail to IO file: %s\n"%(f))
+ sys.exit(1)
+ return fz
+