-
Notifications
You must be signed in to change notification settings - Fork 12
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
8 changed files
with
3,758 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
* 2017-03-6 21:00, Yu-jian Kang | ||
|
||
Here is a temporary package of CPC2. | ||
|
||
|
||
1) Pre-requisite: | ||
|
||
a. Biopython package: a local version can be downloaded from
http://biopython.org/wiki/Download | ||
|
||
2) Install | ||
|
||
a. Unpack the tarball: | ||
|
||
tom@linux$ gzip -dc CPC2-beta.tar.gz | tar xf - | ||
|
||
b. Build third-party packages:
|
||
tom@linux$ cd CPC2-beta | ||
tom@linux$ export CPC_HOME="$PWD" | ||
tom@linux$ cd libs/libsvm | ||
tom@linux$ gzip -dc libsvm-3.18.tar.gz | tar xf - | ||
tom@linux$ cd libsvm-3.18 | ||
tom@linux$ make clean && make | ||
|
||
3) Run the prediction
|
||
tom@linux$ cd $CPC_HOME | ||
tom@linux$ bin/CPC2.py -i (input_seq) -o (result_in_table) | ||
|
||
example: | ||
tom@linux$ bin/CPC2.py -i data/example.fa -o example_output | ||
|
||
4) Output result | ||
|
||
Result in table format (delimited by tab): | ||
#ID peptide_length Fickett_score isoelectric_point ORF_integrity coding_probability coding_label | ||
|
||
|
||
See the website for tutorial and more details. (http://cpc2.cbi.pku.edu.cn) | ||
|
||
This is a beta version of CPC2; if you have any questions, please report them to us.
|
||
Contact: [email protected] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,315 @@ | ||
#!/usr/bin/env python | ||
|
||
import sys | ||
import os | ||
import re | ||
import commands | ||
import time | ||
from optparse import OptionParser,OptionGroup | ||
|
||
import numpy as np | ||
from Bio.Seq import Seq | ||
from Bio.SeqUtils import ProtParam | ||
|
||
import seqio | ||
|
||
def __main():
    '''
    Command-line entry point: parse options and run the coding-potential
    calculation.

    Returns 0 on success, -1 on bad arguments or a missing input file,
    1 when calculate_potential reports an error.
    '''
    start_time = time.time()
    usage = "usage: %prog [options] -i input.fasta -o output_file"
    description = "Contact: Kang Yujian <[email protected]>"
    parser = OptionParser(usage,version="%prog 0.1",description = description)
    Common_group = OptionGroup(parser,"Common Options")
    Common_group.add_option("-i",dest="fasta",help="input sequence in fasta format [Required]",metavar="FILE",type="string",default=None)
    Common_group.add_option("-o",dest="outfile",help="output file [Default: cpc2output.txt]",metavar="FILE",type="string",default="cpc2output.txt")
    Common_group.add_option("-r",dest="reverse",help="also check the reverse strand [Default: FALSE]",action="store_true")
    parser.add_option_group(Common_group)
    (options, args) = parser.parse_args()
    # use identity comparison with None (PEP 8) instead of '== None'
    if options.fasta is None:
        parser.print_help()
        return -1
    if not os.path.isfile(options.fasta):
        sys.stderr.write("[ERROR] %s is not a file\n"%options.fasta)
        return -1
    if options.reverse:
        strand = "-"
    else:
        strand = "+"
    # calculate_potential returns 0 on success, -1 (truthy) on failure
    if calculate_potential(options.fasta,strand,options.outfile):
        return 1
    sys.stderr.write("[INFO] cost time: %ds\n"%(time.time()-start_time))
    return 0
|
||
class FindCDS:
    '''
    Find the most likely CDS in a given sequence.

    The most likely CDS is the longest ORF found in the sequence;
    when two ORFs have the same length, the upstream one is reported.
    Modified from source code of CPAT 1.2.1 downloaded from
    https://sourceforge.net/projects/rna-cpat/files/?source=navbar
    '''
    def __init__(self,seq):
        self.seq = seq
        # result: (direction, orf_start, orf_end, length, integrity)
        # integrity: 1 = ends on a stop codon, -1 = runs off the end
        self.result = (0,0,0,0,0)
        self.longest = 0
        self.basepair = {"A":"T","T":"A","U":"A","C":"G","G":"C","N":"N","X":"X"}

    def _reversecompliment(self):
        # Reverse complement of self.seq.  The misspelled name
        # ("compliment") is kept for backward compatibility.
        return "".join(self.basepair[base] for base in self.seq)[::-1]

    def get_codons(self,frame_number):
        '''
        Yield (triplet, coordinate) pairs for the reading frame that
        starts at offset frame_number (0, 1 or 2).
        '''
        coordinate = frame_number
        while coordinate + 3 <= len(self.seq):
            yield (self.seq[coordinate:coordinate+3], coordinate)
            coordinate += 3

    def find_longest_in_one(self,myframe,direction,start_codon,stop_codon):
        '''
        Find the longest ORF in one reading frame; record it in
        self.result / self.longest when it beats the current best.
        '''
        triplet_got = self.get_codons(myframe)
        starts = start_codon
        stops = stop_codon
        # Extend the ORF by one triplet after a start codon is found;
        # stop extending at a stop codon or at the end of the sequence.
        while True:
            try:
                # builtin next() instead of the Python-2-only .next()
                # method, so the class also works on Python 3
                codon,index = next(triplet_got)
            except StopIteration:
                break
            if codon in starts and codon not in stops:
                # ORF start found
                orf_start = index
                end_extension = False
                while True:
                    try:
                        codon,index = next(triplet_got)
                    except StopIteration:
                        end_extension = True
                        integrity = -1   # ran off the end: incomplete ORF
                    if codon in stops:
                        integrity = 1    # proper stop codon: complete ORF
                        end_extension = True
                    if end_extension:
                        orf_end = index + 3
                        Length = (orf_end - orf_start)
                        if Length > self.longest:
                            self.longest = Length
                            self.result = [direction,orf_start,orf_end,Length,integrity]
                        if Length == self.longest and orf_start < self.result[1]:
                            # same length: prefer the upstream ORF
                            self.result = [direction,orf_start,orf_end,Length,integrity]
                        break

    def longest_orf(self,direction,start_codon={"ATG":None}, stop_codon={"TAG":None,"TAA":None,"TGA":None}):
        '''
        Return (orf_seq, start_coordinate, strand, integrity) for the
        longest ORF.  The reverse strand is also scanned when
        direction == "-".  The default codon dicts are never mutated.
        '''
        return_orf = ""
        for frame in range(3):
            self.find_longest_in_one(frame,"+",start_codon,stop_codon)
        return_orf = self.seq[self.result[1]:self.result[2]][:]
        start_coordinate = self.result[1]
        strand_direction = "+"
        orf_integrity = self.result[4]
        if direction == "-":
            # also check the reverse strand when -r is chosen
            self.seq = self._reversecompliment()
            for frame in range(3):
                self.find_longest_in_one(frame,"-",start_codon,stop_codon)
            if self.result[0] == "-":
                return_orf = self.seq[self.result[1]:self.result[2]][:]
                start_coordinate = self.result[1]
                strand_direction = "-"
                orf_integrity = self.result[4]
        return return_orf,start_coordinate,strand_direction,orf_integrity
|
||
|
||
class Fickett:
    '''
    Calculate the Fickett TESTCODE statistic for a full transcript.

    Reference: Fickett, Nucleic Acids Research 10(17):5303-5318.
    Modified from source code of CPAT 1.2.1 downloaded from
    https://sourceforge.net/projects/rna-cpat/files/?source=navbar
    '''
    def __init__(self):
        '''new compiled Fickett look-up table'''
        # thresholds for the look-ups, in descending order; the trailing
        # 0/0.0 guarantees every non-negative value matches some bucket
        self.position_parameter = [1.9,1.8,1.7,1.6,1.5,1.4,1.3,1.2,1.1,0.0]
        self.content_parameter = [0.33,0.31,0.29,0.27,0.25,0.23,0.21,0.19,0.17,0]
        self.position_probability = {
            "A":[0.51,0.55,0.57,0.52,0.48,0.58,0.57,0.54,0.50,0.36],
            "C":[0.29,0.44,0.55,0.49,0.52,0.60,0.60,0.56,0.51,0.38],
            "G":[0.62,0.67,0.74,0.65,0.61,0.62,0.52,0.41,0.31,0.17],
            "T":[0.51,0.60,0.69,0.64,0.62,0.67,0.58,0.48,0.39,0.24],
            }
        self.position_weight = {"A":0.062,"C":0.093,"G":0.205,"T":0.154}
        self.content_probability = {
            "A":[0.40,0.55,0.58,0.58,0.52,0.48,0.45,0.45,0.38,0.19],
            "C":[0.50,0.63,0.59,0.50,0.46,0.45,0.47,0.56,0.59,0.33],
            "G":[0.21,0.40,0.47,0.50,0.52,0.56,0.57,0.52,0.44,0.23],
            "T":[0.30,0.49,0.56,0.53,0.48,0.48,0.52,0.57,0.60,0.51]
            }
        self.content_weight = {"A":0.084,"C":0.076,"G":0.081,"T":0.055}

    def look_up_position_probability(self,value, base):
        '''
        Look up the weighted positional probability for `base` given its
        position asymmetry `value`.  Returns None for negative values.
        '''
        if float(value) < 0:
            return None
        for idx,val in enumerate (self.position_parameter):
            if (float(value) >= val):
                return float(self.position_probability[base][idx]) * float(self.position_weight[base])

    def look_up_content_probability(self,value, base):
        '''
        Look up the weighted content probability for `base` given its
        fractional content `value`.  Returns None for negative values.
        '''
        if float(value) < 0:
            return None
        for idx,val in enumerate (self.content_parameter):
            if (float(value) >= val):
                return float(self.content_probability[base][idx]) * float(self.content_weight[base])

    def fickett_value(self,dna):
        '''
        Calculate the Fickett value from a full RNA transcript sequence
        (uppercase A/C/G/T expected).  Sequences shorter than 2 bases
        score 0.

        Note: the original code first computed the A/C/G/T content with
        dna.count() and then immediately recomputed it from the per-phase
        counts; the redundant first pass (and a no-op "dna=dna") was
        removed.  The phases partition dna, so the result is identical.
        '''
        if len(dna) < 2:
            return 0
        fickett_score=0
        total_base = len(dna)

        # split the sequence into the three codon phases
        phase_0 = dna[::3]
        phase_1 = dna[1::3]
        phase_2 = dna[2::3]

        phase_0_A = phase_0.count("A")
        phase_1_A = phase_1.count("A")
        phase_2_A = phase_2.count("A")
        phase_0_C = phase_0.count("C")
        phase_1_C = phase_1.count("C")
        phase_2_C = phase_2.count("C")
        phase_0_G = phase_0.count("G")
        phase_1_G = phase_1.count("G")
        phase_2_G = phase_2.count("G")
        phase_0_T = phase_0.count("T")
        phase_1_T = phase_1.count("T")
        phase_2_T = phase_2.count("T")

        # overall base content (fraction of the whole sequence)
        A_content = float(phase_0_A + phase_1_A + phase_2_A)/total_base
        C_content = float(phase_0_C + phase_1_C + phase_2_C)/total_base
        G_content = float(phase_0_G + phase_1_G + phase_2_G)/total_base
        T_content = float(phase_0_T + phase_1_T + phase_2_T)/total_base
        # position asymmetry: max over phases / (min over phases + 1)
        A_position= np.max([phase_0_A,phase_1_A,phase_2_A])/(np.min([phase_0_A,phase_1_A,phase_2_A]) +1.0)
        C_position= np.max([phase_0_C,phase_1_C,phase_2_C])/(np.min([phase_0_C,phase_1_C,phase_2_C]) +1.0)
        G_position= np.max([phase_0_G,phase_1_G,phase_2_G])/(np.min([phase_0_G,phase_1_G,phase_2_G]) +1.0)
        T_position= np.max([phase_0_T,phase_1_T,phase_2_T])/(np.min([phase_0_T,phase_1_T,phase_2_T]) +1.0)

        fickett_score += self.look_up_content_probability(A_content,"A")
        fickett_score += self.look_up_content_probability(C_content,"C")
        fickett_score += self.look_up_content_probability(G_content,"G")
        fickett_score += self.look_up_content_probability(T_content,"T")

        fickett_score += self.look_up_position_probability(A_position,"A")
        fickett_score += self.look_up_position_probability(C_position,"C")
        fickett_score += self.look_up_position_probability(G_position,"G")
        fickett_score += self.look_up_position_probability(T_position,"T")

        return fickett_score
|
||
#=================== | ||
|
||
def mRNA_translate(mRNA):
    """Translate a nucleotide sequence string into its peptide sequence."""
    transcript = Seq(mRNA)
    return transcript.translate()
|
||
def protein_param(putative_seqprot):
    """Return the isoelectric point of the analysed putative peptide.

    putative_seqprot -- presumably a ProtParam.ProteinAnalysis instance
    (any object exposing isoelectric_point() works).
    """
    pI = putative_seqprot.isoelectric_point()
    return pI
|
||
def calculate_potential(fasta,strand,outfile):
    '''
    Calculate three features -- putative peptide length, pI and Fickett
    score -- for every record in `fasta`, then assess coding potential
    with the pre-trained LIBSVM model shipped with CPC2.

    fasta   -- path to the input sequences in fasta format
    strand  -- "+" to scan only the forward strand, "-" to scan both
    outfile -- path of the result table; <outfile>.feat and two
               temporary files are created next to it during the run
    Returns 0 on success, -1 when the svm pipeline exits non-zero.
    '''
    strinfoAmbiguous = re.compile("X|B|Z|J|U",re.I)
    ptU = re.compile("U",re.I)
    # open() builtin instead of the Python-2-only file() builtin
    ftmp_feat = open(outfile + ".feat","w")
    ftmp_svm = open(outfile + ".tmp.1","w")
    ftmp_result = open(outfile,"w")
    ftmp_result.write("\t".join(map(str,["#ID","transcript_length","peptide_length","Fickett_score","pI","ORF_integrity","coding_probability","label"]))+"\n")
    ftmp_result.close()
    fickett_obj = Fickett()
    for seq in seqio.fasta_read(fasta):
        seqid = seq.id
        # seqRNA: full transcript sequence, U replaced by T, uppercased
        seqRNA = ptU.sub("T",str(seq.seq).strip())
        seqRNA = seqRNA.upper()
        # seqCDS: longest ORF found in the transcript
        seqCDS,start_pos,orf_strand,orf_fullness = FindCDS(seqRNA).longest_orf(strand)
        seqprot = mRNA_translate(seqCDS)
        pep_len = len(seqprot)
        # exclude ambiguous amino acids (X, B, Z, J, U) from the peptide
        newseqprot = strinfoAmbiguous.sub("",str(seqprot))
        fickett_score = fickett_obj.fickett_value(seqRNA)
        protparam_obj = ProtParam.ProteinAnalysis(str(newseqprot.strip("*")))
        if pep_len > 0:
            isoelectric_point = protein_param(protparam_obj)
        else:
            # nothing translated: mark the ORF incomplete, pI = 0
            orf_fullness = -1
            isoelectric_point = 0.0
        ftmp_feat.write("\t".join(map(str,[seqid,len(seqRNA),pep_len,fickett_score,isoelectric_point,orf_fullness]))+"\n")
        ftmp_svm.write("".join(map(str,["999"," 1:",pep_len," 2:",fickett_score," 3:",isoelectric_point," 4:",orf_fullness]))+"\n")
    ftmp_feat.close()
    ftmp_svm.close()

    # calculate the coding probability using LIBSVM
    sys.stderr.write("[INFO] Predicting coding potential, please wait ...\n")

    # set directories and check that the required tools exist
    script_dir,filename = os.path.split(os.path.abspath(sys.argv[0]))
    data_dir = script_dir + "/../data/"
    lib_dir = script_dir + "/../libs/"
    app_svm_scale = lib_dir + "libsvm/libsvm-3.18/svm-scale"
    app_svm_predict = lib_dir + "libsvm/libsvm-3.18/svm-predict"
    # fixed typo in the user-facing messages: "excutable" -> "executable"
    os.system('test -x '+ app_svm_scale + ' || echo \"[ERROR] No executable svm-scale on CPC2 path!\" > /dev/stderr')
    os.system('test -x '+ app_svm_predict + ' || echo \"[ERROR] No executable svm-predict on CPC2 path!\" > /dev/stderr')

    # scale the features, predict, map svm labels to coding/noncoding,
    # then paste the prediction columns onto the feature table
    cmd = app_svm_scale + ' -r ' + data_dir + 'cpc2.range ' + outfile + '.tmp.1 > ' + outfile + '.tmp.2 &&'
    cmd = cmd + app_svm_predict + ' -b 1 -q ' + outfile + '.tmp.2 ' + data_dir + 'cpc2.model ' + outfile + '.tmp.1 &&'
    cmd = cmd + 'awk -vOFS="\\t" \'{if ($1 == 1){print $2,"coding"} else if ($1 == 0){print $2,"noncoding"}}\' ' + outfile + '.tmp.1 > ' + outfile + '.tmp.2 &&'
    cmd = cmd + 'paste ' + outfile + '.feat ' + outfile + '.tmp.2 >>' + outfile
    (exitstatus, outtext) = commands.getstatusoutput(cmd)
    os.system('rm -f ' + outfile + '.tmp.1 ' + outfile + '.tmp.2')
    if exitstatus == 0:
        # success: the .feat scratch file is no longer needed
        rm_cmd = "rm -f " + outfile + '.feat'
        commands.getstatusoutput(rm_cmd)
        sys.stderr.write("[INFO] Running Done!\n")
        return 0
    else:
        sys.stderr.write("[ERROR] Prediction error!\n")
        return -1
|
||
if __name__ == "__main__":
    # Run the CLI entry point and propagate its status code to the shell.
    sys.exit(__main())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
''' | ||
This module deals with compressed files (.gz or .bz2)
''' | ||
|
||
import gzip | ||
import bz2 | ||
import sys | ||
def gz_file(fq_file,mode,level=6):
    '''
    Open a (possibly gzip-compressed) file and return the file object.

    fq_file -- path; names ending in "gz" are opened through gzip
    mode    -- "r" or "w" ("b" is appended for the gzip case)
    level   -- gzip compression level (only meaningful when writing)
    Exits the process with status 1 when the file cannot be opened.
    '''
    try:
        if fq_file.endswith("gz"):
            fq_fp = gzip.open(fq_file,mode+"b",level)
        else:
            sys.stderr.write("[INFO] read file '%s'\n"%fq_file)
            # open() builtin instead of the Python-2-only file()
            fq_fp = open(fq_file,mode)
    except (IOError, OSError):
        # narrowed from a bare except so unrelated exceptions
        # (e.g. KeyboardInterrupt) are not silently swallowed
        sys.stderr.write("Error: Fail to IO file: %s\n"%(fq_file))
        sys.exit(1)
    return fq_fp
|
||
|
||
def bz2file(f):
    '''
    Open a .bz2 file and return the BZ2File object.

    Exits the process with status 1 when the path does not end in "bz2".
    '''
    if not f.endswith("bz2"):
        sys.stderr.write("Error: Fail to IO file: %s\n"%(f))
        sys.exit(1)
    return bz2.BZ2File(f)
|
Oops, something went wrong.