Skip to content

Commit

Permalink
CPC2 init
Browse files Browse the repository at this point in the history
  • Loading branch information
kkkyj committed Mar 15, 2018
1 parent 528a411 commit 481d361
Show file tree
Hide file tree
Showing 8 changed files with 3,758 additions and 0 deletions.
44 changes: 44 additions & 0 deletions 00README
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
* 2017-03-6 21:00, Yu-jian Kang

Here is a temporary package of CPC2.


1) Pre-requisite:

a. Biopython package: a local version could be downloaded from
http://biopython.org/wiki/Download

2) Install

a. Unpack the tarball:

tom@linux$ gzip -dc CPC2-beta.tar.gz | tar xf -

b. Build third-part packages:

tom@linux$ cd CPC2-beta
tom@linux$ export CPC_HOME="$PWD"
tom@linux$ cd libs/libsvm
tom@linux$ gzip -dc libsvm-3.18.tar.gz | tar xf -
tom@linux$ cd libsvm-3.18
tom@linux$ make clean && make

3) Run the predict

tom@linux$ cd $CPC_HOME
tom@linux$ bin/CPC2.py -i (input_seq) -o (result_in_table)

example:
tom@linux$ bin/CPC2.py -i data/example.fa -o example_output

4) Output result

Result in table format (delimited by tab):
#ID peptide_length Fickett_score isoelectric_point ORF_integrity coding_probability coding_label


See the website for tutorial and more details. (http://cpc2.cbi.pku.edu.cn)

This is a beta version of CPC2, if have any questions please report to us.

Contact: [email protected]
315 changes: 315 additions & 0 deletions bin/CPC2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,315 @@
#!/usr/bin/env python

import sys
import os
import re
import commands
import time
from optparse import OptionParser,OptionGroup

import numpy as np
from Bio.Seq import Seq
from Bio.SeqUtils import ProtParam

import seqio

def __main():
start_time = time.time()
usage = "usage: %prog [options] -i input.fasta -o output_file"
description = "Contact: Kang Yujian <[email protected]>"
parser = OptionParser(usage,version="%prog 0.1",description = description)
Common_group = OptionGroup(parser,"Common Options")
Common_group.add_option("-i",dest="fasta",help="input sequence in fasta format [Required]",metavar="FILE",type="string",default=None)
Common_group.add_option("-o",dest="outfile",help="output file [Default: cpc2output.txt]",metavar="FILE",type="string",default="cpc2output.txt")
Common_group.add_option("-r",dest="reverse",help="also check the reverse strand [Default: FALSE]",action="store_true")
parser.add_option_group(Common_group)
(options, args) = parser.parse_args()
if options.fasta == None:
parser.print_help()
return -1
else:
if not os.path.isfile(options.fasta):
sys.stderr.write("[ERROR] %s is not a file\n"%options.fasta)
return -1
if options.reverse:
strand = "-"
else:
strand = "+"
if calculate_potential(options.fasta,strand,options.outfile):
return 1
sys.stderr.write("[INFO] cost time: %ds\n"%(time.time()-start_time))
return 0

class FindCDS:
'''
Find the most like CDS in a given sequence
The most like CDS is the longest ORF found in the sequence
When having same length, the upstream ORF is printed
modified from source code of CPAT 1.2.1 downloaded from https://sourceforge.net/projects/rna-cpat/files/?source=navbar
'''
def __init__(self,seq):
self.seq = seq
self.result = (0,0,0,0,0)
self.longest = 0
self.basepair = {"A":"T","T":"A","U":"A","C":"G","G":"C","N":"N","X":"X"}

def _reversecompliment(self):
return "".join(self.basepair[base] for base in self.seq)[::-1]

def get_codons(self,frame_number):
'''
Record every nucleotide triplet and its coordinate position for input sequence in one frame
'''
coordinate = frame_number
while coordinate + 3 <= len(self.seq):
yield (self.seq[coordinate:coordinate+3], coordinate)
coordinate += 3

def find_longest_in_one(self,myframe,direction,start_codon,stop_codon):
'''
find the longest ORF in one reading myframe
'''
triplet_got = self.get_codons(myframe)
starts = start_codon
stops = stop_codon
'''
Extend sequence by triplet after start codon encountered
End ORF extension when stop codon encountered
'''
while True:
try:
codon,index = triplet_got.next()
except StopIteration:
break
if codon in starts and codon not in stops:
'''
find the ORF start
'''
orf_start = index
end_extension = False
while True:
try:
codon,index = triplet_got.next()
except StopIteration:
end_extension = True
integrity = -1
if codon in stops:
integrity = 1
end_extension = True
if end_extension:
orf_end = index + 3
Length = (orf_end - orf_start)
if Length > self.longest:
self.longest = Length
self.result = [direction,orf_start,orf_end,Length,integrity]
if Length == self.longest and orf_start < self.result[1]:
'''
if ORFs have same length, return the one that if upstream
'''
self.result = [direction,orf_start,orf_end,Length,integrity]
break

def longest_orf(self,direction,start_codon={"ATG":None}, stop_codon={"TAG":None,"TAA":None,"TGA":None}):
return_orf = ""
for frame in range(3):
self.find_longest_in_one(frame,"+",start_codon,stop_codon)
return_orf = self.seq[self.result[1]:self.result[2]][:]
start_coordinate = self.result[1]
strand_direction = "+"
orf_integrity = self.result[4]
'''
Also check reverse chain if -r is chosen
'''
if direction == "-":
self.seq = self._reversecompliment()
for frame in range(3):
self.find_longest_in_one(frame,"-",start_codon,stop_codon)
if self.result[0] == "-":
return_orf = self.seq[self.result[1]:self.result[2]][:]
start_coordinate = self.result[1]
strand_direction = "-"
orf_integrity = self.result[4]
return return_orf,start_coordinate,strand_direction,orf_integrity


class Fickett:
'''
calculate Fickett TESTCODE for full sequence
NAR 10(17) 5303-531
modified from source code of CPAT 1.2.1 downloaded from https://sourceforge.net/projects/rna-cpat/files/?source=navbar
'''
def __init__(self):
'''new compiled Fickett look-up table'''
self.position_parameter = [1.9,1.8,1.7,1.6,1.5,1.4,1.3,1.2,1.1,0.0]
self.content_parameter = [0.33,0.31,0.29,0.27,0.25,0.23,0.21,0.19,0.17,0]
self.position_probability = {
"A":[0.51,0.55,0.57,0.52,0.48,0.58,0.57,0.54,0.50,0.36],
"C":[0.29,0.44,0.55,0.49,0.52,0.60,0.60,0.56,0.51,0.38],
"G":[0.62,0.67,0.74,0.65,0.61,0.62,0.52,0.41,0.31,0.17],
"T":[0.51,0.60,0.69,0.64,0.62,0.67,0.58,0.48,0.39,0.24],
}
self.position_weight = {"A":0.062,"C":0.093,"G":0.205,"T":0.154}
self.content_probability = {
"A":[0.40,0.55,0.58,0.58,0.52,0.48,0.45,0.45,0.38,0.19],
"C":[0.50,0.63,0.59,0.50,0.46,0.45,0.47,0.56,0.59,0.33],
"G":[0.21,0.40,0.47,0.50,0.52,0.56,0.57,0.52,0.44,0.23],
"T":[0.30,0.49,0.56,0.53,0.48,0.48,0.52,0.57,0.60,0.51]
}
self.content_weight = {"A":0.084,"C":0.076,"G":0.081,"T":0.055}


def look_up_position_probability(self,value, base):
'''
look up positional probability by base and value
'''
if float(value) < 0:
return None
for idx,val in enumerate (self.position_parameter):
if (float(value) >= val):
return float(self.position_probability[base][idx]) * float(self.position_weight[base])

def look_up_content_probability(self,value, base):
'''
look up content probability by base and value
'''
if float(value) < 0:
return None
for idx,val in enumerate (self.content_parameter):
if (float(value) >= val):
return float(self.content_probability[base][idx]) * float(self.content_weight[base])

def fickett_value(self,dna):
'''
calculate Fickett value from full RNA transcript sequence
'''
if len(dna) < 2:
return 0
fickett_score=0
dna=dna
total_base = len(dna)
A_content = float(dna.count("A"))/total_base
C_content = float(dna.count("C"))/total_base
G_content = float(dna.count("G"))/total_base
T_content = float(dna.count("T"))/total_base

phase_0 = dna[::3]
phase_1 = dna[1::3]
phase_2 = dna[2::3]

phase_0_A = phase_0.count("A")
phase_1_A = phase_1.count("A")
phase_2_A = phase_2.count("A")
phase_0_C = phase_0.count("C")
phase_1_C = phase_1.count("C")
phase_2_C = phase_2.count("C")
phase_0_G = phase_0.count("G")
phase_1_G = phase_1.count("G")
phase_2_G = phase_2.count("G")
phase_0_T = phase_0.count("T")
phase_1_T = phase_1.count("T")
phase_2_T = phase_2.count("T")

A_content = float(phase_0_A + phase_1_A + phase_2_A)/total_base
C_content = float(phase_0_C + phase_1_C + phase_2_C)/total_base
G_content = float(phase_0_G + phase_1_G + phase_2_G)/total_base
T_content = float(phase_0_T + phase_1_T + phase_2_T)/total_base
A_position= np.max([phase_0_A,phase_1_A,phase_2_A])/(np.min([phase_0_A,phase_1_A,phase_2_A]) +1.0)
C_position= np.max([phase_0_C,phase_1_C,phase_2_C])/(np.min([phase_0_C,phase_1_C,phase_2_C]) +1.0)
G_position= np.max([phase_0_G,phase_1_G,phase_2_G])/(np.min([phase_0_G,phase_1_G,phase_2_G]) +1.0)
T_position= np.max([phase_0_T,phase_1_T,phase_2_T])/(np.min([phase_0_T,phase_1_T,phase_2_T]) +1.0)

fickett_score += self.look_up_content_probability(A_content,"A")
fickett_score += self.look_up_content_probability(C_content,"C")
fickett_score += self.look_up_content_probability(G_content,"G")
fickett_score += self.look_up_content_probability(T_content,"T")

fickett_score += self.look_up_position_probability(A_position,"A")
fickett_score += self.look_up_position_probability(C_position,"C")
fickett_score += self.look_up_position_probability(G_position,"G")
fickett_score += self.look_up_position_probability(T_position,"T")

return fickett_score

#===================

def mRNA_translate(mRNA):
return Seq(mRNA).translate()

def protein_param(putative_seqprot):
return putative_seqprot.isoelectric_point()

def calculate_potential(fasta,strand,outfile):
'''
Calculate three features: putative peptide length,pI and Fickett
And assess coding potential based on SVM model
'''
strinfoAmbiguous = re.compile("X|B|Z|J|U",re.I)
ptU = re.compile("U",re.I)
ftmp_feat = file(outfile + ".feat","w")
ftmp_svm = file(outfile + ".tmp.1","w")
ftmp_result = file(outfile,"w")
ftmp_result.write("\t".join(map(str,["#ID","transcript_length","peptide_length","Fickett_score","pI","ORF_integrity","coding_probability","label"]))+"\n")
ftmp_result.close()
fickett_obj = Fickett()
for seq in seqio.fasta_read(fasta):
seqid = seq.id
seqRNA = ptU.sub("T",str(seq.seq).strip())
'''seqRNA:transcript full sequence'''
seqRNA = seqRNA.upper()
seqCDS,start_pos,orf_strand,orf_fullness = FindCDS(seqRNA).longest_orf(strand)
'''seqCDS:longest ORF'''
seqprot = mRNA_translate(seqCDS)
pep_len = len(seqprot) #pep_len = len(seqprot.strip("*"))
newseqprot = strinfoAmbiguous.sub("",str(seqprot))
'''exclude ambiguous amio acid X, B, Z, J, Y in peptide sequence'''
fickett_score = fickett_obj.fickett_value(seqRNA)
protparam_obj = ProtParam.ProteinAnalysis(str(newseqprot.strip("*")))
if pep_len > 0:
#fickett_score = fickett_obj.fickett_value(seqCDS)
isoelectric_point = protein_param(protparam_obj)
else:
#fickett_score = 0.0
orf_fullness = -1
isoelectric_point = 0.0
ftmp_feat.write("\t".join(map(str,[seqid,len(seqRNA),pep_len,fickett_score,isoelectric_point,orf_fullness]))+"\n")
ftmp_svm.write("".join(map(str,["999"," 1:",pep_len," 2:",fickett_score," 3:",isoelectric_point," 4:",orf_fullness]))+"\n")
ftmp_feat.close()
ftmp_svm.close()
#return 0

'''
calculate the coding probability using LIBSVM
'''
sys.stderr.write("[INFO] Predicting coding potential, please wait ...\n")

'''
set directories and check depending tools existance
'''
script_dir,filename = os.path.split(os.path.abspath(sys.argv[0]))
data_dir = script_dir + "/../data/"
lib_dir = script_dir + "/../libs/"
app_svm_scale = lib_dir + "libsvm/libsvm-3.18/svm-scale"
app_svm_predict = lib_dir + "libsvm/libsvm-3.18/svm-predict"
os.system('test -x '+ app_svm_scale + ' || echo \"[ERROR] No excutable svm-scale on CPC2 path!\" > /dev/stderr')
os.system('test -x '+ app_svm_predict + ' || echo \"[ERROR] No excutable svm-predict on CPC2 path!\" > /dev/stderr')

cmd = app_svm_scale + ' -r ' + data_dir + 'cpc2.range ' + outfile + '.tmp.1 > ' + outfile + '.tmp.2 &&'
cmd = cmd + app_svm_predict + ' -b 1 -q ' + outfile + '.tmp.2 ' + data_dir + 'cpc2.model ' + outfile + '.tmp.1 &&'
cmd = cmd + 'awk -vOFS="\\t" \'{if ($1 == 1){print $2,"coding"} else if ($1 == 0){print $2,"noncoding"}}\' ' + outfile + '.tmp.1 > ' + outfile + '.tmp.2 &&'
cmd = cmd + 'paste ' + outfile + '.feat ' + outfile + '.tmp.2 >>' + outfile
(exitstatus, outtext) = commands.getstatusoutput(cmd)
os.system('rm -f ' + outfile + '.tmp.1 ' + outfile + '.tmp.2')
# subprocess.call("Rscript " + outfile + '.r', shell=True)
#except:
# pass
if exitstatus == 0:
rm_cmd = "rm -f " + outfile + '.feat'
commands.getstatusoutput(rm_cmd)
sys.stderr.write("[INFO] Running Done!\n")
return 0
else:
sys.stderr.write("[ERROR] Prediction error!\n")
return -1

if __name__ == "__main__":
sys.exit(__main())
29 changes: 29 additions & 0 deletions bin/compress.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
'''
This module deals with compressed file (.gz or .bz2)
'''

import gzip
import bz2
import sys
def gz_file(fq_file,mode,level=6):
try:
if fq_file.endswith("gz"):
fq_fp = gzip.open(fq_file,mode+"b",level)
else:
sys.stderr.write("[INFO] read file '%s'\n"%fq_file)
fq_fp = file(fq_file,mode)
except:
sys.stderr.write("Error: Fail to IO file: %s\n"%(fq_file))
sys.exit(1)
return fq_fp


def bz2file(f):
fz = None
if f.endswith("bz2"):
fz = bz2.BZ2File(f)
else:
sys.stderr.write("Error: Fail to IO file: %s\n"%(f))
sys.exit(1)
return fz

Loading

0 comments on commit 481d361

Please sign in to comment.