-
Notifications
You must be signed in to change notification settings - Fork 81
/
goseq_for_cuffdiff
114 lines (85 loc) · 3.63 KB
/
goseq_for_cuffdiff
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#######this R script does GO enrichment analysis for 6 gene expression difference files generated by CUfflinks
####### Author NI SHUAI
####### Version 1 31.12.2014
################################################
###### load the packages required for analysis
source("http://bioconductor.org/biocLite.R")
biocLite("goseq",dependencies=TRUE)
library('goseq')
## it should work, but I guess it's the internet that doesn't do the job today.
## Good! now it works!
## specify the file location
### When doing the analysis, one should use dir() everytime before running
### the enrichment analysis to check the file location and assign the index
### of the file in dir() to file_number.
### following is the infomation that need to be filled by user
file_location='D://oulu_wei//diff_files'
setwd(file_location)
dir()
file_number=5
### which files is to be used for enrichment analysis
which_file=dir()[file_number]
################################################
## load the files into R
diff_file=read.table(dir()[file_number],header=T)
whole_gene=as.character(unique(diff_file$gene))
whole_gene=whole_gene[whole_gene!='-']
diff_gene=as.character(diff_file$gene[diff_file$p_value < 0.05])
##################some of the names in whole_gene contain several synonyms separated by comma
##################it causes confusion for GO gene name retrieval, so the first
##################name is left and others deleted
name_revise=function(gene_set){
for (i in 1:length(gene_set)){
if (grepl(',',gene_set[i])){
index=which(strsplit(gene_set[i],'')[[1]]==',')[1]
gene_set[i]=substr(gene_set[i],1,index-1)
}
}
gene_set
}
whole_gene=name_revise(whole_gene)
diff_gene=name_revise(diff_gene)
whole_gene=as.character(unique(whole_gene))
diff_gene=as.character(unique(diff_gene))
### initiate the files in required format
genes=as.integer(whole_gene %in% diff_gene)
names(genes)=whole_gene
###because whole_gene is a uniquefied set of gene names, '-' only
###apprear once, but diff_genes contain more of those, not a problem though.
###fitting the probability weighting function
pwf=nullp(genes,'hg19','geneSymbol')
GO.wall=goseq(pwf,'hg19','geneSymbol')
#### Find only enriched GO terms that are statistically significant
enriched.GO=GO.wall$category[GO.wall$over_represented_pvalue<.0001]
head(enriched.GO)
##### get more biological meaningful results from the GO IDs.
###just define the file name to make it a bit easier to understand
file_head=substr(which_file,1,nchar(which_file)-5)
######################################
### save the GOseq result into a plain text file
######################################
library(GO.db)
capture.output(for(go in enriched.GO[1:length(enriched.GO)]) { print(GOTERM[[go]])
cat("--------------------------------------\n")
cat(paste("-----------" , file_head , "---------",sep=""))
cat('\n')
cat("--------------------------------------\n")
}
, file=paste(file_head, "_sigGO.txt", sep=""))
####################################
###if you want you can always save the sig_diff gene names
###into a file for further analysis e.g. DAVID
####################################
file_location='D://oulu_wei//diff_files'
setwd(file_location)
dir()
file_number=????fill it here yourself
which_file=dir()[file_number]
diff_file=read.table(dir()[file_number],header=T)
whole_gene=as.character(unique(diff_file$gene))
whole_gene=whole_gene[whole_gene!='-']
diff_gene=as.character(diff_file$gene[diff_file$p_value < 0.05])
########################write file
file_head=substr(which_file,1,nchar(which_file)-5)
write(whole_gene,file=paste(file_head, "wholegenes.txt",sep="") ,sep='\n')
write(diff_gene,file=paste(file_head, "diffgenes.txt",sep="") ,sep='\n')