goseq_for_cuffdiff

#######this R script does GO enrichment analysis for 6 gene expression difference files generated by CUfflinks
####### Author NI SHUAI
####### Version 1 31.12.2014
################################################
###### load the packages required for analysis
source("http://bioconductor.org/biocLite.R")
biocLite("goseq",dependencies=TRUE)
library('goseq')
## it should work, but I guess it's the internet that doesn't do the job today.
## Good! now it works!


## specify the file location
### When doing the analysis, one should use dir() everytime before running
### the enrichment analysis to check the file location and assign the index
### of the file in dir() to file_number.
### following is the infomation that need to be filled by user

file_location='D://oulu_wei//diff_files'
setwd(file_location)
dir()
file_number=5


### which files is to be used for enrichment analysis

which_file=dir()[file_number]

################################################
## load the files into R
diff_file=read.table(dir()[file_number],header=T)
whole_gene=as.character(unique(diff_file$gene))
whole_gene=whole_gene[whole_gene!='-']
diff_gene=as.character(diff_file$gene[diff_file$p_value < 0.05])

##################some of the names in whole_gene contain several synonyms separated by comma
##################it causes confusion for GO gene name retrieval, so the first 
##################name is left and others deleted
name_revise=function(gene_set){
  for (i in 1:length(gene_set)){
   if (grepl(',',gene_set[i])){
     index=which(strsplit(gene_set[i],'')[[1]]==',')[1]
      gene_set[i]=substr(gene_set[i],1,index-1)
  }
}
gene_set
}

whole_gene=name_revise(whole_gene)
diff_gene=name_revise(diff_gene)
whole_gene=as.character(unique(whole_gene))
diff_gene=as.character(unique(diff_gene))

### initiate the files in required format

genes=as.integer(whole_gene %in% diff_gene)
names(genes)=whole_gene

###because whole_gene is a uniquefied set of gene names, '-' only 
###apprear once, but diff_genes contain more of those, not a problem though.
###fitting the probability weighting function
pwf=nullp(genes,'hg19','geneSymbol')
GO.wall=goseq(pwf,'hg19','geneSymbol')

#### Find only enriched GO terms that are statistically significant
enriched.GO=GO.wall$category[GO.wall$over_represented_pvalue<.0001]
head(enriched.GO)


##### get more biological meaningful results from the GO IDs. 

###just define the file name to make it a bit easier to understand

file_head=substr(which_file,1,nchar(which_file)-5)

######################################
### save the GOseq result into a plain text file
######################################

library(GO.db)
capture.output(for(go in enriched.GO[1:length(enriched.GO)]) { print(GOTERM[[go]])
cat("--------------------------------------\n")
cat(paste("-----------" , file_head , "---------",sep=""))
cat('\n')
cat("--------------------------------------\n")
}
, file=paste(file_head, "_sigGO.txt", sep=""))

####################################
###if you want you can always save the sig_diff gene  names
###into a file for further analysis e.g. DAVID
####################################


file_location='D://oulu_wei//diff_files'
setwd(file_location)
dir()
file_number=????fill it here yourself

which_file=dir()[file_number]
diff_file=read.table(dir()[file_number],header=T)
whole_gene=as.character(unique(diff_file$gene))
whole_gene=whole_gene[whole_gene!='-']
diff_gene=as.character(diff_file$gene[diff_file$p_value < 0.05])


########################write file

file_head=substr(which_file,1,nchar(which_file)-5)
write(whole_gene,file=paste(file_head, "wholegenes.txt",sep="") ,sep='\n')
write(diff_gene,file=paste(file_head, "diffgenes.txt",sep="") ,sep='\n')