QTL_QC.R

#!/hpc/local/CentOS7/dhl_ec/software/R-3.4.0/bin/Rscript --vanilla

# Alternative shebang for local Mac OS X: "#!/usr/local/bin/Rscript --vanilla"
# Linux version for HPC: #!/hpc/local/CentOS7/dhl_ec/software/R-3.4.0/bin/Rscript --vanilla

# Script metadata; these values are printed in the start-up banner below.
VERSION <- "v2.3.10"
LASTEDITDATE <- "2019-10-15"
SCRIPTNAME <- "Molecular QTL results Quality Control & Parser"
AUTHOR <- "Sander W. van der Laan | s.w.vanderlaan@gmail.com | @swvanderlaan | swvanderlaan.github.io"
# Current four-digit year, used in the copyright line.
THISYEAR <- format(Sys.Date(), "%Y")

# Print the start-up banner. The string literals deliberately span several
# lines: the embedded line breaks are part of the printed output.
cat(paste0("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
",SCRIPTNAME,"
",VERSION," - ",LASTEDITDATE,"

(C)1979-",THISYEAR," | ",AUTHOR,".
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"))
# usage: ./QTL_QC.R -p projectdir -r resultfile -o outputdir -t resulttype -q qtltype -z analysetype -a annotfile -j genstatsfile [OPTIONAL: -v verbose (DEFAULT) -s silent]
#        ./QTL_QC.R --projectdir projectdir --resultfile resultfile --outputdir outputdir --resulttype resulttype --qtltype qtltype --analysetype analysetype --annotfile annotfile --genstats genstatsfile [OPTIONAL: --verbose verbose (DEFAULT) --silent silent]

cat("\n* Clearing the environment...\n\n")
### CLEAR THE BOARD

cat("\n* Loading function to install packages...\n\n")
### Prerequisite: 'optparse'-library
### * Manual: http://cran.r-project.org/web/packages/optparse/optparse.pdf
### * Vignette: http://www.icesi.edu.co/CRAN/web/packages/optparse/vignettes/optparse.pdf

### Don't say "Loading required package: optparse"...
###suppressPackageStartupMessages(require(optparse))
###require(optparse)

### The part of installing (and loading) packages via Rscript doesn't properly work.
### FUNCTION TO INSTALL PACKAGES
# Install a package if it is missing, then attach it.
#
# The package is looked for among the installed packages first; when absent it
# is installed from CRAN, and when CRAN does not provide it, from Bioconductor
# through BiocManager.
#
# Args:
#   x: Package name, given either as a string ("optparse") or as a bare symbol
#      (optparse). The argument is captured unevaluated with substitute(), so
#      a variable holding a package name is NOT looked up.
#
# Returns:
#   Invisibly, the logical result of require(): TRUE when the package could be
#   attached, FALSE (with require()'s warning) otherwise.
install.packages.auto <- function(x) {
  pkg <- as.character(substitute(x))
  cran_repo <- "https://cloud.r-project.org/"
  is_installed <- function() {
    isTRUE(pkg %in% .packages(all.available = TRUE))
  }

  # First attempt: CRAN.
  if (!is_installed()) {
    install.packages(pkg, dependencies = TRUE, repos = cran_repo)
  }

  # Second attempt: Bioconductor.
  if (!is_installed()) {
    if (!requireNamespace("BiocManager", quietly = TRUE)) {
      # An explicit repository is required: a non-interactive Rscript session
      # has no CRAN mirror configured by default.
      install.packages("BiocManager", repos = cran_repo)
    }
    # Install only the requested package; do not upgrade packages that are
    # already installed, as that may not be warranted.
    BiocManager::install(pkg, update = FALSE, ask = FALSE)
  }

  # require() is used on purpose: its logical result is the return value, so a
  # package that still cannot be loaded yields FALSE plus a warning.
  invisible(require(pkg, character.only = TRUE))
}

cat("\n* Checking availability of required packages and installing if needed...\n\n")
### INSTALL PACKAGES WE NEED
### install.packages.auto() captures its argument unevaluated, so each package
### has to be named literally in its own call (a loop variable would not work).
install.packages.auto("optparse") # make_option(), OptionParser() and parse_args() for the command-line flags
install.packages.auto("tools") # file_path_sans_ext() for building output file names
install.packages.auto("data.table") # NOTE(review): no data.table call is visible in the reviewed part of this script -- confirm it is still needed
install.packages.auto("qvalue") # Needed for multiple-testing correction

cat("\nDone! Required packages installed and loaded.\n\n")

cat("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n")

cat("\n* Setting colours...\n\n")
# Colour palette as hex RGB codes. The plotting calls further down use the
# hard-coded values "#1290D9" and "#E55738", both of which are entries of this
# palette, rather than indexing into this vector.
uithof_color = c("#FBB820","#F59D10","#E55738","#DB003F","#E35493","#D5267B",
                 "#CC0071","#A8448A","#9A3480","#8D5B9A","#705296","#686AA9",
                 "#6173AD","#4C81BF","#2F8BC9","#1290D9","#1396D8","#15A6C1",
                 "#5EB17F","#86B833","#C5D220","#9FC228","#78B113","#49A01D",
                 "#595A5C","#A2A3A4", "#D7D8D7", "#ECECEC", "#FFFFFF", "#000000")
#--------------------------------------------------------------------------

#--------------------------------------------------------------------------
### OPTION LISTING
# Description and usage text passed to OptionParser(usage = ...).
# The example command must use the flag names exactly as declared in
# option_list ('--resultfile', '--analysetype'); unknown long flags are
# rejected by the option parser.
help_text <- paste0("
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
",SCRIPTNAME,"
",VERSION," - ",LASTEDITDATE," 

Description: 
Results parsing and quality control from QTLTools results using your data, CTMM (eQTL) or Athero-Express (mQTL) data. 
The script should be usable on any Linux distribution with R 3.5+ installed, as well as on Mac OS X and Windows.
    
NOTE 2018-06-15:
I've edited the eQTL-part (nom/perm for cis) to match with the new 'strand' column. 
What remains to be done:
- double check the trans-part as the column numbers have changed by the addition of the 'strand' column in the output.

Example command: 
Rscript QTL_QC.R --projectdir adir/somedir --resultfile adir/somedir/qtl_nom.txt.gz --resulttype NOM --qtltype EQTL --analysetype CIS --outputdir adir/somedir --annotfile refdir/annotationfile.txt.gz --genstats adir/somedir/data_QC.stats


(C)1979-",THISYEAR," | ",AUTHOR,".
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
")
# Command-line interface.
# Every string-valued flag defaults to NA, which lets the main program test
# with is.na() whether a flag was supplied.
string_option <- function(flags, help) {
  make_option(flags, action = "store", type = "character", default = NA, help = help)
}

option_list <- list(
  string_option(
    c("-p", "--projectdir"),
    "Path to the project directory, e.g. adir/somedir."
  ),
  string_option(
    c("-r", "--resultfile"),
    "Location of the results file, including results filename, e.g. adir/somedir/qtl_nom.txt.gz."
  ),
  string_option(
    c("-t", "--resulttype"),
    "The result type, either [NOM/PERM] for nominal or permutation results, respectively."
  ),
  string_option(
    c("-q", "--qtltype"),
    "The quantitative trait locus (QTL) type, either [EQTL/MQTL] for expression or methylation QTL analyses, respectively."
  ),
  string_option(
    c("-z", "--analysetype"),
    "Cis- or trans-QTL analyse, either [CIS/TRANS]."
  ),
  string_option(
    c("-o", "--outputdir"),
    "Path to the output directory, e.g. adir/somedir."
  ),
  string_option(
    c("-a", "--annotfile"),
    "Path to the annotation file, e.g. refdir/annotationfile.txt.gz."
  ),
  string_option(
    c("-j", "--genstats"),
    "Path to the summary statistics of the genotypes, e.g. adir/somedir/data_QC.stats."
  ),
  # -v and -s both write to 'verbose': -v sets it to TRUE (also the default),
  # -s sets it to FALSE.
  make_option(
    c("-v", "--verbose"),
    action = "store_true", default = TRUE,
    help = "Should the program print extra stuff out? [default %default]"
  ),
  make_option(
    c("-s", "--silent"),
    action = "store_false", dest = "verbose",
    help = "Make the program not be verbose."
  )
)

# Parse the flags given on the command line into the list 'opt'.
opt <- parse_args(OptionParser(usage = help_text, option_list = option_list))

### OPTIONLIST | FOR LOCAL DEBUGGING
# opt$projectdir="/Users/swvanderlaan/PLINK/analyses/epigenetics/shearstress/shearstress_version_final/DEFAULT_qtl/"
# opt$resultfile="/Users/swvanderlaan/PLINK/analyses/epigenetics/shearstress/shearstress_version_final/DEFAULT_qtl/region_1_shearstress_version_final/aegs_QC_qtlperm_region_1_excl_DEFAULT.txt.gz"
# opt$resultfile="/Users/swvanderlaan/PLINK/analyses/epigenetics/shearstress/shearstress_version_final/DEFAULT_qtl/region_1_shearstress_version_final/aegs_QC_qtlnorm_region_1_excl_DEFAULT.txt.gz"
# opt$resulttype="PERM"
# opt$resulttype="NOM"
# opt$qtltype="MQTL"
# opt$outputdir="/Users/swvanderlaan/PLINK/analyses/epigenetics/shearstress/shearstress_version_final/DEFAULT_qtl/region_1_shearstress_version_final/"
# opt$annotfile="/Users/swvanderlaan/PLINK/_AE_Originals/IlluminaMethylation450K.annotation.txt.gz"
# opt$genstats="/Users/swvanderlaan/PLINK/analyses/epigenetics/shearstress/shearstress_version_final/DEFAULT_qtl/region_1_shearstress_version_final/aegs_1kGp3GoNL5_QC_region_1_excl_DEFAULT.stats"
# opt$analysetype="CIS"
### OPTIONLIST | FOR LOCAL DEBUGGING

# Horizontal rule used to frame the console output.
rule_line <- "\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n"

if (opt$verbose) {
  # Echo every setting received through the flags, one label/value pair at a
  # time; the labels and the option names below are kept in the same order.
  setting_labels <- c(
    "\nThe project directory....................: ",
    "\n\nThe results file.........................: ",
    "\n\nThe output directory.....................: ",
    "\n\nThe annotation file......................: ",
    "\n\nThe results type.........................: ",
    "\n\nThe QTL-type.............................: ",
    "\n\nThe analysis type........................: ",
    "\n\nThe variant summary statistics...........: "
  )
  setting_keys <- c(
    "projectdir", "resultfile", "outputdir", "annotfile",
    "resulttype", "qtltype", "analysetype", "genstats"
  )

  cat(rule_line)
  cat("* Checking the settings as given through the flags.")
  for (setting_index in seq_along(setting_keys)) {
    cat(setting_labels[setting_index])
    cat(opt[[setting_keys[setting_index]]])
  }
  cat(rule_line)
  cat("\n\n")
}

# Printed regardless of the verbosity setting.
cat(rule_line)
cat("Wow, we are all set. Starting \"QTL Results Quality Control & Parser\".")
#--------------------------------------------------------------------------
### START OF THE PROGRAM
# main point of program is here, do this whether or not "verbose" is set
if (!is.na(opt$projectdir) & !is.na(opt$resultfile) & !is.na(opt$outputdir) & !is.na(opt$annotfile) & !is.na(opt$resulttype) & !is.na(opt$qtltype) & !is.na(opt$genstats)) {
  cat(paste("\nWe are going to make some graphs for quality control of you QTLtools analysis. \n\nAnalysing these results...............: '",file_path_sans_ext(basename(opt$resultfile), compression = TRUE),"'\nParsed results will be saved here.....: '", opt$outputdir, "'.\n",sep = ''))
  
  #--------------------------------------------------------------------------
  ### GENERAL SETUP
  Today = format(as.Date(as.POSIXlt(Sys.time())), "%Y%m%d")
  cat(paste("\nToday's date is: ", Today, ".\n", sep = ''))
  
  #--------------------------------------------------------------------------
  #### DEFINE THE LOCATIONS OF DATA
  ROOT_loc = opt$projectdir # argument 1
  OUT_loc = opt$outputdir # argument 4
  
  cat("\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n")
  #--------------------------------------------------------------------------
  ### LOADING ANNOTATION AND RESULTS FILES DEPENDING ON RESULT TYPE
  cat("\nLoading annotations...\n")
  ### The location of the annotation file is set by 'opt$annotfile' # argument 5
  ### The QTL type, 'opt$qtltype', determines which annotation layout is loaded # argument 4
  if (opt$qtltype == "EQTL") { 
    cat("\n...for a CTMM based eQTL analysis in monocytes...\n")
    ANNOTATIONSFILE = read.table(opt$annotfile, header = TRUE, stringsAsFactors = FALSE, sep = ",", na.strings = "")
    colnames(ANNOTATIONSFILE) = c("EntrezID", "ProbeID", "ArrayID", 
                                  "GeneName", "GeneInfo","Chr", "GeneTxStart", "GeneTxEnd")
  } else if (opt$qtltype == "MQTL") {
    cat("\n...for an Athero-Express based MQTL analysis...\n")
    ANNOTATIONSFILE = read.table(opt$annotfile, header = TRUE, stringsAsFactors = FALSE, sep = ",", na.strings = "")
    
    colnames(ANNOTATIONSFILE) = c("IlmnID", "ProbeID", 
                                  "AddressA_ID", "AlleleA_ProbeSeq", "AddressB_ID", "AlleleB_ProbeSeq", 
                                  "Infinium_Design_Type", "Next_Base", "Color_Channel", "Forward_Sequence", 
                                  "Genome_Build", "CHR", "MAPINFO", "SourceSeq", "Chromosome_36", "Coordinate_36", "Strand", 
                                  "Probe_SNPs", "Probe_SNPs_10", "Random_Loci", "Methyl27_Loci", 
                                  "UCSC_RefGene_Name", "UCSC_RefGene_Accession", "UCSC_RefGene_Group", "UCSC_CpG_Islands_Name", "Relation_to_UCSC_CpG_Island", 
                                  "Phantom", "DMR", "Enhancer", "HMM_Island", "Regulatory_Feature_Name", "Regulatory_Feature_Group", "DHS", 
                                  "UCSC_RefGene_Dist")
  } else {
    cat("\n\n*** ERROR *** Something is rotten in the City of Gotham; most likely a typo. Double back, please.\n\n", 
        file = stderr()) # print error messages to stder
  }
  
  cat("\nLoading variant statistics...\n")
  # Per-variant summary statistics. Columns are addressed by position below,
  # so the column order of this file has to match the numbers used here; the
  # three derived columns (MAC, CAF, Imputation) are expected to land at
  # positions 23, 24 and 25.
  VARIANTSTATS.RAW <- read.table(opt$genstats, header = TRUE, stringsAsFactors = FALSE)

  cat("\n* calculating 'minor allele count' (MAC)...")
  # MAC = maf (column 19) * N (column 18) * 2
  VARIANTSTATS.RAW$MAC <- (VARIANTSTATS.RAW[,19]*VARIANTSTATS.RAW[,18]*2)

  cat("\n* calculating 'coded allele frequency' (CAF)...")
  # CAF = (2 * column 16 + column 15) / (2 * N)
  # NOTE(review): columns 15 and 16 are presumably the heterozygous and the
  # homozygous coded-allele genotype counts -- confirm against the stats file.
  VARIANTSTATS.RAW$CAF <- (((2*VARIANTSTATS.RAW[,16]) + VARIANTSTATS.RAW[,15])/(VARIANTSTATS.RAW[,18]*2))

  cat("\n* determining which variants are solely 'imputed'...")
  # The imputation flag is derived identically for cis and trans analyses, so
  # the analysis type only needs to be validated. Failing here with a clear
  # message replaces the opaque errors a missing or misspelled --analysetype
  # would otherwise trigger further down.
  if (!isTRUE(opt$analysetype %in% c("CIS", "TRANS"))) {
    stop("The analysis type (--analysetype) must be either 'CIS' or 'TRANS'.",
         call. = FALSE)
  }
  # Variants whose alternate ID is "---" are labelled as imputed.
  VARIANTSTATS.RAW$Imputation <- ifelse(VARIANTSTATS.RAW$alternate_ids == "---",
                                        "imputed", "genotyped")

  cat("\n* selecting required variant statistics data...")
  # Select the columns we need
  VARIANTSTATS <- VARIANTSTATS.RAW[,c(2,3,4,5,6, # rsid (2) chromosome (3)  position (4) alleleA (5) alleleB (6)
                                      19,        # maf (19)
                                      23,        # mac, (23)
                                      24,        # caf, (24)
                                      8,9,21,18, # imputation quality (8, 9), and HWE (21), and N (18)
                                      25)]       # imputation, (25)

  # Change the column names
  colnames(VARIANTSTATS) <- c("VARIANT", "Chr", "BP", "OtherAlleleA", "CodedAlleleA",
                              "MAF", "MAC", "CAF",
                              "AvgMAxPostCall", "Info", "HWE", "N", "Imputation")
  
  ### Loading main results
  RESULTS = read.table(opt$resultfile, header = FALSE, stringsAsFactors = FALSE)
  ### Loading *nominal* results 
  if (opt$resulttype == "NOM") { # argument 3
    cat("\n\nLoading data from 'nominal pass'...\n")
    
    if (opt$analysetype == "CIS") {
      # 1. The phenotype ID
      # 2. The chromosome ID of the phenotype
      # 3. The start position of the phenotype
      # 4. The end position of the phenotype
      # 5. The strand orientation of the phenotype
      # 6. The total number of variants tested in cis
      # 7. The distance between the phenotype and the tested variant (accounting for strand orientation)
      # 8. The ID of the tested variant
      # 9. The chromosome ID of the variant
      # 10. The start position of the variant
      # 11. The end position of the variant
      # 12. The nominal P-value of association between the variant and the phenotype
      # 13. The corresponding regression slope
      # 14. A binary flag equal to 1 is the variant is the top variant in cis
      RESULTS = RESULTS[ , c(1, 8, 7, 5, 12, 13)]
    }
    if (opt$analysetype == "TRANS") {
      # 1. Phenotype ID
      # 2. Phenotype chrID
      # 3. Phenotype start
      # 4. Variant ID
      # 5. Variant chrID
      # 6. Variant position
      # 7. Nominal P-value of association
      # 8. Dummy here. Field used in approximated mapping in trans
      # 9. Regression slope
      RESULTS = RESULTS[ , c(1, 4, 6, 5, 7, 9)]
    }
    colnames(RESULTS) = c("ProbeID", "VARIANT", "Distance_VARIANT_ProbeID", "Strand", "Nominal_P", "Beta")
    
    #--------------------------------------------------------------------------
    ### PLOTTING NOMINAL RESULTS
    cat("\nPlotting results...\n") 
    ## To check that the beta approximated permutation p-values are well estimated.
    pdf(paste0(opt$outputdir, "/",# map to the output directory
               ###Today,"_", # add in Today's date -- removed as it causes issues in downstream projects when its the 'next day'
               file_path_sans_ext(basename(opt$resultfile), compression = TRUE), # get the basename file without the extension and any compression extensions
               "_histogram_nominal_beta.pdf"), onefile = TRUE)
    hist(RESULTS$Beta, 
         breaks = 10000,
         xlab = "Effect size", ylab = "Distribution", 
         main = "Overall distribution of effect size", 
         col = "#1290D9")
    abline(v = mean(RESULTS$Beta), col = "#E55738")
    abline(v = (mean(RESULTS$Beta) - 4*sd(RESULTS$Beta)), col = "#E55738", lty = 2)
    abline(v = (mean(RESULTS$Beta) + 4*sd(RESULTS$Beta)), col = "#E55738", lty = 2)
    dev.off()
    
  } else if (opt$resulttype == "PERM") { ### Loading *permutation* results 
    cat("\nLoading data from 'permutation pass'...\n")
    # old RESULTS = RESULTS[ , c(1, 4, 3, 7, 9)]
    # full pass
    if (opt$analysetype == "CIS") {
      # 1. The phenotype ID
      # 2. The chromosome ID of the phenotype
      # 3. The start position of the phenotype
      # 4. The end position of the phenotype
      # 5. The strand orientation of the phenotype
      # 6. The total number of variants tested in cis
      # 7. The distance between the phenotype and the tested variant (accounting for strand orientation)
      # 8. The ID of the top variant
      # 9. The chromosome ID of the top variant
      # 10. The start position of the top variant
      # 11. The end position of the top variant
      # 12. The number of degrees of freedom used to compute the P-values
      # 13. Dummy
      # 14. The first parameter value of the fitted beta distribution
      # 15. The second parameter value of the fitted beta distribution (it also gives the effective number of independent tests in the region)
      # 16. The nominal P-value of association between the phenotype and the top variant in cis
      # 17. The corresponding regression slope
      # 18. The P-value of association adjusted for the number of variants tested in cis given by the direct method (i.e. empirircal P-value)
      # 19. The P-value of association adjusted for the number of variants tested in cis given by the fitted beta distribution. We strongly recommend to use this adjusted P-value in any downstream analysis
      RESULTS = RESULTS[ , c(1, 6, 14, 15, 13, 8, 7, 5, 16, 17, 18, 19)]
    }
    if (opt$analysetype == "TRANS") {
      # No idea yet what the permuted (trans) results of QTLTools look like; the cis column selection is reused for now.
      RESULTS = RESULTS[ , c(1, 6, 14, 15, 13, 8, 7, 5, 16, 17, 18, 19)]
    }  
    #RESULTS = read.table(opt$resultfile, head = FALSE, stringsAsFactors = FALSE)
    colnames(RESULTS) = c("ProbeID", "NVariants", "MLE_Beta_shape1", "MLE_Beta_shape2", "Dummy", 
                          "VARIANT", "Distance_VARIANT_ProbeID", "Strand", "Nominal_P", "Beta", "Perm_P", "Approx_Perm_P")
    
    #--------------------------------------------------------------------------
    ### PLOTTING PERMUTATION RESULTS
    pdf(paste0(opt$outputdir, "/",# map to the output directory
               ###Today,"_", # add in Today's date -- removed as it causes issues in downstream projects when its the 'next day'
               file_path_sans_ext(basename(opt$resultfile), compression = TRUE), # get the basename file without the extension and any compression extensions
               "_comparing_permutation_pvalues.pdf"), onefile = TRUE)
    
    plot(RESULTS$Perm_P, RESULTS$Approx_Perm_P, 
         xlab = "Direct method", ylab = "Beta approximation", 
         main = "Comparing permuted p-values", bty = "n", 
         pch = 20, col = "#1290D9")
    abline(0, 1, col = "#E55738")
    hist(RESULTS$Beta, 
         breaks = 25,
         xlab = "Effect size", ylab = "Distribution", 
         main = "Overall distribution of effect size", 
         #bty = "n", 
         col = "#1290D9"
    )
    abline(v = mean(RESULTS$Beta), col = "#E55738")
    abline(v = (mean(RESULTS$Beta) - 4*sd(RESULTS$Beta)), col = "#E55738", lty = 2)
    abline(v = (mean(RESULTS$Beta) + 4*sd(RESULTS$Beta)), col = "#E55738", lty = 2)
    dev.off()
    
  } else {
    cat("\n\n*** ERROR *** Something is rotten in the City of Gotham; most likely a typo. Double back, please.\n\n", 
        file = stderr()) # print error messages to stder
  }
  
  #--------------------------------------------------------------------------
  ### GET Z-SCORES, SD & SEM
  cat("\nGet Z-scores, sd and sem from p-values...\n")
  ### references:
  ###     - http://stats.stackexchange.com/questions/101136/how-can-i-find-a-z-score-from-a-p-value
  ### The nominal p-value is treated as two-sided: |Z| is the upper-tail normal
  ### quantile of P/2, and the sign is taken from the effect size (Beta).
  ### Using qnorm(P) directly would be the one-sided conversion, which yields
  ### Z = 0 at P = 0.5 and a too-large standard error everywhere else.
  ### NOTE(review): assumes 'Nominal_P' is a two-sided p-value -- confirm
  ### against the QTLtools output specification.
  RESULTS$Z <- sign(RESULTS$Beta) * qnorm(RESULTS$Nominal_P / 2, lower.tail = FALSE)

  ### Get standard deviation (SD)
  ### NOTE: incorrect formula -- it is not used in any way or form!!
  ### The column is kept only because removing it would shift the column
  ### numbers that are used for selection further downstream.
  RESULTS$SD <- (RESULTS$Beta - mean(RESULTS$Beta))/RESULTS$Z

  ### Get standard error of the mean (SEM): |Beta / Z|
  RESULTS$SEM <- abs(RESULTS$Beta/RESULTS$Z)
  
  #--------------------------------------------------------------------------
  #### APPLY MULTIPLE TESTING CORRECTION ###
  cat("\nApplying multiple testing correction methods.\n")
  
  cat("\n* Conservative correction: Bonferroni correction...\n")
  ### Bonferroni correction - Conservative
  ### references:
  ###     - http://en.wikipedia.org/wiki/Bonferroni_correction
  ###     - https://stat.ethz.ch/R-manual/R-devel/library/stats/html/p.adjust.html
  if (opt$resulttype == "NOM") {
    RESULTS$Bonferroni = p.adjust(RESULTS$Nominal_P, method = "bonferroni")
  } else if (opt$resulttype == "PERM") {
    RESULTS$Bonferroni = p.adjust(RESULTS$Approx_Perm_P, method = "bonferroni")
  } else {
    cat("\n\n*** ERROR *** Something is rotten in the City of Gotham; most likely a typo. Double back, please.\n\n", 
        file = stderr()) # print error messages to stder
  }
  
  cat("\n* Less conservative correction: Benjamini & Hochberg correction...\n")
  ### Benjamini & Hochberg correction - Less conservative
  ### references:
  ###     - http://en.wikipedia.org/wiki/False_discovery_rate
  ###     - https://stat.ethz.ch/R-manual/R-devel/library/stats/html/p.adjust.html
  if (opt$resulttype == "NOM") {
    RESULTS$BenjHoch = p.adjust(RESULTS$Nominal_P, method = "fdr")
  } else if (opt$resulttype == "PERM") {
    RESULTS$BenjHoch = p.adjust(RESULTS$Approx_Perm_P, method = "fdr")
  } else {
    cat("\n\n*** ERROR *** Something is rotten in the City of Gotham; most likely a typo. Double back, please.\n\n", 
        file = stderr()) # print error messages to stder
  }
  
  cat("\n* Least conservative correction: Storey & Tibshirani correction...\n")
  ### Storey & Tibshirani correction - Least conservative
  ### references:
  ###     - http://en.wikipedia.org/wiki/False_discovery_rate
  ###     - http://svitsrv25.epfl.ch/R-doc/library/qvalue/html/qvalue.html
  ### Requires a bioconductor package: "qvalue"
  if(opt$resulttype == "NOM") {
    # RESULTS$Q = qvalue(RESULTS$Nominal_P)$qvalues # original code
    # RESULTS$Q = ifelse(RESULTS$Nominal_P > 0, qvalue(RESULTS$Nominal_P)$qvalues, "NA")
    RESULTS$Q = "Not_calculated._Throws_an_error_when_p-value_is_infinite_or_NA._NEED_FIXING"
    
  } else if(opt$resulttype == "PERM") {
    #print((RESULTS))
    # RESULTS$Q = qvalue(RESULTS$Approx_Perm_P)$qvalues # original code
    RESULTS$Q = ifelse(RESULTS$Approx_Perm_P > 0, qvalue(RESULTS$Approx_Perm_P)$qvalues, "NA")
    # RESULTS$Q = "Not_calculated._Throws_an_error_when_p-value_is_infinite_or_NA._NEED_FIXING"
    
  } else {
    cat ("\n\n*** ERROR *** Something is rotten in the City of Gotham; most likely a typo. Double back, please.\n\n",
         file=stderr()) # print error messages to stder
  }
  
  #--------------------------------------------------------------------------
  #### ADD IN THE ANNOTATIONS ###
  cat("\nApplying annotations.\n")
  cat("\n* First order based on Benjamini-Hochberg p-values...\n")
  RESULTS.toANNOTATE = RESULTS[order(RESULTS$BenjHoch),]
  
  cat("\n* Now annotating...\n")
  if (opt$qtltype == "EQTL") { 
    cat("\n...the results of a CTMM based eQTL analysis in monocytes.\n")
    RESULTS.toANNOTATE = cbind(RESULTS.toANNOTATE, ANNOTATIONSFILE[match(RESULTS.toANNOTATE[,1], ANNOTATIONSFILE$ProbeID ), 
                                                                   c("EntrezID","ArrayID", 
                                                                     "GeneName", "GeneInfo",
                                                                     "Chr", "GeneTxStart", "GeneTxEnd")])
    
  } else if (opt$qtltype == "MQTL") {
    cat("\n...the results of an Athero-Express based MQTL analysis.\n")
    RESULTS.toANNOTATE = cbind(RESULTS.toANNOTATE, ANNOTATIONSFILE[match(RESULTS.toANNOTATE[,1], ANNOTATIONSFILE$ProbeID ), 
                                                                   c("IlmnID", "ProbeID", 
                                                                     "AddressA_ID", "AlleleA_ProbeSeq", "AddressB_ID", "AlleleB_ProbeSeq", 
                                                                     "Infinium_Design_Type", "Next_Base", "Color_Channel", "Forward_Sequence", 
                                                                     "Genome_Build", "CHR", "MAPINFO", "SourceSeq", "Chromosome_36", "Coordinate_36", "Strand", 
                                                                     "Probe_SNPs", "Probe_SNPs_10", "Random_Loci", "Methyl27_Loci", 
                                                                     "UCSC_RefGene_Name", "UCSC_RefGene_Accession", "UCSC_RefGene_Group", "UCSC_CpG_Islands_Name", "Relation_to_UCSC_CpG_Island", 
                                                                     "Phantom", "DMR", "Enhancer", "HMM_Island", "Regulatory_Feature_Name", "Regulatory_Feature_Group", "DHS", 
                                                                     "UCSC_RefGene_Dist")])
  } else {
    cat("\n\n*** ERROR *** Something is rotten in the City of Gotham; most likely a typo. Double back, please.\n\n", 
        file = stderr()) # print error messages to stder
  }
  
  cat("\n* Merging results, genetic stats, and annotations...\n")
  if (opt$resulttype == "NOM") {
    RESULTS.toANNOTATE2 = cbind(RESULTS.toANNOTATE, VARIANTSTATS[match(RESULTS.toANNOTATE[,2], VARIANTSTATS$VARIANT ),])
  } else if (opt$resulttype == "PERM") {
    RESULTS.toANNOTATE2 = cbind(RESULTS.toANNOTATE, VARIANTSTATS[match(RESULTS.toANNOTATE[,6], VARIANTSTATS$VARIANT ),])
  } else {
    cat("\n\n*** ERROR *** Something is rotten in the City of Gotham; most likely a typo. Double back, please.\n\n", 
        file = stderr()) # print error messages to stder
  }
  
  if (opt$qtltype == "EQTL") { 
    cat("\n* Parsing annotated results for a CTMM eQTL analysis in monocytes...\n")
    if (opt$resulttype == "NOM") {
      cat("\n--- nominal results ---\n")
      #      print(head(RESULTS.toANNOTATE2))
      #      1 ProbeID 2 VARIANT 3 Distance_VARIANT_ProbeID 4 Strand 5 Nominal_P 
      #      6 Beta 7 Z 8 SD 9 SEM 10 Bonferroni 11 BenjHoch 12 Q 
      #      13 EntrezID 14 ArrayID 15 GeneName 16 GeneInfo 17 Chr 18 GeneTxStart 19 GeneTxEnd 
      #      20 VARIANT 21 Chr 22 BP 23 OtherAlleleA 24 CodedAlleleA 
      #      25 MAF 26 MAC 27 CAF 28 AvgMAxPostCall 29 Info 30 HWE 31 N 32 Imputation
      
      RESULTS.ANNOTATE = RESULTS.toANNOTATE2[,c(1,2,21,22,23,24,25,26,27,30,29,32,31, # Variant information
                                                15,13,3,4,17,18,19, # Gene information
                                                6,9,5,10,11,12)] # association statistics
    } else if (opt$resulttype == "PERM") {
      cat("\n--- permuted results ---\n")
      #      print(head(RESULTS.toANNOTATE2))
      #      1 ProbeID 2 NVariants 3 MLE_Beta_shape1 4 MLE_Beta_shape2 5 Dummy 
      #      6 VARIANT 7 Distance_VARIANT_ProbeID 8 Strand 9 Nominal_P 
      #      10 Beta 11 Perm_P 12 Approx_Perm_P 13 Z 14 SD 15 SEM 16 Bonferroni 17 BenjHoch 18 Q 
      #      19 EntrezID 20 ArrayID 21 GeneName/GeneName_UCSC 22 GeneInfo 23 Chr 24 GeneTxStart 25 GeneTxEnd 
      #      26 VARIANT 27 Chr 28 BP 29 OtherAlleleA 30 CodedAlleleA 
      #      31 MAF 32 MAC 33 CAF 34 AvgMAxPostCall 35 Info 36 HWE 37 N 38 Imputation
      
      RESULTS.ANNOTATE = RESULTS.toANNOTATE2[,c(1,6,27,28,29,30,31,32,33,36,35,38,37, # Variant information
                                                21,19,7,8,23,24,25, # Gene information
                                                10,15,9,11,12,16,17,18)] # association statistics
      #print(head(RESULTS.ANNOTATE))
    } else {
      cat("\n\n*** ERROR *** Something is rotten in the City of Gotham; most likely a typo. Double back, please.\n\n", 
          file = stderr()) # print error messages to stder
    }
    
  } else if (opt$qtltype == "MQTL") {
    cat("\n* Parsing annotated results for an Athero-Express mQTL analysis...\n")
    if (opt$resulttype == "NOM") {
      cat("\n--- nominal results ---\n")
      #str(RESULTS.toANNOTATE2)
      RESULTS.ANNOTATE = RESULTS.toANNOTATE2[,c(1,2,48,49,50,51,52,53,54,57,56,59,58, # Variant information
                                                3,4,24,25,19, # CpG information
                                                34,35,36,38, # CpG associated information
                                                39,40,41,42,43,44,45, # CpG associated information
                                                6,9,5,10,11,12)] # association statistics
    } else if (opt$resulttype == "PERM") {
      cat("\n--- permuted results ---\n")
      RESULTS.ANNOTATE = RESULTS.toANNOTATE2[,c(1,6,54,55,56,57,58,59,60,63,62,65,64, # Variant information
                                                7,8,30,31,25, # CpG information
                                                40,41,42,44,45,46,47,48,49,50,51, # CpG associated information
                                                10,15,9,11,12,16,17,18)] # association statistics
    } else {
      cat("\n\n*** ERROR *** Something is rotten in the City of Gotham; most likely a typo. Double back, please.\n\n", 
          file = stderr()) # print error messages to stder
    }
    
  } else {
    cat("\n\n*** ERROR *** Something is rotten in the City of Gotham; most likely a typo. Double back, please.\n\n", 
        file = stderr()) # print error messages to stder
  } 
  
  # De-duplicate the entries inside semicolon-separated annotation fields
  # (e.g. "A;A;B" becomes "A;B"). The values are taken from RESULTS.toANNOTATE2
  # and written into the same-named column of RESULTS.ANNOTATE. Which columns
  # are cleaned depends on the QTL type.
  cat("\n* Remove duplicate gene names...\n")
  
  # Split one field on ";", keep the first occurrence of every entry, re-join.
  collapse.unique.entries <- function(field) {
    entries <- unlist(strsplit(field, split = ";"))
    paste(unique(entries), collapse = ";")
  }
  
  if (opt$qtltype == "EQTL") {
    cat("\n...for results of a CTMM eQTL analysis in monocytes...\n")
    dedup.columns <- "GeneName"
  } else if (opt$qtltype == "MQTL") {
    cat("\n...for results of an Athero-Express mQTL analysis...\n")
    dedup.columns <- c("UCSC_RefGene_Name", "UCSC_RefGene_Accession", "UCSC_RefGene_Group")
  } else {
    cat("\n\n*** ERROR *** Something is rotten in the City of Gotham; most likely a typo. Double back, please.\n\n",
        file = stderr()) # print error messages to stderr
    dedup.columns <- character(0) # nothing to clean for an unknown QTL type
  }
  
  for (dedup.column in dedup.columns) {
    RESULTS.ANNOTATE[, dedup.column] <- vapply(RESULTS.toANNOTATE2[, dedup.column],
                                               FUN = collapse.unique.entries,
                                               FUN.VALUE = character(1),
                                               USE.NAMES = FALSE)
  }
  
  # Assign the final column names to RESULTS.ANNOTATE and replace spaces in the
  # gene-name column by underscores. Every layout starts with the same variant
  # columns and ends with the association statistics; only the annotation
  # columns in between (gene vs. CpG) and the permutation P-values differ.
  cat("\n* Correct Colnames and replace spaces in gene-names ...\n")
  
  variant.columns <- c("ProbeID", "VARIANT", "Chr", "BP", "OtherAlleleB", "CodedAlleleA",
                       "MAF", "MAC", "CAF", "HWE", "Info", "Imputation", "N")
  gene.annotation.columns <- c("GeneName", "EntrezID", "Distance_VARIANT_GENE", "Strand",
                               "Chr_Gene", "GeneTxStart", "GeneTxEnd")
  cpg.annotation.columns <- c("Distance_VARIANT_CpG", "Strand", "Chr_CpG", "BP_CpG", "ProbeType",
                              "GeneName_UCSC", "AccessionID_UCSC", "GeneGroup_UCSC", "CpG_Island_Relation_UCSC",
                              "Phantom", "DMR", "Enhancer", "HMM_Island",
                              "RegulatoryFeatureName", "RegulatoryFeatureGroup", "DHS")
  nominal.stat.columns <- c("Beta", "SE", "Nominal_P", "Bonferroni", "BenjHoch", "Q")
  permuted.stat.columns <- c("Beta", "SE", "Nominal_P", "Perm_P", "ApproxPerm_P", "Bonferroni", "BenjHoch", "Q")
  
  # Step 1: the QTL type decides the annotation columns and the gene-name column.
  if (opt$qtltype == "EQTL") {
    cat("\n...for results of a CTMM eQTL analysis in monocytes ...\n")
    annotation.columns <- gene.annotation.columns
    gene.name.column <- "GeneName"
  } else if (opt$qtltype == "MQTL") {
    cat("\n...for results of an Athero-Express mQTL analysis ...\n")
    annotation.columns <- cpg.annotation.columns
    gene.name.column <- "GeneName_UCSC"
  } else {
    cat("\n\n*** ERROR *** Something is rotten in the City of Gotham; most likely a typo. Double back, please.\n\n",
        file = stderr()) # print error messages to stderr
    annotation.columns <- NULL
  }
  
  # Step 2: the result type decides the statistics columns; an unknown result
  # type is reported and leaves the table untouched.
  if (!is.null(annotation.columns)) {
    if (opt$resulttype == "NOM") {
      stat.columns <- nominal.stat.columns
    } else if (opt$resulttype == "PERM") {
      stat.columns <- permuted.stat.columns
    } else {
      cat("\n\n*** ERROR *** Something is rotten in the City of Gotham; most likely a typo. Double back, please.\n\n",
          file = stderr()) # print error messages to stderr
      stat.columns <- NULL
    }
    
    if (!is.null(stat.columns)) {
      colnames(RESULTS.ANNOTATE) <- c(variant.columns, annotation.columns, stat.columns)
      RESULTS.ANNOTATE[[gene.name.column]] <- gsub(" ", "_", RESULTS.ANNOTATE[[gene.name.column]])
    }
  }
  
  # Drop the two intermediate annotation tables from the workspace (these are
  # in-memory objects, despite the message calling them "files").
  cat("\n* Remove temporary files...\n")
  rm(list = c("RESULTS.toANNOTATE", "RESULTS.toANNOTATE2"))
  
  #--------------------------------------------------------------------------
  ### SAVE NEW DATA ###
  # Write the parsed results as a comma-separated text file into opt$outputdir.
  # The file name is the base name of opt$resultfile with its extension
  # (including a compression suffix) removed, plus a result-type suffix:
  #   NOM  -> "<name>.nominal.all.txt" : all rows with a non-missing BenjHoch
  #   PERM -> "<name>.perm.Q0_05.txt"  : only rows with BenjHoch <= 0.05
  # Today's date is deliberately NOT part of the file name, as it causes issues
  # in downstream projects when it's the 'next day'.
  cat("\n* Saving parsed data...\n")
  if (opt$resulttype == "NOM") {
    # Without filtering on Q-value: keep every row that has a BenjHoch value.
    # is.na() is the reliable missing-value test; the former comparison against
    # the string "NA" only worked through implicit coercion to character.
    # NOTE(review): assumes BenjHoch is numeric; NaN now counts as missing too.
    # To filter on Q-value instead, select with
    #   which(RESULTS.ANNOTATE$BenjHoch <= 0.05)
    # and use the suffix ".nominal.P0_05.txt".
    write.table(RESULTS.ANNOTATE[which(!is.na(RESULTS.ANNOTATE$BenjHoch)), ],
                paste0(opt$outputdir, "/",
                       file_path_sans_ext(basename(opt$resultfile), compression = TRUE),
                       ".nominal.all.txt"),
                quote = FALSE, row.names = FALSE, col.names = TRUE, sep = ",", na = "NA", dec = ".")
  } else if (opt$resulttype == "PERM") {
    # With filtering on Q-value; which() also drops rows with a missing BenjHoch.
    # To save all rows instead, select with
    #   which(!is.na(RESULTS.ANNOTATE$BenjHoch))
    # and use the suffix ".perm.all.txt".
    write.table(RESULTS.ANNOTATE[which(RESULTS.ANNOTATE$BenjHoch <= 0.05), ],
                paste0(opt$outputdir, "/",
                       file_path_sans_ext(basename(opt$resultfile), compression = TRUE),
                       ".perm.Q0_05.txt"),
                quote = FALSE, row.names = FALSE, col.names = TRUE, sep = ",", na = "NA", dec = ".")
  } else {
    cat("\n\n*** ERROR *** Something is rotten in the City of Gotham; most likely a typo. Double back, please.\n\n",
        file = stderr()) # print error messages to stderr
  }
  
} else {
  # Not all required arguments were supplied: print the usage overview to
  # stderr and abort. Without the stop() the script would fall through to the
  # closing "All done parsing ..." message and could finish with a success
  # exit status although nothing was parsed.
  cat("*** ERROR *** You didn't specify all variables:\n
      - --p/projectdir : path to project directory, e.g. adir/somedir.\n
      - --r/resultfile  : location of the results file, including results filename, e.g. adir/somedir/qtl_nom.txt.gz.\n
      - --t/resulttype : the results type (NOM for nominal; PERM for permutation).\n
      - --q/qtltype    : the QTL analysis type (EQTL for expression QTL; MQTL for methylation QTL).\n
      - --z/analysetype    : the analysis type, cis- or trans-QTL analyse ([CIS/TRANS]).\n
      - --o/outputdir  : path to output directory, e.g. adir/somedir.\n
      - --a/annotfile  : path to annotation file of genes, e.g. refdir/annotationfile.txt.gz.\n
      - --j/genstats   : path to summary statistics of variants, e.g. adir/somedir/data_QC.stats.\n\n", 
      file = stderr()) # print error messages to stderr
  # An uncaught error halts a non-interactive Rscript run with a non-zero status.
  stop("Required arguments are missing; nothing was parsed.", call. = FALSE)
}
              
#--------------------------------------------------------------------------
### CLOSING MESSAGE
# Report which results file was handled (its base name without extension),
# print today's date and close with the banner line.
parsed.results.name <- file_path_sans_ext(basename(opt$resultfile), compression = TRUE)
done.message <- paste("All done parsing QTLtools data on [", parsed.results.name, "].\n", sep = " ")
date.message <- paste("\nToday's: ", Today, "\n", sep = " ")
cat(done.message)
cat(date.message)
cat("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n")

#--------------------------------------------------------------------------
### SAVE ENVIRONMENT | FOR DEBUGGING
# if(opt$resulttype == "NOM")
#   save.image(paste0(opt$outputdir, "/",Today,"_",file_path_sans_ext(basename(opt$resultfile), compression = TRUE),"_NOM_DEBUG_FastQTL_analysis.RData"))
# if(opt$resulttype == "PERM")
#   save.image(paste0(opt$outputdir, "/",Today,"_",file_path_sans_ext(basename(opt$resultfile), compression = TRUE),"_PERM_DEBUG_FastQTL_analysis.RData"))