From 91425e4a06288f9097a4c9029ab87eaa5fbe9a71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A1rbara=20Bitarello?= Date: Thu, 18 Apr 2024 12:14:03 -0400 Subject: [PATCH] ncd1/2 --- R/ncd1.R | 195 +++++++++++++++++++++++++++++++++++++++++++ R/ncd2.R | 248 ++++++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 386 insertions(+), 57 deletions(-) diff --git a/R/ncd1.R b/R/ncd1.R index 99fe2a1..748873d 100644 --- a/R/ncd1.R +++ b/R/ncd1.R @@ -109,3 +109,198 @@ ncd1 <- function(x = x, return(res5) } +#' Calculate non-central statistic (NCD1) +#' +#' @param x A data.table object +#' @param tf Target frequency +#' @param fold Logical. If TRUE, NCD1 will use minor allele frequencies. +#' @param w Window size in bp. Default is 1000 +#' @param by.snp Logical. If TRUE, windows are defined around each SNP +#' in the input data. Else, slidding windows in the range first pos:last pos +#' will be used. +#' @param mid Logical. If TRUE runs NCD centered on a core SNP frequency instead of a pre-defined tf. Requires targetpos. +#' @param ncores Number of cores. Increasing this can spead things up for you. +#' Default is 4. +#'@param selectwin Select window. "val" returns the window with the lowest test. "maf" returns the window +#' which has the lowest difference between its central MAF and the tf."mid" returns the window +#' centered closest on a value provided by the user under targetpos."all" returns all windows (idea for when the user wants to select windows after scanning a genome) +#' @param targetpos Position of the target SNP. Only needed for mid==T +#' @param minIS Minimum number of informative sites. Default is 2. Windows with +#' less informative sites than this threshold are discarded. +#' @param label An optional label to include as the last column of the output. Defaults to value in selectwin. +#' @param verbose Logical. If TRUE, progress reports will be printed as the +#' function runs. +#' @return A data.table object +#' @export +#' +#' @examples ncd1(x=ncd1_input, selectwin="mid", targetpos=15000, mid=TRUE) +#' @import data.table +#' @importFrom data.table ":=" +ncd1 <- function(x = x, + tf = 0.5, + fold = T, + + w = 3000, + by.snp = TRUE, + mid = FALSE, + ncores = 2, + selectwin = "val", + targetpos = NULL, + minIS = 2, + label = NULL, + verbose = T) { + Win.ID <- + SegSites <- + POS <- V1 <- temp <- NCD1 <- CHR <- AF <- NULL + tx_1 <- + tn_1 <- + AF2 <- + tx_2 <- tn_2 <- ID <- SNP <- FD <- MAF <- NULL + tictoc::tic("Total runtime") + assertthat::assert_that(length(unique(x[, CHR])) == 1, msg = "Run one + chromosome at a time\n") + if (mid == TRUE) { + assertthat::assert_that(is.null(targetpos) == FALSE, msg = "NCD_mid requires a targetpos") + assertthat::assert_that(selectwin == "mid", msg = "If mid=T, selectwin must be=='mid'.") + } + if (selectwin == "mid") { + assertthat::assert_that(mid == TRUE, msg = "If selectwin=='mid', mid must be TRUE.") + assertthat::assert_that(mid == TRUE, msg = "NCD_mid requires a targetpos.") + } + + x[, AF := tx_1 / tn_1] + x[, ID := seq_along(CHR)] + w1 <- w / 2 + polpos <- x[AF != 1 & AF != 0]$ID + x[, SNP := ifelse(ID %in% polpos, T, F)] + x <- x[SNP == T] + x[, MAF := ifelse(AF > 0.5, 1 - AF, AF)] + mylist <- + parallel::mclapply(x$POS, function(y) { + x[POS >= y - w1 & + POS < y + w1][, .(POS, AF, ID, SNP, tx_1, MAF)] + }, + mc.cores = + ncores) + mylist <- + do.call(rbind, + parallel::mclapply(1:length(mylist), function(y) + mylist[[y]][, Mid := x$POS[y]][, Win.ID := + y], + mc.cores = ncores)) + mylist <- data.table::setDT(mylist) + # } + + if (mid == TRUE) { + mylist[, temp := abs(Mid - targetpos), by = Win.ID] + mylist <- mylist[temp == min(temp)] + mylist[, temp := NULL] + mylist[, tf := round(mylist[POS == Mid]$MAF[1], 2)] + + res <- + mylist[POS != Mid][, .(SegSites = sum(SNP), + IS = sum(SNP)), + by = Win.ID] + res[, tf := round(mylist[1, tf], 2)] + res1 <- dplyr::bind_cols(( + mylist %>% + dplyr::summarise( + MidMaf = MAF[which(Mid == POS)], + Mid = Mid[1], + Win.ID = Win.ID[1] + ) + ), + (mylist[POS != Mid] %>% dplyr::summarise(CenMaf = max( + abs(MAF - tf) + )))) %>% as.data.table #target SNP excluded + + res2 <- merge(res, res1) + res3 <- + mylist %>% dplyr::filter(SNP == T) %>% + dplyr::filter(POS != Mid) %>% #exclude target SNP + dplyr::summarise(temp2 = sum((MAF - tf) ^ 2), Win.ID = Win.ID[1]) %>% + as.data.table + } else{ + mylist[, tf := tf] + res <- + mylist[, .(SegSites = sum(SNP), + IS = sum(SNP)), + by = Win.ID] + res[, tf := tf] + res1 <- mylist %>% dplyr::group_by(Win.ID) %>% + dplyr::summarise( + MidMaf = MAF[which(Mid == POS)], + Mid = Mid[1], + CenMaf = max(abs(MAF - tf)), Win.ID = Win.ID) %>% + dplyr::ungroup() %>% + as.data.table + + res2 <- merge(res, res1) + res3 <- + mylist %>% dplyr::filter(SNP == T) %>% dplyr::group_by(Win.ID) %>% + dplyr::summarise(temp2 = sum((MAF - tf) ^ 2)) %>% dplyr::ungroup() %>% + as.data.table + } + + res4 <- merge(res2, res3) %>% as.data.table + res4 <- res4 %>% dplyr::ungroup() %>% as.data.table + res4[, NCD1 := sqrt(temp2 / IS), by = Win.ID] + res4[, temp2 := NULL] + + if (is.null(minIS) == "FALSE") { + res4 <- res4[IS >= minIS] + } + if (selectwin == "val") { + res4 <- + res4[which.min(res4$NCD1), ][, .(Win.ID, + SegSites, + IS, + CenMaf, + Mid, + MidMaf, + NCD1, + tf)] + } else if (selectwin == "maf") { + res4 <- + res4[which.min(res4$CenMaf), ][, .(Win.ID, + SegSites, + IS, + CenMaf, + Mid, + MidMaf, + NCD1, + tf)] + } else if (selectwin == "mid") { + res4 <- res4[which.min(res4$NCD1)][, .(Win.ID, + SegSites, + IS, + CenMaf, + Mid, + MidMaf, + NCD1, + tf)] + } else{ + res4 <- res4[, .(Win.ID, + SegSites, + IS, + CenMaf, + Mid, + MidMaf, + NCD1, + tf)] + } + + setnames(res4, + "NCD1", + ifelse(mid == TRUE, "NCD1_mid", glue::glue("NCD1_{tf}"))) + + if (is.null(label) == F) { + res4 <- res4[, Method := label] + } else{ + res4<-res4[, Method := paste(selectwin)] + } + if (verbose == T) { + tictoc::toc() + } + print(res4) +} diff --git a/R/ncd2.R b/R/ncd2.R index f26375d..fd74306 100644 --- a/R/ncd2.R +++ b/R/ncd2.R @@ -1,78 +1,212 @@ #' Calculate non-central statistic (NCD2) #' -#' @param x A data.table object with -#' columns: CHR, POS, REF, ALT, tx_1 (number of alternate allele copies), tn_1 (total number of alleles), tx_2, and tn_2. -#' @param tf Target frequency. Any value between 0 and 0.5. Default is 0.5. -#' @param w Window size in bp. Default is 1000. -#' @param ncores Number of cores. Increasing this can speed things up for you. -#' Default is 2. +#' @param x A data.table object +#' @param tf Target frequency +#' @param fold Logical. If TRUE, NCD2 will use minor allele frequencies. +#' @param w Window size in bp. Default is 1000 +#' @param by.snp Logical. If TRUE, windows are defined around each SNP +#' in the input data. Else, slidding windows in the range first pos:last pos +#' will be used. +#' @param mid Logical. If TRUE runs NCD centered on a core SNP frequency instead of a pre-defined tf. Requires targetpos. +#' @param ncores Number of cores. Increasing this can spead things up for you. +#' Default is 4. +#'@param selectwin Select window. "val" returns the window with the lowest test. "maf" returns the window +#' which has the lowest difference between its central MAF and the tf."mid" returns the window +#' centered closest on a value provided by the user under targetpos."all" returns all windows (idea for when the user wants to select windows after scanning a genome) +#' @param targetpos Position of the target SNP. Only needed for mid==T #' @param minIS Minimum number of informative sites. Default is 2. Windows with #' less informative sites than this threshold are discarded. -#' #' @param by Define how to scan the genome. "POS" (default) defined sliding windows based on w. "IS" defined windows around each informative site. -#' @return A data.table object with columns: TO DO +#' @param label An optional label to include as the last column of the output +#' @param verbose Logical. If TRUE, progress reports will be printed as the +#' function runs. +#' @return A data.table object #' @export #' -#' @examples ncd2(x=ncd2_input, tf=0.5, w=3000, ncores=2, minIS=8) +#' @examples ncd2(x=ncd2_input, selectwin="mid", targetpos=15000, mid=TRUE) +#' ncd2(x=ncd2_input, selectwin="val", targetpos=NULL, mid=F) #' @import data.table #' @importFrom data.table ":=" -#' -# to do: check that ncd2_input (internal object) is the same as the output of example command for ncd2 in the parse_vcf function. -ncd2 <- function(x = x, - tf = 0.5, - w = 3000, - ncores = 2, - minIS = 2, - by= "POS") { - - +ncd2 <- function( + x = x, + tf = 0.5, + fold = T, + w = 3000, + by.snp = TRUE, + mid = FALSE, + ncores = 2, + selectwin = "val", + targetpos = NULL, + minIS = 2, + label = NULL, + verbose = T) { + Win.ID <- SegSites <- POS <- V1 <- temp <- NCD2 <- CHR <- AF <- AF2 <- NULL + tx_1 <- tn_1 <- tx_1 <- tx_2 <- tn_2 <- ID <- SNP <- FD <- MAF <- NULL + tictoc::tic("Total runtime") assertthat::assert_that(length(unique(x[, CHR])) == 1, msg = "Run one chromosome at a time\n") + if (mid == TRUE) { + assertthat::assert_that(is.null(targetpos) == FALSE, msg = "NCD_mid requires a targetpos") + assertthat::assert_that(selectwin == "mid", msg = "If mid=T, selectwin must be=='mid'.") + } + if (selectwin == "mid") { + assertthat::assert_that(mid == TRUE, msg = "If selectwin=='mid', mid must be TRUE.") + assertthat::assert_that(mid == TRUE, msg = "NCD_mid requires a targetpos.") + } - x[, AF := tx_1 / tn_1] #allele relative frequency - x[, AF2 := tx_2 / tn_2] #allele relative frequency + x[, AF := tx_1 / tn_1] + x[, AF2 := tx_2 / tn_2] x[, ID := seq_along(CHR)] w1 <- w / 2 - polpos <- x[AF != 1 & AF != 0]$ID #select positions that are polymorphic - fdpos <- sort(c(x[AF == 1 & AF2 == 0]$ID, #fixed difference - x[AF == 0 & AF2 == 1]$ID)) #also fixed difference - #to do: check that the there is no intersection between polpos and fdpos. - x[, SNP := ifelse(ID %in% polpos, T, F)] #logical: True if SNP, False if not. - x[, FD := ifelse(ID %in% fdpos, T, F)] #logical: True if FD, False if not. + polpos <- x[AF != 1 & AF != 0]$ID + fdpos <- sort(c(x[AF == 1 & AF2 == 0]$ID, x[AF == 0 & AF2 == 1]$ID)) + x[, SNP := ifelse(ID %in% polpos, T, F)] + x[, FD := ifelse(ID %in% fdpos, T, F)] x[, MAF := ifelse(AF > 0.5, 1 - AF, AF)] - #################################################################################### - if(by=="POS"){ - #windows (sliding) - #to do: add some checks here - vec<-data.table(start=seq(from=x$POS[1], to=x$POS[nrow(x)], by=w1)) - vec[,end:=start+w] - setkey(vec, start, end) - x[,start:=POS][,end:=POS] + x2 <- x[SNP == T | FD == T] + x2[, ID := seq_along(CHR)] + polpos2<-x2[SNP==T]$ID + fdpos2<-x2[FD==T]$ID + # if(by.snp==T){ + mylist <- + parallel::mclapply(x2[polpos2, ]$POS, function(y) { + x2[POS >= y - w1 & + POS < y + w1][, .(POS, AF, AF2, ID, SNP, FD, tx_1, tx_2, MAF)] + }, + mc.cores = + ncores) + mylist <- + do.call(rbind, + parallel::mclapply(1:length(mylist), function(y) + mylist[[y]][, Mid := x2[polpos2,]$POS[y]][, Win.ID := + y], + mc.cores = ncores)) + mylist <- data.table::setDT(mylist) + # } + #################### + if (mid == TRUE) { + mylist[, temp := abs(Mid - targetpos), by = Win.ID] + mylist <- mylist[temp == min(temp)] + mylist[, temp := NULL] + mylist[, tf := round(mylist[POS == Mid]$MAF[1], 2)] - res_0<-foverlaps(x, vec, type="within")[, Win.ID:=paste0(CHR,"_",start,"_",end)][,.(POS, ID,SNP,FD,MAF, Win.ID)] - res_0<-res_0[SNP==T | FD==T][, tf:=tf] - res_1<-unique(res_0[,.(S = sum(SNP), - Subst = sum (FD), - IS = sum(SNP) + sum(FD), - tf = tf), - by= Win.ID]) + res <- + mylist[POS != Mid][, .(SegSites = sum(SNP), + FDs = sum(FD), + IS = sum(SNP)+sum(FD)), + by = Win.ID] + res[, tf := round(mylist[1, tf], 2)] + res1 <- dplyr::bind_cols(( + mylist %>% + dplyr::summarise( + MidMaf = MAF[which(Mid == POS)], + Mid = Mid[1], + Win.ID = Win.ID[1] + ) + ), + (mylist[POS != Mid] %>% dplyr::summarise(CenMaf = max( + abs(MAF - tf) + )))) %>% as.data.table #target SNP excluded - setkey(res_0, Win.ID) - setkey(res_1, Win.ID) - res_2<-res_0[res_1] - #to do: check that each Win.ID has the number of rows given by IS - res_3<-res_2[, .(ncd2 = sqrt(sum((MAF-tf)^2)/IS)), by=Win.ID] - res_3<-unique(res_3) - res_2<-res_2[,.(Win.ID, tf, S, Subst, IS)] - res_2<-unique(res_2) - res4 <- merge(res_2, res_3) %>% - dplyr::filter(IS >= minIS) %>% - dplyr::arrange(ncd2) %>% + res2 <- merge(res, res1) + res3 <- + mylist %>% dplyr::filter(SNP == T) %>% + dplyr::filter(POS != Mid) %>% #exclude target SNP + dplyr::summarise(temp2 = sum((MAF - tf) ^ 2), Win.ID = Win.ID[1]) %>% + as.data.table + } else{ + mylist[, tf := tf] + res <- + mylist[, .(SegSites = sum(SNP), + FDs = sum(FD), + IS = sum(SNP)+sum(FD)), + by = Win.ID] + res[, tf := tf] + res1 <- mylist %>% dplyr::group_by(Win.ID) %>% + dplyr::summarise( + MidMaf = MAF[which(Mid == POS)], + Mid = Mid[1], + CenMaf = max(abs(MAF - tf)), Win.ID = Win.ID) %>% + dplyr::ungroup() %>% as.data.table - }else if (by=="IS"){ + res2 <- merge(res, res1) + res3 <- + mylist %>% dplyr::filter(SNP == T) %>% dplyr::group_by(Win.ID) %>% + dplyr::summarise(temp2 = sum((MAF - tf) ^ 2)) %>% dplyr::ungroup() %>% + as.data.table + } -#to do: write this code. + #################### -} - return(res4) + res4 <- merge(res2, res3) %>% as.data.table + res4 <- res4 %>% dplyr::ungroup() %>% as.data.table + mini_fun<-function(x,y){ + sum((rep(0, x)-y)^2) + } + res4[, temp4 := mini_fun(unique(FDs),unique(tf)), by=Win.ID] + res4[, NCD2 := sqrt(((temp2 + temp4) / IS)), by = Win.ID] + res4[, temp2 := NULL] + res4[, temp4 := NULL] + if (is.null(minIS) == "FALSE") { + res4 <- res4[IS >= minIS] + } + if (selectwin == "val") { + res4 <- + res4[which.min(res4$NCD2), ][, .(Win.ID, + SegSites, + IS, + FDs, + CenMaf, + Mid, + MidMaf, + NCD2, + tf)] + + } else if (selectwin == "maf") { + res4 <- + res4[which.min(res4$CenMaf), ][, .(Win.ID, + SegSites, + IS, + FDs, + CenMaf, + Mid, + MidMaf, + NCD2, + tf)] + + } else if (selectwin == "mid") { + res4 <- res4[which.min(res4$NCD2)][, .(Win.ID, + SegSites, + IS, + FDs, + CenMaf, + Mid, + MidMaf, + NCD2, + tf)] + + } else{ + res4 <- res4[, .(Win.ID, + SegSites, + IS, + FDs, + CenMaf, + Mid, + MidMaf, + NCD2, + tf)] + } + + setnames(res4,"NCD2", + ifelse(mid == TRUE, "NCD2_mid", glue::glue("NCD2_{tf}"))) + + + + if (is.null(label) == F) { + res4 <- res4[, label := label] + } + if (verbose == T) { + tictoc::toc() + } + print(res4) }