Skip to content

Commit

Permalink
Merge branch 'master' into daphne-dev
Browse files Browse the repository at this point in the history
  • Loading branch information
bbitarello authored Apr 18, 2024
2 parents f9aebb2 + 5ca793c commit d60c89f
Show file tree
Hide file tree
Showing 41 changed files with 5,601 additions and 1,361 deletions.
2 changes: 2 additions & 0 deletions .Renviron
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
RENV_CONFIG_WATCHDOG_ENABLED = FALSE
R_BUILD_TAR=tar
40 changes: 40 additions & 0 deletions .github/workflows/r.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.
#
# See https://github.com/r-lib/actions/tree/master/examples#readme for
# additional example workflows available for the R community.

name: R

on:
push:
branches: [ "master" ]
pull_request:
branches: [ "master" ]

permissions:
contents: read

jobs:
build:
runs-on: macos-latest
strategy:
matrix:
r-version: ['3.6.3', '4.1.1']

steps:
- uses: actions/checkout@v3
- name: Set up R ${{ matrix.r-version }}
uses: r-lib/actions/setup-r@f57f1301a053485946083d7a45022b278929a78a
with:
r-version: ${{ matrix.r-version }}
- name: Install dependencies
run: |
install.packages(c("remotes", "rcmdcheck"))
remotes::install_deps(dependencies = TRUE)
shell: Rscript {0}
- name: Check
run: rcmdcheck::rcmdcheck(args = "--no-manual", error_on = "error")
shell: Rscript {0}
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,5 @@
.Rdata
.httr-oauth
.DS_Store
local_only/
inst/doc
13 changes: 8 additions & 5 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,14 @@ Imports:
dplyr,
glue,
magrittr,
reticulate,
slendr,
stringr,
testthat,
tictoc
stringr
Depends:
R (>= 2.10)
Config/testthat/edition: 3
Suggests:
knitr,
rmarkdown,
testthat (>= 3.1.9),
withr
Config/testthat/edition: 3
VignetteBuilder: knitr
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Generated by roxygen2: do not edit by hand

export("%>%")
export(modmod_read_vcf)
export(ncd1)
export(ncd2)
export(outfile_path)
Expand Down
25 changes: 13 additions & 12 deletions R/count_alleles.R
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
#' Count derived or ancestral alleles for a given vcf position
#' @param x A vector containing 0s and 1s corresponding to allele states
#' @param der Return derived. Logical. If TRUE, returns the counts of
#' derived (alternate)
#' alleles. If FALSE, returns ancestral (ref) counts
#' @param x A vector containing 0s and 1s corresponding to allele states. 0/0: homozygous reference; 0/1: heterozygous; 1/1: homozygous alternate

#' @param split Pattern to look for. Inherited from stringr::str_split
#' @param index 1 or 2 or c(1,2). Which alleles to consider.
#' @examples data.table::data.table(col1="1|1", col2="1|0", col3="0|0") %>%
#' dplyr::summarise(across(col1:col3, .count_alleles))
.count_alleles<-function(x, split="|",index=c(1,2), der = F){
x<-.split_geno(x = x, split = split, index = index)
if(der == T){sum(x, na.rm=T)}else if (der == F){sum(!is.na(x))-sum(x, na.rm=T)}


.count_alleles<-function(x, split="|"){
x<-.split_geno(x = x, split = split)
#if(der == T){sum(x, na.rm=T)}else if (der == F){sum(!is.na(x))-sum(x, na.rm=T)}
#sum(!is.na(x))-sum(x, na.rm=T)
#ref<-sum(x==0, na.rm=T) #number of ref alleles
sum(x==1, na.rm=T) #number of alt alleles

}
.count_nonmissing<-function(x, split="|",index=c(1,2)){
x<-.split_geno(x = x, split = split, index = index)
.count_nonmissing<-function(x, split="|"){
x<-.split_geno(x = x, split = split)
sum(!is.na(x))
}
39 changes: 39 additions & 0 deletions R/modified_read_vcf.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#' Read vcf
#'
#' @param x The path and name of a vcf file
#' @return Returns a data.table object containing only SNPs
#' @export
#'
#' @examples read_vcf(x=system.file(package="balselr", "example.vcf"))
#' @import data.table
#' @importFrom data.table ":="
#'
modmod_read_vcf <- function(x = "inst/example.vcf", pos.range = NULL, id.range = NULL) {
inp <- data.table::fread(x, skip = "##", header = TRUE)
data.table::setnames(inp, "#CHROM", "CHR")

inp <- inp[REF %in% c("A", "C", "T", "G") & ALT %in% c("A", "C", "T", "G")]

if (!is.null(pos.range)) {
if (length(pos.range) == 2) {
inp <- inp[POS >= pos.range[1] & POS <= pos.range[2]]
} else {
inp <- inp[POS %in% pos.range]
}
}

col.range <- 1:9

if (!is.null(id.range)) {
if (length(id.range) == 2) {
col.range <- c(col.range, (10 + id.range[1] - 1):(10 + id.range[2] - 1))
} else {
col.range <- c(col.range, 9 + id.range)
}
col.range <- col.range[col.range <= ncol(inp)]
}

inp <- inp[, .SD, .SDcols = col.range]

return(inp)
}
125 changes: 118 additions & 7 deletions R/ncd1.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,114 @@
#' Calculate non-central statistic (NCD1)
#'
#' @param x A data.table object with
#' columns: CHR, POS, REF, ALT, tx_1 (number of alternate allele copies),
#' tn_1 (total number of alleles).
#' @param tf Target frequency.
#' @param w Window size in bp. Default is 3000.
#' @param ncores Number of cores. Increasing this can speed things up for you.
#' Default is 2.
#' @param minIS Minimum number of informative sites. Default is 2. Windows with
#' less informative sites than this threshold are discarded.
#' @param by Define how to scan the genome. "POS" (default) defined sliding windows based on w. "IS" defined windows around each informative site.
#' @return A data.table object with columns: Win.ID, S (sites), IS (informative sites), tf, ncd1
#' @export
#'
#' @examples ncd1(x=ncd1_input, tf=0.5, w=3000, ncores=2, minIS=8)
#' @import data.table
#' @importFrom data.table ":="
#'
# to do: check that ncd1_input (internal object) is the same as the output of example command for ncd1 in the parse_vcf function.
# something like ncd1(x=parse_vcf(infile=system.file(package="balselr", "example.vcf"), n0=108, type="ncd1"), tf=0.5, w=3000, ncores=2, minIS=8) == ncd1(x=ncd1_input, tf=0.5, w=3000, ncores=2, minIS=8)
# do the equivalent thing for ncd2
ncd1 <- function(x = x,
tf = 0.5,
w = 3000,
ncores = 2,
minIS = 2,
by="POS") {
assertthat::assert_that(length(unique(x[, CHR])) == 1, msg = "Run one
chromosome at a time\n")

x[, AF := tx_1 / tn_1] #allele relative frequency
x[, ID := seq_along(CHR)]
w1 <- w / 2
polpos <-
x[AF != 1 & AF != 0]$ID #select positions that are polymorphic
x[, SNP := ifelse(ID %in% polpos, T, F)] #logical: True if SNP, False if not.
x[, MAF := ifelse(AF > 0.5, 1 - AF, AF)]
####################################################################################
if(by=="POS"){
#windows (sliding)
#to do: add some checks here
vec<-data.table(start=seq(from=x$POS[1], to=x$POS[nrow(x)], by=w1))
vec[,end:=start+w]
setkey(vec, start, end)
x[,POS2:=POS]
setnames(x, c("POS","POS2"),c("start","end"))
res_0<-foverlaps(x, vec, type="within")[, Win.ID:=paste0(CHR,"_",start,"_",end)][, POS:=start][,.(POS,AF, ID,SNP,MAF, Win.ID)]
res_0<-res_0[SNP==T][, tf:=tf]
res_1<-unique(res_0[,.(S = sum(SNP),
IS = sum(SNP),
tf = tf),
by= Win.ID])
res_2<-res_0[, .(temp2 = sum((MAF - tf) ^ 2)), by=Win.ID]
res4 <- merge(res_1, res_2) %>%
dplyr::filter(IS >= minIS) %>%
as.data.table
res4<-res4[, ncd1:=sqrt((temp2)/IS)][,temp2:=NULL] %>% dplyr::arrange(Win.ID) %>% as.data.table

} else if(by=="IS"){
###################################
#windows centered on SNPs
x2 <- x[SNP == T] #select only polymorphic positions
mylist <-
parallel::mclapply(x2$POS, function(y) {
x2[,start:=y-w1][,end:=y+w1][POS >= start &
POS < end][, Mid:=y][,Win.ID:=paste0(CHR,"_",start,"_",end)][, tf:=tf][, .(POS, AF, ID, SNP, MAF, Mid, Win.ID, tf)]
},
mc.cores =
ncores) #creates list where each element is a genomic window

mylist2 <-
do.call(rbind, mylist)

#remove windows where Win.ID includes negative numbers
mylist2<-mylist2[-grep("-",mylist2$Win.ID),]
#remove window where mid==last POS in dataset
mylist2<-mylist2[Mid!=mylist2$POS[nrow(mylist2)]]
#to do: check that mylist2 is a data table
res <-
unique(mylist2[, .(S = sum(SNP),
IS = sum(SNP),
tf = tf),
by = Win.ID])
#to do: add some checks here
res1 <- unique(as.data.table(mylist2[, .(MidMaf = MAF[which(Mid == POS)],
Mid = Mid), by=Win.ID]))
# TopMaf = which(min(abs(MAF - tf)), Win.ID = Win.ID) %>%

#to do: add some checks here
res2 <- merge(res, res1)
#to do: add some checks here
res3 <-
mylist2[, .(temp2 = sum((MAF - tf) ^ 2)), by=Win.ID]


res4 <- merge(res2, res3) %>%
dplyr::mutate(ncd1:=sqrt((temp2)/IS)) %>%
dplyr::filter(IS >= minIS) %>%
dplyr::select(Win.ID, S, IS, tf, MidMaf, Mid) %>%
dplyr::arrange(ncd1) %>%
as.data.table

}
split_ids <- lapply(strsplit(res4$Win.ID, "_"), as.numeric)
sort_values <- sapply(split_ids, function(x) x[2])
res5 <- res4[order(sort_values), ]

return(res5)
}

#' Calculate non-central statistic (NCD1)
#'
#' @param x A data.table object
Expand Down Expand Up @@ -161,13 +272,13 @@ ncd1 <- function(x = x,
tf)]
} else if (selectwin == "mid") {
res4 <- res4[which.min(res4$NCD1)][, .(Win.ID,
SegSites,
IS,
CenMaf,
Mid,
MidMaf,
NCD1,
tf)]
SegSites,
IS,
CenMaf,
Mid,
MidMaf,
NCD1,
tf)]
} else{
res4 <- res4[, .(Win.ID,
SegSites,
Expand Down
46 changes: 23 additions & 23 deletions R/ncd2.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,18 +27,18 @@
#' @import data.table
#' @importFrom data.table ":="
ncd2 <- function(
x = x,
tf = 0.5,
fold = T,
w = 3000,
by.snp = TRUE,
mid = FALSE,
ncores = 2,
selectwin = "val",
targetpos = NULL,
minIS = 2,
label = NULL,
verbose = T) {
x = x,
tf = 0.5,
fold = T,
w = 3000,
by.snp = TRUE,
mid = FALSE,
ncores = 2,
selectwin = "val",
targetpos = NULL,
minIS = 2,
label = NULL,
verbose = T) {
Win.ID <- SegSites <- POS <- V1 <- temp <- NCD2 <- CHR <- AF <- AF2 <- NULL
tx_1 <- tn_1 <- tx_1 <- tx_2 <- tn_2 <- ID <- SNP <- FD <- MAF <- NULL
tictoc::tic("Total runtime")
Expand Down Expand Up @@ -70,15 +70,15 @@ ncd2 <- function(
mylist <-
parallel::mclapply(x2[polpos2, ]$POS, function(y) {
x2[POS >= y - w1 &
POS < y + w1][, .(POS, AF, AF2, ID, SNP, FD, tx_1, tx_2, MAF)]
POS < y + w1][, .(POS, AF, AF2, ID, SNP, FD, tx_1, tx_2, MAF)]
},
mc.cores =
ncores)
mylist <-
do.call(rbind,
parallel::mclapply(1:length(mylist), function(y)
mylist[[y]][, Mid := x2[polpos2,]$POS[y]][, Win.ID :=
y],
y],
mc.cores = ncores))
mylist <- data.table::setDT(mylist)
# }
Expand Down Expand Up @@ -141,7 +141,7 @@ ncd2 <- function(
res4 <- merge(res2, res3) %>% as.data.table
res4 <- res4 %>% dplyr::ungroup() %>% as.data.table
mini_fun<-function(x,y){
sum((rep(0, x)-y)^2)
sum((rep(0, x)-y)^2)
}
res4[, temp4 := mini_fun(unique(FDs),unique(tf)), by=Win.ID]
res4[, NCD2 := sqrt(((temp2 + temp4) / IS)), by = Win.ID]
Expand Down Expand Up @@ -176,14 +176,14 @@ ncd2 <- function(

} else if (selectwin == "mid") {
res4 <- res4[which.min(res4$NCD2)][, .(Win.ID,
SegSites,
IS,
FDs,
CenMaf,
Mid,
MidMaf,
NCD2,
tf)]
SegSites,
IS,
FDs,
CenMaf,
Mid,
MidMaf,
NCD2,
tf)]

} else{
res4 <- res4[, .(Win.ID,
Expand Down
Loading

0 comments on commit d60c89f

Please sign in to comment.