Skip to content

Commit

Permalink
Initial R package release (#1)
Browse files Browse the repository at this point in the history
* basic file structure; R and macos gitignore; code and docs for feat_scale_norm.R

* basic rmarkdown readme; r cmd check github action

* assessments.R

* build out example in readme

* simplify readme table construction; add comments

* kernel_weights.R

* sl_screeners.R; import methods

* sl_wrappers.R

* learners.R; predict.R; updates

* export function

* to save on memory, don't retain sl lib fits

* readme with fully worked example

* tweak readme details

* update r cmd check gha to only stop for errors

* regen readme with taller plots

* try to import a specific version of a pkg archived from cran

* try installing from cran github instead

* try installing lme4 dependencies manually

* forgot to sudo

* install lme4 via apt

* try to force a newer version of lme4
  • Loading branch information
saraemoore committed Nov 7, 2023
1 parent f8358fe commit 7ae8e36
Show file tree
Hide file tree
Showing 49 changed files with 3,213 additions and 9 deletions.
2 changes: 2 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
^\.github$
^README.Rmd
1 change: 1 addition & 0 deletions .github/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.html
47 changes: 47 additions & 0 deletions .github/workflows/R-CMD-check.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
on:
push:
branches: [main]
pull_request:
branches: [main]

name: R-CMD-check

jobs:
R-CMD-check:
runs-on: ${{ matrix.config.os }}

name: ${{ matrix.config.os }} (${{ matrix.config.r }})

strategy:
fail-fast: false
matrix:
config:
- {os: windows-latest, r: 'release'}
- {os: ubuntu-latest, r: 'release'}

env:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
R_KEEP_PKG_SOURCE: yes

steps:
- uses: actions/checkout@v3

- uses: r-lib/actions/setup-pandoc@v2

- uses: r-lib/actions/setup-r@v2
with:
r-version: ${{ matrix.config.r }}
http-user-agent: ${{ matrix.config.http-user-agent }}
use-public-rspm: true

- uses: r-lib/actions/setup-r-dependencies@v2
with:
extra-packages: any::rcmdcheck
needs: check
upgrade: 'TRUE'

- uses: r-lib/actions/check-r-package@v2
with:
upload-snapshots: true
error-on: '"error"'
35 changes: 28 additions & 7 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
### R ###

# History files
.Rhistory
.Rapp.history

# Session Data files
.RData
.RDataTmp

# User-specific files
.Ruserdata
Expand Down Expand Up @@ -39,11 +40,31 @@ vignettes/*.pdf
# R Environment Variables
.Renviron

# pkgdown site
docs/
### macOS ###

# General
.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two \r
Icon

# Thumbnails
._*

# translation temp files
po/*~
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# RStudio Connect folder
rsconnect/
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
38 changes: 38 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
Package: yall
Title: Yet Another Local Learner (YALL)
Version: 0.1.0
Date: 2023-11-06
Authors@R:
person(given = "Sara",
family = "Moore",
role = c("aut", "cre"),
email = "[email protected]",
comment = c(ORCID = "0000-0001-7996-8003"))
Description: A localized machine learning algorithm inspired by and built on Cross-Validated SuperLearner.
Imports:
methods,
dplyr,
magrittr,
purrr,
tidyr,
tibble,
broom,
kedd,
scales,
SuperLearner,
SuperSelector,
SLScreenExtra,
origami,
cvAUC,
ROCR,
yardstick,
future.apply,
strip
Remotes:
saraemoore/SuperSelector,
saraemoore/SLScreenExtra,
cran/kedd
License: GPL-3
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
60 changes: 60 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Generated by roxygen2: do not edit by hand

export(GridLearner)
export(SL.glm.mean)
export(assess_predictions)
export(distKernelWeights)
export(predictAllYALL)
export(screen.corRank2.wgtd)
export(screen.corRank3.wgtd)
export(screen.corRank4.wgtd)
export(screen.corRank5.wgtd)
export(screen.corRank6.wgtd)
export(summarizeYALL)
export(trainYALL)
export(trainYALLfold)
import(SLScreenExtra)
import(SuperLearner)
importFrom(ROCR,performance)
importFrom(ROCR,prediction)
importFrom(SLScreenExtra,screen.wgtd.corRank)
importFrom(SuperSelector,cvSLFeatureSelector)
importFrom(SuperSelector,factor_to_indicator)
importFrom(SuperSelector,groupBySelectionSet)
importFrom(SuperSelector,makeLibArgNumeric)
importFrom(broom,tidy)
importFrom(cvAUC,ci.cvAUC)
importFrom(dplyr,arrange)
importFrom(dplyr,bind_cols)
importFrom(dplyr,bind_rows)
importFrom(dplyr,distinct)
importFrom(dplyr,do)
importFrom(dplyr,full_join)
importFrom(dplyr,group_by)
importFrom(dplyr,mutate)
importFrom(dplyr,mutate_all)
importFrom(dplyr,n)
importFrom(dplyr,one_of)
importFrom(dplyr,select)
importFrom(dplyr,slice)
importFrom(dplyr,summarize)
importFrom(dplyr,transmute)
importFrom(dplyr,ungroup)
importFrom(future.apply,future_lapply)
importFrom(kedd,kernel.fun)
importFrom(magrittr,`%>%`)
importFrom(methods,as)
importFrom(origami,training)
importFrom(origami,validation)
importFrom(purrr,map)
importFrom(purrr,map2)
importFrom(scales,percent)
importFrom(stats,gaussian)
importFrom(stats,glm)
importFrom(stats,predict)
importFrom(stats,setNames)
importFrom(strip,strip)
importFrom(tibble,as_tibble)
importFrom(tibble,tibble)
importFrom(tidyr,unnest)
importFrom(yardstick,pr_auc_vec)
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# yall 0.1.0

Initial version.
49 changes: 49 additions & 0 deletions R/assessments.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#' Cross-validated AUC estimate with confidence interval as a formatted string
#'
#' @param yhat A vector of predicted outcome values
#' @param y A vector of observed outcome values
#' @param folds Vector of fold IDs. If \code{NULL}, the standard error will be
#' estimated as though there were a single fold (no cross-validation).
#' @param confidence Confidence level; defaults to \code{0.95}
#' @return A character string of the form "AUC = point (lower, upper)"
#' @importFrom cvAUC ci.cvAUC
pretty_cvauc_ci <- function(yhat, y, folds = NULL, confidence = 0.95) {
  est <- cvAUC::ci.cvAUC(
    predictions = yhat,
    labels = y,
    folds = folds,
    confidence = confidence
  )
  # Render the point estimate and CI bounds with 3 decimal places each
  sprintf("AUC = %.3f (%.3f, %.3f)", est$cvAUC, est$ci[1], est$ci[2])
}

#' Prepare data.frames for multiple model assessment visualizations
#'
#' @param yhat A vector of predicted outcome values
#' @param y A vector of observed outcome values
#' @param yhat_label Optional label to describe the model output. Useful when
#' multiple models will be compared in a single visualization. Defaults to
#' "Predictions".
#' @return A named list containing 4 \code{data.frame}s: "res" (containing
#' predicted and observed outcome values), "roc" (ROC curve), "pr"
#' (precision-recall curve), and "acc" (accuracy curve).
#' @importFrom yardstick pr_auc_vec
#' @importFrom ROCR prediction performance
#' @export
assess_predictions <- function(yhat, y, yhat_label = "Predictions") {
  # Cross-validated AUC with CI as a display string (no folds supplied here)
  auc_txt <- pretty_cvauc_ci(yhat, y)
  # Area under the precision-recall curve, treating level 1 as the event
  aupr <- yardstick::pr_auc_vec(factor(y, levels = c(1, 0)), yhat,
                                event_level = "first")

  # One ROCR prediction object feeds all three performance curves
  pred_obj <- ROCR::prediction(yhat, y)
  perf_roc <- ROCR::performance(pred_obj, measure = "tpr", x.measure = "fpr")
  perf_pr  <- ROCR::performance(pred_obj, measure = "prec", x.measure = "rec")
  perf_acc <- ROCR::performance(pred_obj, measure = "acc")

  # "method" combines the user label with the AUC summary for plot legends
  roc_df <- data.frame(
    fpr = perf_roc@x.values[[1]],
    tpr = perf_roc@y.values[[1]],
    auc.details = auc_txt,
    method = paste(yhat_label, auc_txt, sep = "\n")
  )
  pr_df <- data.frame(
    rec = perf_pr@x.values[[1]],
    prec = perf_pr@y.values[[1]],
    auc.details = aupr,
    method = paste(yhat_label, paste("AUCPR =", round(aupr, 3)), sep = "\n")
  )
  acc_df <- data.frame(
    cutoff = perf_acc@x.values[[1]],
    acc = perf_acc@y.values[[1]]
  )

  return(list(res = data.frame(y = y, yhat = yhat),
              roc = roc_df, pr = pr_df, acc = acc_df))
}
123 changes: 123 additions & 0 deletions R/feat_scale_norm.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
#' Calculate the magnitude of a vector
#'
#' @param x a numeric vector
#' @param f a character string representing one of the following functions:
#' * \code{L1}: Manhattan aka 1-norm or L1-norm (default)
#' * \code{L2}: Euclidean aka 2-norm or L2-norm
#' * \code{L3}: Minkowski aka 3-norm or L3-norm
#' * \code{L4}: Minkowski aka 4-norm or L4-norm
#' * \code{L5}: Minkowski aka 5-norm or L5-norm
#' * \code{L6}: Minkowski aka 6-norm or L6-norm
#' * \code{max}: Chebyshev aka infinity norm
#' @return a numeric value
vectorNorm <- function(x, f = c("L1", "L2", "L3", "L4", "L5", "L6", "max")) {
  # match.arg() already returns a length-1 character, so no coercion is
  # needed before dispatching on it. abs() is omitted for even powers,
  # where x^p is non-negative regardless of sign.
  f <- match.arg(f)
  switch(f,
    L1 = sum(abs(x)),
    L2 = sqrt(sum(x^2)),
    L3 = sum(abs(x)^3)^(1/3),
    L4 = sum(x^4)^(1/4),
    L5 = sum(abs(x)^5)^(1/5),
    L6 = sum(x^6)^(1/6),
    max = max(abs(x))
    # TODO: add Mahalanobis distance? might not work
  )
}

#' Calculate the magnitude of row-vectors
#'
#' @param x A \code{data.frame} or \code{matrix}
#' @param f a character string representing one of the following functions:
#' * \code{L1}: Manhattan aka 1-norm or L1-norm (default)
#' * \code{L2}: Euclidean aka 2-norm or L2-norm
#' * \code{L3}: Minkowski aka 3-norm or L3-norm
#' * \code{L4}: Minkowski aka 4-norm or L4-norm
#' * \code{L5}: Minkowski aka 5-norm or L5-norm
#' * \code{L6}: Minkowski aka 6-norm or L6-norm
#' * \code{max}: Chebyshev aka infinity norm
#' @return a numeric vector with one norm per row of \code{x}
rowNorms <- function(x, f = c("L1", "L2", "L3", "L4", "L5", "L6", "max")) {
  # match.arg() already returns a length-1 character; dispatch on it directly.
  # Row-wise mirror of vectorNorm(): abs() is omitted for even powers.
  f <- match.arg(f)
  switch(f,
    L1 = rowSums(abs(x)),
    L2 = sqrt(rowSums(x^2)),
    L3 = rowSums(abs(x)^3)^(1/3),
    L4 = rowSums(x^4)^(1/4),
    L5 = rowSums(abs(x)^5)^(1/5),
    L6 = rowSums(x^6)^(1/6),
    max = apply(abs(x), 1, max)
    # TODO: add Mahalanobis distance? might not work
  )
  # canberra aka weighted L_1 distance: not useful here b/c centroid is 0 for all features
  # One-minus-Pearson-correlation: also not useful here b/c centroid is 0 for all features
}

#' Normalize a dataset to a single observation
#'
#' Normalize a dataset \code{X} to a single observation \code{obs} by
#' subtracting the observation's values and dividing by the L1 norm.
#'
#' @param obs A \code{data.frame} containing a single record/row (typically a
#' validation/test set observation) to which \code{X} will be normalized.
#' @param X A \code{data.frame} (typically containing the training set) to be
#' normalized.
#' @param keepX Character vector containing names of retained features.
#' @return A \code{data.frame}
#' @importFrom dplyr bind_rows select one_of mutate_all slice n
#' @importFrom magrittr `%>%`
normalizeFeatures <- function(obs, X, keepX = colnames(X)) {
  # Prepend obs so its values sit in row 1, then restrict to keepX columns
  combined <- bind_rows(obs, X) %>%
    select(one_of(keepX))

  # Center each column at the obs value (row 1), then scale by its L1 norm
  # (vectorNorm default). sum(abs(.)) via vectorNorm is faster than
  # norm(as.matrix(.), "1"). A completely homogeneous column has a zero
  # norm here, so this step produces all NaNs for it.
  rescaled <- combined %>%
    mutate_all(list(~ . - .[1])) %>%
    mutate_all(list(~ ./vectorNorm(.)))

  # Drop the prepended obs row and return a plain data.frame (tibbles are trouble)
  rescaled %>%
    slice(2:n()) %>%
    as.data.frame()
}

#' Estimate the record-level distance from \code{validX} to \code{trainX}
#'
#' @param normF a character string representing one of the following functions:
#' * \code{L1}: Manhattan aka 1-norm or L1-norm (default)
#' * \code{L2}: Euclidean aka 2-norm or L2-norm
#' * \code{L3}: Minkowski aka 3-norm or L3-norm
#' * \code{L4}: Minkowski aka 4-norm or L4-norm
#' * \code{L5}: Minkowski aka 5-norm or L5-norm
#' * \code{L6}: Minkowski aka 6-norm or L6-norm
#' * \code{max}: Chebyshev aka infinity norm
#' @param validX A \code{matrix}, \code{data.frame}, or numeric vector
#' containing one or more observations to which \code{trainX} should be
#' normalized (typically a validation/test set).
#' @param trainX A \code{data.frame} (typically containing the training set) to
#' be normalized.
#' @param keepX Character vector containing names of retained features.
#' @param verbose Currently unused. A boolean indicating whether diagnostic
#' messages should be printed. Defaults to \code{FALSE}.
#' @return A \code{matrix} with one row per validX observation and one column
#' per trainX observation.
distByRows <- function(normF = c("L1", "L2", "L3", "L4", "L5", "L6", "max"),
                       validX, trainX, keepX, verbose = FALSE) {
  normF <- match.arg(normF)

  # Scalar condition: use short-circuit || (elementwise | errors on
  # length > 1 conditions in recent R and is wrong for scalar tests).
  if (is.vector(validX) || is.matrix(validX)) {
    # Promote a bare vector to a one-row data.frame, then align column
    # names with trainX so normalizeFeatures can match features.
    if (is.vector(validX)) {
      validX <- as.data.frame(t(validX))
    }
    colnames(validX) <- colnames(trainX)
  }

  # For every validation set observation, re-center/re-scale trainX to it.
  # NOTE(review): apply() coerces a data.frame validX to a matrix row-wise,
  # so mixed-type validX columns would be coerced — assumes all-numeric input.
  newTrainX <- apply(validX, 1, normalizeFeatures, trainX, keepX)
  # now we have a list with length = number of rows in validX;
  # each element is a new trainX data.frame tailored to that observation

  # vapply is type-stable (sapply's simplification can silently change
  # shape on empty input); each call yields one norm per trainX row.
  res <- vapply(newTrainX, rowNorms, numeric(nrow(trainX)), normF)
  # TODO: parallelize
  return(t(res))
}
Loading

0 comments on commit 7ae8e36

Please sign in to comment.