Skip to content

Commit

Permalink
Initial R package release (#1)
Browse files Browse the repository at this point in the history
* basic file structure; R and macos gitignore; code and docs for feat_scale_norm.R

* basic rmarkdown readme; r cmd check github action

* assessments.R

* build out example in readme

* simplify readme table construction; add comments

* kernel_weights.R

* sl_screeners.R; import methods

* sl_wrappers.R

* learners.R; predict.R; updates

* export function

* to save on memory, don't retain sl lib fits

* readme with fully worked example

* tweak readme details

* update r cmd check gha to only stop for errors

* regen readme with taller plots

* try to import a specific version of a pkg archived from cran

* try installing from cran github instead

* try installing lme4 dependencies manually

* forgot to sudo

* install lme4 via apt

* try to force a newer version of lme4
  • Loading branch information
saraemoore committed Nov 7, 2023
1 parent f8358fe commit 7ae8e36
Show file tree
Hide file tree
Showing 49 changed files with 3,213 additions and 9 deletions.
2 changes: 2 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
^\.github$
^README.Rmd
1 change: 1 addition & 0 deletions .github/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.html
47 changes: 47 additions & 0 deletions .github/workflows/R-CMD-check.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
on:
push:
branches: [main]
pull_request:
branches: [main]

name: R-CMD-check

jobs:
R-CMD-check:
runs-on: ${{ matrix.config.os }}

name: ${{ matrix.config.os }} (${{ matrix.config.r }})

strategy:
fail-fast: false
matrix:
config:
- {os: windows-latest, r: 'release'}
- {os: ubuntu-latest, r: 'release'}

env:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
R_KEEP_PKG_SOURCE: yes

steps:
- uses: actions/checkout@v3

- uses: r-lib/actions/setup-pandoc@v2

- uses: r-lib/actions/setup-r@v2
with:
r-version: ${{ matrix.config.r }}
http-user-agent: ${{ matrix.config.http-user-agent }}
use-public-rspm: true

- uses: r-lib/actions/setup-r-dependencies@v2
with:
extra-packages: any::rcmdcheck
needs: check
upgrade: 'TRUE'

- uses: r-lib/actions/check-r-package@v2
with:
upload-snapshots: true
error-on: '"error"'
35 changes: 28 additions & 7 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
### R ###

# History files
.Rhistory
.Rapp.history

# Session Data files
.RData
.RDataTmp

# User-specific files
.Ruserdata
Expand Down Expand Up @@ -39,11 +40,31 @@ vignettes/*.pdf
# R Environment Variables
.Renviron

# pkgdown site
docs/
### macOS ###

# General
.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two \r
Icon

# Thumbnails
._*

# translation temp files
po/*~
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# RStudio Connect folder
rsconnect/
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
38 changes: 38 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
Package: yall
Title: Yet Another Local Learner (YALL)
Version: 0.1.0
Date: 2023-11-06
Authors@R:
person(given = "Sara",
family = "Moore",
role = c("aut", "cre"),
email = "[email protected]",
comment = c(ORCID = "0000-0001-7996-8003"))
Description: A localized machine learning algorithm inspired by and built on Cross-Validated SuperLearner.
Imports:
methods,
dplyr,
magrittr,
purrr,
tidyr,
tibble,
broom,
kedd,
scales,
SuperLearner,
SuperSelector,
SLScreenExtra,
origami,
cvAUC,
ROCR,
yardstick,
future.apply,
strip
Remotes:
saraemoore/SuperSelector,
saraemoore/SLScreenExtra,
cran/kedd
License: GPL-3
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
60 changes: 60 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Generated by roxygen2: do not edit by hand

export(GridLearner)
export(SL.glm.mean)
export(assess_predictions)
export(distKernelWeights)
export(predictAllYALL)
export(screen.corRank2.wgtd)
export(screen.corRank3.wgtd)
export(screen.corRank4.wgtd)
export(screen.corRank5.wgtd)
export(screen.corRank6.wgtd)
export(summarizeYALL)
export(trainYALL)
export(trainYALLfold)
import(SLScreenExtra)
import(SuperLearner)
importFrom(ROCR,performance)
importFrom(ROCR,prediction)
importFrom(SLScreenExtra,screen.wgtd.corRank)
importFrom(SuperSelector,cvSLFeatureSelector)
importFrom(SuperSelector,factor_to_indicator)
importFrom(SuperSelector,groupBySelectionSet)
importFrom(SuperSelector,makeLibArgNumeric)
importFrom(broom,tidy)
importFrom(cvAUC,ci.cvAUC)
importFrom(dplyr,arrange)
importFrom(dplyr,bind_cols)
importFrom(dplyr,bind_rows)
importFrom(dplyr,distinct)
importFrom(dplyr,do)
importFrom(dplyr,full_join)
importFrom(dplyr,group_by)
importFrom(dplyr,mutate)
importFrom(dplyr,mutate_all)
importFrom(dplyr,n)
importFrom(dplyr,one_of)
importFrom(dplyr,select)
importFrom(dplyr,slice)
importFrom(dplyr,summarize)
importFrom(dplyr,transmute)
importFrom(dplyr,ungroup)
importFrom(future.apply,future_lapply)
importFrom(kedd,kernel.fun)
importFrom(magrittr,`%>%`)
importFrom(methods,as)
importFrom(origami,training)
importFrom(origami,validation)
importFrom(purrr,map)
importFrom(purrr,map2)
importFrom(scales,percent)
importFrom(stats,gaussian)
importFrom(stats,glm)
importFrom(stats,predict)
importFrom(stats,setNames)
importFrom(strip,strip)
importFrom(tibble,as_tibble)
importFrom(tibble,tibble)
importFrom(tidyr,unnest)
importFrom(yardstick,pr_auc_vec)
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# yall 0.1.0

Initial version.
49 changes: 49 additions & 0 deletions R/assessments.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#' Cross-validated AUC estimate with confidence interval as a formatted string
#'
#' @param yhat A vector of predicted outcome values
#' @param y A vector of observed outcome values
#' @param folds Vector of fold IDs. If \code{NULL}, the standard error will be
#' estimated as though there were a single fold (no cross-validation).
#' @param confidence Confidence level; defaults to \code{0.95}
#' @return A character string of the form "AUC = point (lower, upper)"
#' @importFrom cvAUC ci.cvAUC
pretty_cvauc_ci <- function(yhat, y, folds = NULL, confidence = 0.95) {
  est <- cvAUC::ci.cvAUC(
    predictions = yhat,
    labels = y,
    folds = folds,
    confidence = confidence
  )
  # Render the point estimate and CI bounds with 3 decimal places each
  sprintf("AUC = %.3f (%.3f, %.3f)", est$cvAUC, est$ci[1], est$ci[2])
}

#' Prepare data.frames for multiple model assessment visualizations
#'
#' @param yhat A vector of predicted outcome values
#' @param y A vector of observed outcome values
#' @param yhat_label Optional label to describe the model output. Useful when
#' multiple models will be compared in a single visualization. Defaults to
#' "Predictions".
#' @return A named list containing 4 \code{data.frame}s: "res" (containing
#' predicted and observed outcome values), "roc" (ROC curve), "pr"
#' (precision-recall curve), and "acc" (accuracy curve).
#' @importFrom yardstick pr_auc_vec
#' @importFrom ROCR prediction performance
#' @export
assess_predictions <- function(yhat, y, yhat_label = "Predictions") {
  # Cross-validated AUC with CI as a display string (no folds supplied here)
  auc_txt <- pretty_cvauc_ci(yhat, y)
  # Area under the precision-recall curve, treating level 1 as the event
  aupr <- yardstick::pr_auc_vec(factor(y, levels = c(1, 0)), yhat,
                                event_level = "first")

  # One ROCR prediction object feeds all three performance curves
  pred_obj <- ROCR::prediction(yhat, y)
  perf_roc <- ROCR::performance(pred_obj, measure = "tpr", x.measure = "fpr")
  perf_pr  <- ROCR::performance(pred_obj, measure = "prec", x.measure = "rec")
  perf_acc <- ROCR::performance(pred_obj, measure = "acc")

  # "method" combines the user label with the AUC summary for plot legends
  roc_df <- data.frame(
    fpr = perf_roc@x.values[[1]],
    tpr = perf_roc@y.values[[1]],
    auc.details = auc_txt,
    method = paste(yhat_label, auc_txt, sep = "\n")
  )
  pr_df <- data.frame(
    rec = perf_pr@x.values[[1]],
    prec = perf_pr@y.values[[1]],
    auc.details = aupr,
    method = paste(yhat_label, paste("AUCPR =", round(aupr, 3)), sep = "\n")
  )
  acc_df <- data.frame(
    cutoff = perf_acc@x.values[[1]],
    acc = perf_acc@y.values[[1]]
  )

  return(list(res = data.frame(y = y, yhat = yhat),
              roc = roc_df, pr = pr_df, acc = acc_df))
}
123 changes: 123 additions & 0 deletions R/feat_scale_norm.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
#' Calculate the magnitude of a vector
#'
#' @param x a numeric vector
#' @param f a character string representing one of the following functions:
#' * \code{L1}: Manhattan aka 1-norm or L1-norm (default)
#' * \code{L2}: Euclidean aka 2-norm or L2-norm
#' * \code{L3}: Minkowski aka 3-norm or L3-norm
#' * \code{L4}: Minkowski aka 4-norm or L4-norm
#' * \code{L5}: Minkowski aka 5-norm or L5-norm
#' * \code{L6}: Minkowski aka 6-norm or L6-norm
#' * \code{max}: Chebyshev aka infinity norm
#' @return a numeric value
vectorNorm <- function(x, f = c("L1", "L2", "L3", "L4", "L5", "L6", "max")) {
  # match.arg() already returns a length-1 character, so no coercion is
  # needed before dispatching on it. abs() is omitted for even powers,
  # where x^p is non-negative regardless of sign.
  f <- match.arg(f)
  switch(f,
    L1 = sum(abs(x)),
    L2 = sqrt(sum(x^2)),
    L3 = sum(abs(x)^3)^(1/3),
    L4 = sum(x^4)^(1/4),
    L5 = sum(abs(x)^5)^(1/5),
    L6 = sum(x^6)^(1/6),
    max = max(abs(x))
    # TODO: add Mahalanobis distance? might not work
  )
}

#' Calculate the magnitude of row-vectors
#'
#' @param x A \code{data.frame} or \code{matrix}
#' @param f a character string representing one of the following functions:
#' * \code{L1}: Manhattan aka 1-norm or L1-norm (default)
#' * \code{L2}: Euclidean aka 2-norm or L2-norm
#' * \code{L3}: Minkowski aka 3-norm or L3-norm
#' * \code{L4}: Minkowski aka 4-norm or L4-norm
#' * \code{L5}: Minkowski aka 5-norm or L5-norm
#' * \code{L6}: Minkowski aka 6-norm or L6-norm
#' * \code{max}: Chebyshev aka infinity norm
#' @return a numeric vector with one norm per row of \code{x}
rowNorms <- function(x, f = c("L1", "L2", "L3", "L4", "L5", "L6", "max")) {
  # match.arg() already returns a length-1 character; dispatch on it directly.
  # Row-wise mirror of vectorNorm(): abs() is omitted for even powers.
  f <- match.arg(f)
  switch(f,
    L1 = rowSums(abs(x)),
    L2 = sqrt(rowSums(x^2)),
    L3 = rowSums(abs(x)^3)^(1/3),
    L4 = rowSums(x^4)^(1/4),
    L5 = rowSums(abs(x)^5)^(1/5),
    L6 = rowSums(x^6)^(1/6),
    max = apply(abs(x), 1, max)
    # TODO: add Mahalanobis distance? might not work
  )
  # canberra aka weighted L_1 distance: not useful here b/c centroid is 0 for all features
  # One-minus-Pearson-correlation: also not useful here b/c centroid is 0 for all features
}

#' Normalize a dataset to a single observation
#'
#' Normalize a dataset \code{X} to a single observation \code{obs} by
#' subtracting the observation's values and dividing by the L1 norm.
#'
#' @param obs A \code{data.frame} containing a single record/row (typically a
#' validation/test set observation) to which \code{X} will be normalized.
#' @param X A \code{data.frame} (typically containing the training set) to be
#' normalized.
#' @param keepX Character vector containing names of retained features.
#' @return A \code{data.frame}
#' @importFrom dplyr bind_rows select one_of mutate_all slice n
#' @importFrom magrittr `%>%`
normalizeFeatures <- function(obs, X, keepX = colnames(X)) {
  # Prepend obs so its values sit in row 1, then restrict to keepX columns
  combined <- bind_rows(obs, X) %>%
    select(one_of(keepX))

  # Center each column at the obs value (row 1), then scale by its L1 norm
  # (vectorNorm default). sum(abs(.)) via vectorNorm is faster than
  # norm(as.matrix(.), "1"). A completely homogeneous column has a zero
  # norm here, so this step produces all NaNs for it.
  rescaled <- combined %>%
    mutate_all(list(~ . - .[1])) %>%
    mutate_all(list(~ ./vectorNorm(.)))

  # Drop the prepended obs row and return a plain data.frame (tibbles are trouble)
  rescaled %>%
    slice(2:n()) %>%
    as.data.frame()
}

#' Estimate the record-level distance from \code{validX} to \code{trainX}
#'
#' @param normF a character string representing one of the following functions:
#' * \code{L1}: Manhattan aka 1-norm or L1-norm (default)
#' * \code{L2}: Euclidean aka 2-norm or L2-norm
#' * \code{L3}: Minkowski aka 3-norm or L3-norm
#' * \code{L4}: Minkowski aka 4-norm or L4-norm
#' * \code{L5}: Minkowski aka 5-norm or L5-norm
#' * \code{L6}: Minkowski aka 6-norm or L6-norm
#' * \code{max}: Chebyshev aka infinity norm
#' @param validX A \code{matrix}, \code{data.frame}, or numeric vector
#' containing one or more observations to which \code{trainX} should be
#' normalized (typically a validation/test set).
#' @param trainX A \code{data.frame} (typically containing the training set) to
#' be normalized.
#' @param keepX Character vector containing names of retained features.
#' @param verbose Currently unused. A boolean indicating whether diagnostic
#' messages should be printed. Defaults to \code{FALSE}.
#' @return A \code{matrix} with one row per validX observation and one column
#' per trainX observation.
distByRows <- function(normF = c("L1", "L2", "L3", "L4", "L5", "L6", "max"),
                       validX, trainX, keepX, verbose = FALSE) {
  normF <- match.arg(normF)

  # Scalar condition: use short-circuit || (elementwise | errors on
  # length > 1 conditions in recent R and is wrong for scalar tests).
  if (is.vector(validX) || is.matrix(validX)) {
    # Promote a bare vector to a one-row data.frame, then align column
    # names with trainX so normalizeFeatures can match features.
    if (is.vector(validX)) {
      validX <- as.data.frame(t(validX))
    }
    colnames(validX) <- colnames(trainX)
  }

  # For every validation set observation, re-center/re-scale trainX to it.
  # NOTE(review): apply() coerces a data.frame validX to a matrix row-wise,
  # so mixed-type validX columns would be coerced — assumes all-numeric input.
  newTrainX <- apply(validX, 1, normalizeFeatures, trainX, keepX)
  # now we have a list with length = number of rows in validX;
  # each element is a new trainX data.frame tailored to that observation

  # vapply is type-stable (sapply's simplification can silently change
  # shape on empty input); each call yields one norm per trainX row.
  res <- vapply(newTrainX, rowNorms, numeric(nrow(trainX)), normF)
  # TODO: parallelize
  return(t(res))
}
Loading

0 comments on commit 7ae8e36

Please sign in to comment.