Merge pull request #599 from SebKrantz/master

Update
SebKrantz · Jun 10, 2024 · c97e35a · c97e35a
2 parents 9029692 + 0e4454b
commit c97e35a
Show file tree

Hide file tree

Showing 13 changed files with 933 additions and 89 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -29,5 +29,8 @@ _snaps
 ^\.DS_Store$
 ^revdep$
 \.orig$
+vignettes/figure
+vignettes/cache
+
 
 
diff --git a/NEWS.md b/NEWS.md
@@ -2,26 +2,49 @@
 
 * `pivot()` has new arguments `FUN = "last"` and `FUN.args = NULL`, allowing wide and recast pivots with aggregation (default last value as before). `FUN` currently supports a single function returning a scalar value. *Fast Statistical Functions* receive vectorized execution. `FUN.args` can be used to supply a list of function arguments, including data-length arguments such as weights. There are also a couple of internal functions callable using function strings: `"first"`, `"last"`, `"count"`, `"sum"`, `"mean"`, `"min"`, or `"max"`. These are built into the reshaping C-code and thus extremely fast. Thanks @AdrianAntico for the request (#582).
 
+* `join()` now provides enhanced verbosity, indicating the average order of the join between the two tables, e.g.
+ ``` r
+ join(data.frame(id = c(1, 2, 2, 4)), data.frame(id = c(rep(1,4), 2:3)))
+ #> left join: x[id] 3/4 (75%) <1.5:1st> y[id] 2/6 (33.3%)
+ #> id
+ #> 1 1
+ #> 2 2
+ #> 3 2
+ #> 4 4
+ join(data.frame(id = c(1, 2, 2, 4)), data.frame(id = c(rep(1,4), 2:3)), multiple = TRUE)
+ #> left join: x[id] 3/4 (75%) <1.5:2.5> y[id] 5/6 (83.3%)
+ #> id
+ #> 1 1
+ #> 2 1
+ #> 3 1
+ #> 4 1
+ #> 5 2
+ #> 6 2
+ #> 7 4
+ ```
+
+* In `collap()`, with multiple functions passed to `FUN` or `catFUN` and `return = "long"`, the `"Function"` column is now generated as a factor variable instead of character (which is more efficient). 
+
 # collapse 2.0.14
 
 * Updated '*collapse* and *sf*' vignette to reflect the recent support for *units* objects, and added a few more examples.
 
 * Fixed a bug in `join()` where a full join silently became a left join if there are no matches between the tables (#574). Thanks @D3SL for reporting. 
 
 * Added function `group_by_vars()`: A standard evaluation version of `fgroup_by()` that is slimmer and safer for programming, e.g. `data |> group_by_vars(ind1) |> collapg(custom = list(fmean = ind2, fsum = ind3))`. Or, using *magrittr*: 
-```r 
-library(magrittr)
-set_collapse(mask = "manip") # for fgroup_vars -> group_vars
-
-data %>% 
- group_by_vars(ind1) %>% {
- add_vars(
- group_vars(., "unique"),
- get_vars(., ind2) %>% fmean(keep.g = FALSE) %>% add_stub("mean_"),
- get_vars(., ind3) %>% fsum(keep.g = FALSE) %>% add_stub("sum_")
- ) 
-}
-```
+ ```r 
+ library(magrittr)
+ set_collapse(mask = "manip") # for fgroup_vars -> group_vars
+ 
+ data %>% 
+  group_by_vars(ind1) %>% {
+  add_vars(
+  group_vars(., "unique"),
+  get_vars(., ind2) %>% fmean(keep.g = FALSE) %>% add_stub("mean_"),
+  get_vars(., ind3) %>% fsum(keep.g = FALSE) %>% add_stub("sum_")
+  ) 
+ }
+ ```
 
 * Added function `as_integer_factor()` to turn factors/factor columns into integer vectors. `as_numeric_factor()` already exists, but is memory inefficient for most factors where levels can be integers. 
 

diff --git a/R/collap.R b/R/collap.R
@@ -79,6 +79,14 @@ applyfuns_internal <- function(data, by, FUN, fFUN, parallel, cores, ...) {
  return(list(BY.data.frame(data, by, FUN, ..., use.g.names = FALSE, reorder = FALSE, return = "data.frame"))) # return(list(lapply(data, copysplaplfun, by, FUN, ...)))
 }
 
+rbindlist_factor <- function(l, idcol = "Function") {
+ nam <- names(l)
+ names(l) <- NULL
+ res <- .Call(C_rbindlist, l, TRUE, TRUE, idcol)
+ attr(res[[1L]], "levels") <- if (length(nam)) nam else as.character(seq_along(l))
+ oldClass(res[[1L]]) <- "factor"
+ res
+}
 
 
 # NOTE: CUSTOM SEPARATOR doesn't work because of unlist() !
@@ -284,8 +292,7 @@ collap <- function(X, by, FUN = fmean, catFUN = fmode, cols = NULL, w = NULL, wF
  condalcSA(c(res[[1L]], e), ax, DTl) }))
  } else {
  if(return != 4L) {
- res <- if(!keep.by) .Call(C_rbindlist, res, TRUE, TRUE, "Function") else # data.table:::Crbindlist
- .Call(C_rbindlist, lapply(res[-1L], function(e) c(res[[1L]], e)), TRUE, TRUE, "Function")
+ if(keep.by) res <- lapply(res[-1L], function(e) c(res[[1L]], e))
  } else {
  if(!ncustoml || !(nul && nnul)) stop("long_dupl is only meaningful for aggregations with both numeric and categorical data, and multiple functions used for only one of the two data types!")
  mFUN <- length(FUN) > 1L
@@ -295,8 +302,8 @@ collap <- function(X, by, FUN = fmean, catFUN = fmode, cols = NULL, w = NULL, wF
  lapply(res[-nid], function(e) c(res[[nid]], e))
  } else res <- if(mFUN) lapply(res[-c(nid, 1L)], function(e) c(res[[1L]], e, res[[nid]])) else
  lapply(res[-c(nid, 1L)], function(e) c(res[[1L]], res[[nid]], e))
- res <- .Call(C_rbindlist, res, FALSE, FALSE, "Function")
  }
+ res <- rbindlist_factor(res)
  if(keep.col.order) o <- if(ncustoml) forder.int(c(0L, if(!keep.by) NULL else if(!bycalll) rep(0L,length(numby)) else numby, nu, nnu)) else c(1L, o + 1L)
  }
  # } else message("return options other than 'wide' are only meaningful if multiple functions are used!")
@@ -475,8 +482,7 @@ collapv <- function(X, by, FUN = fmean, catFUN = fmode, cols = NULL, w = NULL, w
  condalcSA(c(res[[1L]], e), ax, DTl) }))
  } else {
  if(return != 4L) {
- res <- if(!keep.by) .Call(C_rbindlist, res, TRUE, TRUE, "Function") else # data.table:::Crbindlist
- .Call(C_rbindlist, lapply(res[-1L], function(e) c(res[[1L]], e)), TRUE, TRUE, "Function")
+ if(keep.by) res <- lapply(res[-1L], function(e) c(res[[1L]], e))
  } else {
  if(!ncustoml || !(nul && nnul)) stop("long_dupl is only meaningful for aggregations with both numeric and categorical data, and multiple functions used for only one of the two data types!")
  mFUN <- length(FUN) > 1L
@@ -486,8 +492,8 @@ collapv <- function(X, by, FUN = fmean, catFUN = fmode, cols = NULL, w = NULL, w
  lapply(res[-nid], function(e) c(res[[nid]], e))
  } else res <- if(mFUN) lapply(res[-c(nid, 1L)], function(e) c(res[[1L]], e, res[[nid]])) else
  lapply(res[-c(nid, 1L)], function(e) c(res[[1L]], res[[nid]], e))
- res <- .Call(C_rbindlist, res, FALSE, FALSE, "Function")
  }
+ res <- rbindlist_factor(res)
  if(keep.col.order) o <- if(ncustoml) forder.int(c(0L, if(!keep.by) NULL else numby, nu, nnu)) else c(1L, o + 1L)
  }
  # } else message("return options other than 'wide' are only meaningful if multiple functions are used!")

diff --git a/R/join.R b/R/join.R
@@ -103,9 +103,11 @@ join <- function(x, y,
 
  if(multiple) {
  g <- group(if(rjoin) x[ixon] else y[iyon], group.sizes = TRUE)
+ if(verbose) mi <- m
  m <- multi_match(m, g)
  if(is.list(m)) {
  multiple <- 2L
+ # TODO: Optimize if drop.dup.cols
  if(rjoin) y <- .Call(C_subsetDT, y, m[[1L]], seq_along(y), FALSE)
  else x <- .Call(C_subsetDT, x, m[[1L]], seq_along(x), FALSE)
  m <- m[[2L]]
@@ -114,7 +116,8 @@ join <- function(x, y,
  }
 
  if(verbose) {
- nx <- length(m) - attr(m, "N.nomatch")
+ Nx <- length(if(multiple) mi else m)
+ nx <- Nx - attr(m, "N.nomatch")
  ny <- attr(m, "N.distinct")
  Ny <- attr(m, "N.groups")
  if(verbose == 2L) {
@@ -124,16 +127,29 @@ join <- function(x, y,
  cin_x <- xon
  cin_y <- on
  }
- xstat <- paste0(nx, "/", length(m), " (", signif(nx/length(m)*100, 3), "%)")
+ xstat <- paste0(nx, "/", Nx, " (", signif(nx/Nx*100, 3), "%)")
  ystat <- paste0(ny, "/", Ny, " (", signif(ny/Ny*100, 3), "%)")
  if(rjoin) {
  tmp <- ystat
  ystat <- xstat
  xstat <- tmp
  }
+ if(multiple) {
+ validate <- switch(validate,
+ "1:1" = "1:1",
+ "1:m" = paste0("1:", round((if(rjoin) nx else ny) / attr(mi, "N.distinct"), 2)),
+ "m:1" = paste0(round((if(rjoin) ny else nx) / attr(mi, "N.distinct"), 2), ":1"),
+ "m:m" = paste(round(c(nx, ny)[if(rjoin) 2:1 else 1:2] / attr(mi, "N.distinct"), 2), collapse = ":"))
+ } else {
+ validate <- switch(validate,
+ "1:1" = "1:1",
+ "1:m" = paste0("1:", if(rjoin) round(nx / ny, 2) else "1st"),
+ "m:1" = paste0(if(rjoin) "1st" else round(nx / ny, 2), ":1"),
+ "m:m" = paste(c(round(nx / ny, 2), "1st")[if(rjoin) 2:1 else 1:2], collapse = ":"))
+ }
  cat(how, " join: ",
  x_name, "[", paste(cin_x, collapse = ", "), "] ",
- xstat, " <", validate ,"> ",
+ xstat, " <", validate , "> ",
  y_name, "[", paste(cin_y, collapse = ", "), "] ",
  ystat, "\n", sep = "")
  }
@@ -269,7 +285,7 @@ join <- function(x, y,
  } else matched <- "matched"
  # TODO: better?
  # matched <- paste0(y_name, "_", y_name)
- mc <- switch(how,
+ mc <- switch(how, left_setrn =,
  left = structure(is.na(m) + 1L, levels = c(matched, x_name), class = c("factor", "na.included")),
  right = structure(is.na(m) + 1L, levels = c(matched, y_name), class = c("factor", "na.included")),
  full = structure(vec(list(is.na(m) + 1L, alloc(3L, fnrow(res)-length(m)))), levels = c(matched, x_name, y_name), class = c("factor", "na.included")),

diff --git a/man/collapse-renamed.Rd b/man/collapse-renamed.Rd
@@ -47,7 +47,7 @@ Renamed Functions
 \description{
 These functions were renamed (mostly during v1.6.0 update) to make the namespace more consistent. Except for the S3 generics of \code{fNobs}, \code{fNdistinct}, \code{fHDbetween} and \code{fHDwithin}, and functions \code{replace_NA} and \code{replace_Inf}, I intend to remove all of these functions by end of 2023. %The S3 generics and the other functions will be depreciated in 2023 for the earliest. These all now give a message reminding you not to use them in fresh code.
 }
-\section{Renaming}{\preformatted{
+\section{Renaming}{\if{html}{\out{<div class="sourceCode r">}}\preformatted{
 fNobs -> fnobs
 fNdistinct -> fndistinct
 pwNobs -> pwnobs
@@ -66,5 +66,5 @@ Date_vars -> date_vars
 `Date_vars<-` -> `date_vars<-`
 replace_NA -> replace_na
 replace_Inf -> replace_inf
-}
+}\if{html}{\out{</div>}}
 }
diff --git a/man/fast-statistical-functions.Rd b/man/fast-statistical-functions.Rd
@@ -7,7 +7,7 @@
 \description{
 With \code{\link{fsum}}, \code{\link{fprod}}, \code{\link{fmean}}, \code{\link{fmedian}}, \code{\link{fmode}}, \code{\link{fvar}}, \code{\link{fsd}}, \code{\link{fmin}}, \code{\link{fmax}}, \code{\link{fnth}}, \code{\link{ffirst}}, \code{\link{flast}}, \code{\link{fnobs}} and \code{\link{fndistinct}}, \emph{collapse} presents a coherent set of extremely fast and flexible statistical functions (S3 generics) to perform column-wise, grouped and weighted computations on vectors, matrices and data frames, with special support for grouped data frames / tibbles (\emph{dplyr}) and \emph{data.table}'s.
 }
-\section{Usage}{\preformatted{
+\section{Usage}{\if{html}{\out{<div class="sourceCode r">}}\preformatted{
 ## All functions (FUN) follow a common syntax in 4 methods:
 FUN(x, ...)
 
@@ -27,7 +27,7 @@ FUN(x, g = NULL, [w = NULL,] TRA = NULL, [na.rm = TRUE,]
 FUN(x, [w = NULL,] TRA = NULL, [na.rm = TRUE,]
  use.g.names = FALSE, keep.group_vars = TRUE,
  [keep.w = TRUE,] [stub = TRUE,] [nthreads = 1L,] ...)
-}
+}\if{html}{\out{</div>}}
 }
 \section{Arguments}{
 \tabular{lll}{
@@ -75,7 +75,7 @@ Please see the documentation of individual functions.
 \seealso{
 \link[=collapse-documentation]{Collapse Overview}, \link[=data-transformations]{Data Transformations}, \link[=time-series-panel-series]{Time Series and Panel Series}
 }
-\section{Examples}{\preformatted{
+\section{Examples}{\if{html}{\out{<div class="sourceCode r">}}\preformatted{
 ## default vector method
 mpg <- mtcars$mpg
 fsum(mpg) # Simple sum
@@ -106,18 +106,17 @@ fmode(wlddev, wlddev$income) # Grouped statistical modes ..
 m <- qM(mtcars)
 fsum(m)
 fsum(m, g) # ..
-\donttest{ % The tidyverse regularly causes havoc to CRAN tests in other packages, therefore this is not tested
+
 ## method for grouped data frames - created with dplyr::group_by or fgroup_by
 library(dplyr)
 mtcars |> group_by(cyl,vs,am) |> select(mpg,carb) |> fsum()
 mtcars |> fgroup_by(cyl,vs,am) |> fselect(mpg,carb) |> fsum() # equivalent and faster !!
 mtcars |> fgroup_by(cyl,vs,am) |> fsum(TRA = "\%")
 mtcars |> fgroup_by(cyl,vs,am) |> fmean(hp) # weighted grouped mean, save sum of weights
 mtcars |> fgroup_by(cyl,vs,am) |> fmean(hp, keep.group_vars = FALSE)
+}\if{html}{\out{</div>}}
 }
-}
-}
-\section{Benchmark}{\preformatted{
+\section{Benchmark}{\if{html}{\out{<div class="sourceCode r">}}\preformatted{
 ## This compares fsum with data.table (2 threads) and base::rowsum
 # Starting with small data
 mtcDT <- qDT(mtcars)
@@ -128,10 +127,10 @@ microbenchmark(mtcDT[, lapply(.SD, sum), by = f],
  rowsum(mtcDT, f, reorder = FALSE),
  fsum(mtcDT, f, na.rm = FALSE), unit = "relative")
 
- expr min lq mean median uq max neval cld
- mtcDT[, lapply(.SD, sum), by = f] 145.436928 123.542134 88.681111 98.336378 71.880479 85.217726 100 c
- rowsum(mtcDT, f, reorder = FALSE) 2.833333 2.798203 2.489064 2.937889 2.425724 2.181173 100 b
- fsum(mtcDT, f, na.rm = FALSE) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 100 a
+# expr min lq mean median uq max neval cld
+# mtcDT[, lapply(.SD, sum), by = f] 145.436928 123.542134 88.681111 98.336378 71.880479 85.217726 100 c
+# rowsum(mtcDT, f, reorder = FALSE) 2.833333 2.798203 2.489064 2.937889 2.425724 2.181173 100 b
+# fsum(mtcDT, f, na.rm = FALSE) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 100 a
 
 # Now larger data
 tdata <- qDT(replicate(100, rnorm(1e5), simplify = FALSE)) # 100 columns with 100.000 obs
@@ -141,11 +140,11 @@ microbenchmark(tdata[, lapply(.SD, sum), by = f],
  rowsum(tdata, f, reorder = FALSE),
  fsum(tdata, f, na.rm = FALSE), unit = "relative")
 
- expr min lq mean median uq max neval cld
- tdata[, lapply(.SD, sum), by = f] 2.646992 2.975489 2.834771 3.081313 3.120070 1.2766475 100 c
- rowsum(tdata, f, reorder = FALSE) 1.747567 1.753313 1.629036 1.758043 1.839348 0.2720937 100 b
- fsum(tdata, f, na.rm = FALSE) 1.000000 1.000000 1.000000 1.000000 1.000000 1.0000000 100 a
-}
+# expr min lq mean median uq max neval cld
+# tdata[, lapply(.SD, sum), by = f] 2.646992 2.975489 2.834771 3.081313 3.120070 1.2766475 100 c
+# rowsum(tdata, f, reorder = FALSE) 1.747567 1.753313 1.629036 1.758043 1.839348 0.2720937 100 b
+# fsum(tdata, f, na.rm = FALSE) 1.000000 1.000000 1.000000 1.000000 1.000000 1.0000000 100 a
+}\if{html}{\out{</div>}}
 }
 
 \keyword{univar}

diff --git a/man/flm.Rd b/man/flm.Rd
@@ -4,20 +4,19 @@
 \alias{flm.formula}
 
 \title{
-Fast (Weighted) Linear Model Fitting % Sparse,
+Fast (Weighted) Linear Model Fitting
 }
 \description{
 \code{flm} is a fast linear model command that (by default) only returns a coefficient matrix. 6 different efficient fitting methods are implemented: 4 using base R linear algebra, and 2 utilizing the \emph{RcppArmadillo} and \emph{RcppEigen} packages. The function itself only has an overhead of 5-10 microseconds, and is thus well suited as a bootstrap workhorse.
 }
 \usage{
 flm(...) # Internal method dispatch: default if is.atomic(..1)
 
-\method{flm}{default}(y, X, w = NULL, add.icpt = FALSE, return.raw = FALSE, % sparse = FALSE
+\method{flm}{default}(y, X, w = NULL, add.icpt = FALSE, return.raw = FALSE,
  method = c("lm", "solve", "qr", "arma", "chol", "eigen"),
  eigen.method = 3L, ...)
 
 \method{flm}{formula}(formula, data = NULL, weights = NULL, add.icpt = TRUE, ...)
-
 }
 %- maybe also 'usage' for other objects documented here.
 \arguments{