Merge pull request #437 from SebKrantz/development

Development
SebKrantz · Jul 20, 2023 · cbd32f3 · cbd32f3
2 parents 7a3daf8 + fc170ab
commit cbd32f3
Show file tree

Hide file tree

Showing 11 changed files with 115 additions and 6 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: collapse
 Title: Advanced and Fast Data Transformation
 Version: 1.9.6.9000
-Date: 2023-06-14
+Date: 2023-07-20
 Authors@R: c(
  person("Sebastian", "Krantz", role = c("aut", "cre"), 
  email = "[email protected]"),

diff --git a/NAMESPACE b/NAMESPACE
@@ -373,6 +373,7 @@ importFrom("stats", "as.formula", "complete.cases", "cor", "cov", "var", "pt",
  export(fnlevels)
  export(roworder)
  export(roworderv)
+ export(rowbind)
  export(frename)
  export(rnm)
  export(setrename)

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,7 @@
 # collapse 1.9.6.9000
 
+* Added `rowbind()`: a fast class-agnostic alternative to `rbind.data.frame()` and `data.table::rbindlist()`. 
+
 * Fixed a bug in the integer methods of `fsum()`, `fmean()` and `fprod()` that returned `NA` if and only if there was a single integer followed by `NA`'s e.g `fsum(c(1L, NA, NA))` erroneously gave `NA`. This was caused by a C-level shortcut that returned `NA` when the first element of the vector had been reached (moving from back to front) without encountering any non-NA-values. The bug consisted in the content of the first element not being evaluated in this case. Note that this bug did not occur with real numbers, and also not in grouped execution. Thanks @blset for reporting (#432). 
 
 # collapse 1.9.6

diff --git a/R/unlist2d.R b/R/unlist2d.R
@@ -1,4 +1,42 @@
 
+# rbind_list ??
+# Row-bind a list of lists / data frame-like objects.
+#
+# Arguments:
+#   ...        either a single list of list-like objects, or several such
+#              objects which are collected with list(...).
+#   idcol      NULL (no id column), a string naming the id column, or TRUE
+#              (shorthand for ".id"). FALSE is treated like NULL.
+#   use.names  passed to the C routine; forced to TRUE whenever fill is TRUE.
+#   fill       passed to the C routine.
+#   id.factor  FALSE, TRUE or "ordered". When an id column is requested and
+#              the input list is named, the id column is turned into a
+#              (possibly ordered) factor whose levels are the list names.
+#   return     "as.first" copies the attributes of the first input onto the
+#              result; "data.frame" / "data.table" / "tibble" convert with
+#              qDF / qDT / qTBL; "list" returns the raw result. Because the
+#              value is dispatched through switch(), the integers 1-5 select
+#              the same options by position.
+rowbind <- function(..., idcol = NULL, use.names = TRUE, fill = FALSE, id.factor = TRUE,
+                    return = c("as.first", "data.frame", "data.table", "tibble", "list")) {
+
+  l <- if(...length() == 1L && is.list(..1)) unclass(..1) else list(...)
+
+  # Normalise logical shortcuts for idcol before it is used anywhere:
+  # TRUE means the default name ".id", FALSE means "no id column".
+  # Without this, idcol = FALSE has length 1 and would wrongly activate the
+  # id-factor branch below, and idcol = TRUE would reach the C routine as a
+  # logical rather than a column name.
+  if(isTRUE(idcol)) {
+    idcol <- ".id"
+  } else if(isFALSE(idcol)) {
+    idcol <- NULL
+  }
+
+  nam <- names(l)
+  id_fact <- length(idcol) && !isFALSE(id.factor) && length(nam)
+  # Names are dropped so that the id column can be given the list names as
+  # factor levels below.
+  # NOTE(review): presumably the C routine then emits integer ids - confirm.
+  if(id_fact) names(l) <- NULL
+
+  res <- .Call(C_rbindlist, l, use.names || fill, fill, idcol)
+
+  if(id_fact) {
+    # The id column is the first column of the result.
+    attr(res[[1L]], "levels") <- nam
+    # A logical TRUE is coerced to 1 by switch() and therefore selects the
+    # first alternative; the strings "TRUE" and "ordered" match by name.
+    oldClass(res[[1L]]) <- switch(id.factor, `TRUE` = "factor", ordered = c("ordered", "factor"),
+                                  stop('id.factor needs to be FALSE, TRUE or "ordered"'))
+  }
+
+  switch(return[1L],
+    as.first = {
+      a1 <- attributes(.subset2(l, 1L))
+      if(is.null(a1)) return(res) # first input is a plain list: nothing to copy
+      n <- .Call(C_fnrow, res)
+      if(any(a1$class == "data.frame")) {
+        rn <- a1$row.names
+        if(!(is.numeric(rn) || is.null(rn) || rn[1L] == "1")) {
+          # The first input has character row names: concatenate the row names
+          # of all inputs, and fall back to default row names if their total
+          # number does not match the number of rows in the result.
+          all_rn <- do.call(c, lapply(unattrib(l), attr, "row.names"))
+          a1$row.names <- if(length(all_rn) == n) all_rn else .set_row_names(n)
+        } else a1$row.names <- .set_row_names(n)
+      }
+      a1$names <- names(res) # column names always come from the bound result
+      # The return value is not assigned.
+      # NOTE(review): presumably this sets the attributes on res in place - confirm.
+      .Call(C_setattributes, res, a1)
+      if(any(a1$class == "data.table")) return(alc(res))
+      res
+    },
+    data.frame = qDF(res),
+    data.table = qDT(res),
+    tibble = qTBL(res),
+    list = res,
+    stop("Unknown return option: ", return[1L])
+  )
+}
+
 unlist2d <- function(l, idcols = ".id", row.names = FALSE, recursive = TRUE, id.factor = FALSE, DT = FALSE) {
 
  if (!is.list(l)) return(l) # stop("l is not a list")

diff --git a/man/GRP.Rd b/man/GRP.Rd
@@ -199,7 +199,7 @@ Creating a factor from a 'GRP' object using \code{as_factor_GRP} does not involv
  }
 }
 \seealso{
-\code{\link{radixorder}}, \code{\link{qF}}, \link[=fast-grouping-ordering]{Fast Grouping and Ordering}, \link[=collapse-documentation]{Collapse Overview}
+\code{\link{radixorder}}, \code{\link{group}}, \code{\link{qF}}, \link[=fast-grouping-ordering]{Fast Grouping and Ordering}, \link[=collapse-documentation]{Collapse Overview}
 }
 \examples{
 ## default method

diff --git a/man/collapse-documentation.Rd b/man/collapse-documentation.Rd
@@ -23,8 +23,8 @@ The following table fully summarizes the contents of \emph{\link{collapse}}. The
 
 % (speed about 2x '[' for selecting and 4x '[<-' for replacing). %, get data, variables names, variable indices
 
-\link[=fast-data-manipulation]{Fast Data Manipulation} \tab\tab Fast and flexible select, subset, summarise, mutate/transform, sort/reorder, rename and relabel data. Some functions modify by reference and/or allow assignment. In addition a set of (standard evaluation) functions for fast selecting, replacing or adding data frame columns, including shortcuts to select and replace variables by data type.
-\tab\tab \code{\link[=fselect]{fselect(<-)}}, \code{\link[=fsubset]{fsubset/ss}}, \code{\link{fsummarise}}, \code{\link{fmutate}}, \code{\link{across}}, \code{\link[=ftransform]{(f/set)transform(v)(<-)}}, \code{\link[=fcompute]{fcompute(v)}}, \code{\link[=roworder]{roworder(v)}}, \code{\link[=colorder]{colorder(v)}}, \code{\link[=frename]{(f/set)rename}}, \code{\link[=relabel]{(set)relabel}}, \code{\link[=get_vars]{get_vars(<-)}}, \code{\link[=add_vars]{add_vars(<-)}}, \code{\link[=num_vars]{num_vars(<-)}}, \code{\link[=cat_vars]{cat_vars(<-)}}, \code{\link[=char_vars]{char_vars(<-)}}, \code{\link[=fact_vars]{fact_vars(<-)}}, \code{\link[=logi_vars]{logi_vars(<-)}}, \code{\link[=date_vars]{date_vars(<-)}} \cr \cr \cr
+\link[=fast-data-manipulation]{Fast Data Manipulation} \tab\tab Fast and flexible select, subset, summarise, mutate/transform, sort/reorder, combine, rename and relabel data. Some functions modify by reference and/or allow assignment. In addition a set of (standard evaluation) functions for fast selecting, replacing or adding data frame columns, including shortcuts to select and replace variables by data type.
+\tab\tab \code{\link[=fselect]{fselect(<-)}}, \code{\link[=fsubset]{fsubset/ss}}, \code{\link{fsummarise}}, \code{\link{fmutate}}, \code{\link{across}}, \code{\link[=ftransform]{(f/set)transform(v)(<-)}}, \code{\link[=fcompute]{fcompute(v)}}, \code{\link[=roworder]{roworder(v)}}, \code{\link[=colorder]{colorder(v)}}, \code{\link{rowbind}}, \code{\link[=frename]{(f/set)rename}}, \code{\link[=relabel]{(set)relabel}}, \code{\link[=get_vars]{get_vars(<-)}}, \code{\link[=add_vars]{add_vars(<-)}}, \code{\link[=num_vars]{num_vars(<-)}}, \code{\link[=cat_vars]{cat_vars(<-)}}, \code{\link[=char_vars]{char_vars(<-)}}, \code{\link[=fact_vars]{fact_vars(<-)}}, \code{\link[=logi_vars]{logi_vars(<-)}}, \code{\link[=date_vars]{date_vars(<-)}} \cr \cr \cr
 
 \link[=quick-conversion]{Quick Data Conversion} \tab\tab Quick conversions: data.frame <> data.table <> tibble <> matrix (row- or column-wise) <> list | array > matrix, data.frame, data.table, tibble | vector > factor, matrix, data.frame, data.table, tibble; and converting factors / all factor columns. \tab\tab \code{qDF}, \code{qDT}, \code{qTBL}, \code{qM}, \code{qF}, \code{mrtl}, \code{mctl}, \code{as_numeric_factor}, \code{as_character_factor} \cr \cr \cr
 

diff --git a/man/fast-data-manipulation.Rd b/man/fast-data-manipulation.Rd
@@ -12,6 +12,8 @@
 
 \item \code{\link{add_vars}} efficiently adds new columns at any position within a data frame (default at the end). This can be done vie replacement (i.e. \code{add_vars(data) <- newdata}) or returning the appended data (i.e. \code{add_vars(data, newdata1, newdata2, \dots)}). Because of the latter, \code{add_vars} is also a more efficient alternative to \code{cbind.data.frame}.
 
+\item \code{\link{rowbind}} efficiently combines data frames / lists row-wise. The implementation is derived from \code{data.table::rbindlist}; it is also a fast alternative to \code{rbind.data.frame}.
+
 \item \code{\link{fsubset}} is a much faster version of \code{\link{subset}} to efficiently subset vectors, matrices and data frames. If the non-standard evaluation offered by \code{\link{fsubset}} is not needed, the function \code{\link{ss}} is a much faster and also more secure alternative to \code{[.data.frame}.
 
 \item \code{\link{fsummarise}} is a much faster version of \code{dplyr::summarise} when used together with the \link[=fast-statistical-functions]{Fast Statistical Functions} and \code{\link{fgroup_by}}, with whom it also supports super fast weighted aggregation.

diff --git a/man/psacf.Rd b/man/psacf.Rd
@@ -63,7 +63,7 @@ psccf(x, y, \dots)
 \details{
 If \code{gscale = TRUE} data are standardized within each group (using \code{\link{fscale}}) such that the group-mean is 0 and the group-standard deviation is 1. This is strongly recommended for most panels to get rid of individual-specific heterogeneity which would corrupt the ACF computations.
 
-After scaling, \code{psacf}, \code{pspacf} and \code{psccf} compute the ACF/CCF by creating a matrix of panel-lags of the series using \code{\link{flag}} and then correlating this matrix with the series (\code{x, y}) using \code{\link{cor}} and pairwise-complete observations. This may require a lot of memory on large data, but is done because passing a sequence of lags to \code{\link{flag}} and thus calling \code{\link{flag}} and \code{\link{cor}} one time is much faster than calling them \code{lag.max} times. The partial ACF is computed from the ACF using a Yule-Walker decomposition, in the same way as in \code{\link{pacf}}.
+After scaling, \code{psacf}, \code{pspacf} and \code{psccf} compute the ACF/CCF by creating a matrix of panel-lags of the series using \code{\link{flag}} and then computing the covariance of this matrix with the series (\code{x, y}) using \code{\link{cov}} and pairwise-complete observations, and dividing by the variance (of \code{x, y}). Creating the lag matrix may require a lot of memory on large data, but passing a sequence of lags to \code{\link{flag}} and thus calling \code{\link{flag}} and \code{\link{cov}} one time is generally much faster than calling them \code{lag.max} times. The partial ACF is computed from the ACF using a Yule-Walker decomposition, in the same way as in \code{\link{pacf}}.
 }
 \value{
 An object of class 'acf', see \code{\link{acf}}. The result is returned invisibly if \code{plot = TRUE}.}

diff --git a/man/rowbind.Rd b/man/rowbind.Rd
@@ -0,0 +1,57 @@
+\name{rowbind}
+\alias{rowbind}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{
+Row-Binding Lists / Data Frame-Like Objects
+}
+\description{
+\emph{collapse}'s version of \code{data.table::rbindlist} and \code{rbind.data.frame}. The core code is copied from \emph{data.table}, which deserves all credit for the implementation. \code{rowbind} only binds (non-nested) lists / data.frame's. For a more flexible recursive version see \code{\link{unlist2d}}. To combine lists column-wise see \code{\link{add_vars}} or \code{\link{ftransform}} (with replacement).
+}
+\usage{
+rowbind(\dots, idcol = NULL, use.names = TRUE, fill = FALSE, id.factor = TRUE,
+ return = c("as.first", "data.frame", "data.table", "tibble", "list"))
+}
+\arguments{
+ \item{\dots}{a single list of list-like objects (data.frames) or comma separated objects (internally assembled using \code{list(\dots)}). Names can be supplied if \code{!is.null(idcol)}.}
+
+ \item{idcol}{character. The name of an id-column to be generated identifying the source of rows in the final object. Using \code{idcol = TRUE} will set the name to \code{".id"}. If the input list has names, these will form the content of the id column, otherwise integers are used. To save memory, it is advised to keep \code{id.factor = TRUE}.}
+
+ \item{use.names}{logical. \code{TRUE} binds by matching column name, \code{FALSE} by position. }
+
+ \item{fill}{logical. \code{TRUE} fills missing columns with NAs. When \code{TRUE}, \code{use.names} is set to \code{TRUE}.}
+
+ \item{id.factor}{logical. If \code{TRUE}, an id column is requested via \code{idcol}, and the input list is named, create the id column as a factor instead of a character vector. It is also possible to specify \code{id.factor = "ordered"} to generate an ordered factor id. A factor id is much more memory efficient than a character id, and is thus enabled by default. }
+
+\item{return}{an integer or string specifying what to return. \code{1 - "as.first"} preserves the attributes of the first element of the list, \code{2/3/4 - "data.frame"/"data.table"/"tibble"} coerces to specific objects, and \code{5 - "list"} returns a (named) list. }
+
+}
+
+\value{
+a long list or data frame-like object formed by combining the rows / elements of the input objects. The \code{return} argument controls the exact format of the output.
+}
+
+
+\seealso{
+\code{\link{unlist2d}}, \code{\link{add_vars}}, \code{\link{ftransform}}, \link[=fast-data-manipulation]{Data Frame Manipulation}, \link[=collapse-documentation]{Collapse Overview}
+}
+\examples{
+# These are the same
+rowbind(mtcars, mtcars)
+rowbind(list(mtcars, mtcars))
+
+# With id column
+rowbind(mtcars, mtcars, idcol = "id")
+rowbind(a = mtcars, b = mtcars, idcol = "id") # by default factor to save memory
+
+# Filling up columns
+rowbind(mtcars, mtcars[2:8], fill = TRUE)
+}
+% Add one or more standard keywords, see file 'KEYWORDS' in the
+% R documentation directory (show via RShowDoc("KEYWORDS")):
+\keyword{manip}
+% \keyword{ ~kwd2 }
+% Use only one keyword per line.
+% For non-standard keywords, use \concept instead of \keyword:
+% \concept{ ~cpt1 }
+% \concept{ ~cpt2 }
+% Use only one concept per line.
diff --git a/man/select_replace_vars.Rd b/man/select_replace_vars.Rd
@@ -115,7 +115,7 @@ Thus they can freely be applied to data.table's, grouped tibbles, panel data fra
 In many cases functions here only check the length of the first column, which is one of the reasons why they are so fast. When lists of unequal-length columns are offered as replacements this yields a malformed data frame (which will also print a warning in the console i.e. you will notice that).
 }
 \seealso{
-\code{\link{fsubset}}, \code{\link{ftransform}}, \link[=fast-data-manipulation]{Data Frame Manipulation}, \link[=collapse-documentation]{Collapse Overview}
+\code{\link{fsubset}}, \code{\link{ftransform}}, \code{\link{rowbind}}, \link[=fast-data-manipulation]{Data Frame Manipulation}, \link[=collapse-documentation]{Collapse Overview}
 }
 \examples{
 ## Wold Development Data

diff --git a/tests/testthat/test-misc.R b/tests/testthat/test-misc.R
@@ -356,3 +356,12 @@ test_that("fdist works properly", {
  set_collapse(oldopts)
  }
 })
+
+test_that("rowbind", {
+  # Shared fixtures for the expectations below
+  unnamed_pair <- list(mtcars, mtcars)
+  named_pair <- list(a = mtcars, b = mtcars)
+  base_bound <- rbind(mtcars, mtcars)
+
+  # Separate arguments: the character row names of both inputs are expected
+  # to be concatenated in the result
+  expect_equal(
+    rowbind(mtcars, mtcars),
+    setRownames(base_bound, rep(attr(mtcars, "row.names"), 2))
+  )
+
+  # A single list argument, compared against base rbind() after both sides
+  # have been passed through setRownames()
+  expect_equal(
+    setRownames(rowbind(unnamed_pair)),
+    setRownames(base_bound)
+  )
+
+  # Agreement with unlist2d(), without and with an id column
+  expect_equal(
+    setRownames(rowbind(mtcars, mtcars)),
+    unlist2d(unnamed_pair, idcols = FALSE)
+  )
+  expect_equal(
+    setRownames(rowbind(mtcars, mtcars, idcol = "id")),
+    unlist2d(unnamed_pair, idcols = "id")
+  )
+
+  # Named inputs: factor id by default, non-factor id with id.factor = FALSE
+  expect_equal(
+    setRownames(rowbind(a = mtcars, b = mtcars, idcol = "id")),
+    unlist2d(named_pair, idcols = "id", id.factor = TRUE)
+  )
+  expect_equal(
+    setRownames(rowbind(a = mtcars, b = mtcars, idcol = "id", id.factor = FALSE)),
+    unlist2d(named_pair, idcols = "id")
+  )
+})