From 5935d335f14f1c76d366d67ce83df0e696d74f78 Mon Sep 17 00:00:00 2001 From: Sebastian Krantz Date: Thu, 20 Jul 2023 13:30:54 +0200 Subject: [PATCH 1/5] Minor documentation fix. --- man/psacf.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/psacf.Rd b/man/psacf.Rd index f0132492..21d664ce 100644 --- a/man/psacf.Rd +++ b/man/psacf.Rd @@ -63,7 +63,7 @@ psccf(x, y, \dots) \details{ If \code{gscale = TRUE} data are standardized within each group (using \code{\link{fscale}}) such that the group-mean is 0 and the group-standard deviation is 1. This is strongly recommended for most panels to get rid of individual-specific heterogeneity which would corrupt the ACF computations. -After scaling, \code{psacf}, \code{pspacf} and \code{psccf} compute the ACF/CCF by creating a matrix of panel-lags of the series using \code{\link{flag}} and then correlating this matrix with the series (\code{x, y}) using \code{\link{cor}} and pairwise-complete observations. This may require a lot of memory on large data, but is done because passing a sequence of lags to \code{\link{flag}} and thus calling \code{\link{flag}} and \code{\link{cor}} one time is much faster than calling them \code{lag.max} times. The partial ACF is computed from the ACF using a Yule-Walker decomposition, in the same way as in \code{\link{pacf}}. +After scaling, \code{psacf}, \code{pspacf} and \code{psccf} compute the ACF/CCF by creating a matrix of panel-lags of the series using \code{\link{flag}} and then computing the covariance of this matrix with the series (\code{x, y}) using \code{\link{cov}} and pairwise-complete observations, and dividing by the variance (of \code{x, y}). Creating the lag matrix may require a lot of memory on large data, but passing a sequence of lags to \code{\link{flag}} and thus calling \code{\link{flag}} and \code{\link{cov}} one time is generally much faster than calling them \code{lag.max} times. 
The partial ACF is computed from the ACF using a Yule-Walker decomposition, in the same way as in \code{\link{pacf}}. } \value{ An object of class 'acf', see \code{\link{acf}}. The result is returned invisibly if \code{plot = TRUE}.} From 893bf0b7683af7cfa6b3f8ade40fd94ebe75b2f6 Mon Sep 17 00:00:00 2001 From: Sebastian Krantz Date: Thu, 20 Jul 2023 13:31:20 +0200 Subject: [PATCH 2/5] Linking to group() as well. --- man/GRP.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/GRP.Rd b/man/GRP.Rd index c4a528f7..5e07fb8d 100644 --- a/man/GRP.Rd +++ b/man/GRP.Rd @@ -199,7 +199,7 @@ Creating a factor from a 'GRP' object using \code{as_factor_GRP} does not involv } } \seealso{ -\code{\link{radixorder}}, \code{\link{qF}}, \link[=fast-grouping-ordering]{Fast Grouping and Ordering}, \link[=collapse-documentation]{Collapse Overview} +\code{\link{radixorder}}, \code{\link{group}}, \code{\link{qF}}, \link[=fast-grouping-ordering]{Fast Grouping and Ordering}, \link[=collapse-documentation]{Collapse Overview} } \examples{ ## default method From be545e441c11a42d85cf558851944afd260f4deb Mon Sep 17 00:00:00 2001 From: Sebastian Krantz Date: Thu, 20 Jul 2023 14:13:07 +0200 Subject: [PATCH 3/5] Adding rowbind(): fast class-agnostic row-wise combining data. 
--- NAMESPACE | 1 + NEWS.md | 2 ++ R/unlist2d.R | 38 +++++++++++++++++++++++ man/collapse-documentation.Rd | 4 +-- man/fast-data-manipulation.Rd | 2 ++ man/rowbind.Rd | 58 +++++++++++++++++++++++++++++++++++ man/select_replace_vars.Rd | 2 +- tests/testthat/test-misc.R | 9 ++++++ 8 files changed, 113 insertions(+), 3 deletions(-) create mode 100644 man/rowbind.Rd diff --git a/NAMESPACE b/NAMESPACE index 68a7d5dd..a33625b1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -373,6 +373,7 @@ importFrom("stats", "as.formula", "complete.cases", "cor", "cov", "var", "pt", export(fnlevels) export(roworder) export(roworderv) + export(rowbind) export(frename) export(rnm) export(setrename) diff --git a/NEWS.md b/NEWS.md index e692e2c9..5b6ababc 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,7 @@ # collapse 1.9.6.9000 +* Added `rowbind()`: a fast class-agnostic alternative to `rbind.data.frame()` and `data.table::rbindlist()`. + * Fixed a bug in the integer methods of `fsum()`, `fmean()` and `fprod()` that returned `NA` if and only if there was a single integer followed by `NA`'s e.g `fsum(c(1L, NA, NA))` erroneously gave `NA`. This was caused by a C-level shortcut that returned `NA` when the first element of the vector had been reached (moving from back to front) without encountering any non-NA-values. The bug consisted in the content of the first element not being evaluated in this case. Note that this bug did not occur with real numbers, and also not in grouped execution. Thanks @blset for reporting (#432). # collapse 1.9.6 diff --git a/R/unlist2d.R b/R/unlist2d.R index 62342e58..fc702fed 100644 --- a/R/unlist2d.R +++ b/R/unlist2d.R @@ -1,4 +1,42 @@ +# rbind_list ?? +rowbind <- function(..., idcol = NULL, use.names = TRUE, fill = FALSE, id.factor = TRUE, + return = c("as.first", "data.frame", "data.table", "tibble", "list")) { + + l <- if(...length() == 1L && is.list(..1)) unclass(..1) else list(...) 
+ id_fact <- length(idcol) && !isFALSE(id.factor) && length(nam <- names(l)) + if(id_fact) names(l) <- NULL + res <- .Call(C_rbindlist, l, use.names || fill, fill, idcol) + if(id_fact) { + attr(res[[1L]], "levels") <- nam + oldClass(res[[1L]]) <- switch(id.factor, `TRUE` = "factor", ordered = c("ordered", "factor"), + stop('id.factor needs to be FALSE, TRUE or "ordered"')) + } + switch(return[1L], + as.first = { + a1 <- attributes(.subset2(l, 1L)) + if(is.null(a1)) return(res) + n <- .Call(C_fnrow, res) + if(any(a1$class == "data.frame")) { + rn <- a1$row.names + if(!(is.numeric(rn) || is.null(rn) || rn[1L] == "1")) { # data.frame's + all_rn <- do.call(c, lapply(unattrib(l), attr, "row.names")) + a1$row.names <- if(length(all_rn) == n) all_rn else .set_row_names(n) + } else a1$row.names <- .set_row_names(n) + } + a1$names <- names(res) + .Call(C_setattributes, res, a1) + if(any(a1$class == "data.table")) return(alc(res)) + res + }, + data.frame = qDF(res), + data.table = qDT(res), + tibble = qTBL(res), + list = res, + stop("Unknown return option: ", return[1L]) + ) +} + unlist2d <- function(l, idcols = ".id", row.names = FALSE, recursive = TRUE, id.factor = FALSE, DT = FALSE) { if (!is.list(l)) return(l) # stop("l is not a list") diff --git a/man/collapse-documentation.Rd b/man/collapse-documentation.Rd index 0a49f9b9..7c288626 100644 --- a/man/collapse-documentation.Rd +++ b/man/collapse-documentation.Rd @@ -23,8 +23,8 @@ The following table fully summarizes the contents of \emph{\link{collapse}}. The % (speed about 2x '[' for selecting and 4x '[<-' for replacing). %, get data, variables names, variable indices -\link[=fast-data-manipulation]{Fast Data Manipulation} \tab\tab Fast and flexible select, subset, summarise, mutate/transform, sort/reorder, rename and relabel data. Some functions modify by reference and/or allow assignment. 
In addition a set of (standard evaluation) functions for fast selecting, replacing or adding data frame columns, including shortcuts to select and replace variables by data type. -\tab\tab \code{\link[=fselect]{fselect(<-)}}, \code{\link[=fsubset]{fsubset/ss}}, \code{\link{fsummarise}}, \code{\link{fmutate}}, \code{\link{across}}, \code{\link[=ftransform]{(f/set)transform(v)(<-)}}, \code{\link[=fcompute]{fcompute(v)}}, \code{\link[=roworder]{roworder(v)}}, \code{\link[=colorder]{colorder(v)}}, \code{\link[=frename]{(f/set)rename}}, \code{\link[=relabel]{(set)relabel}}, \code{\link[=get_vars]{get_vars(<-)}}, \code{\link[=add_vars]{add_vars(<-)}}, \code{\link[=num_vars]{num_vars(<-)}}, \code{\link[=cat_vars]{cat_vars(<-)}}, \code{\link[=char_vars]{char_vars(<-)}}, \code{\link[=fact_vars]{fact_vars(<-)}}, \code{\link[=logi_vars]{logi_vars(<-)}}, \code{\link[=date_vars]{date_vars(<-)}} \cr \cr \cr +\link[=fast-data-manipulation]{Fast Data Manipulation} \tab\tab Fast and flexible select, subset, summarise, mutate/transform, sort/reorder, combine, rename and relabel data. Some functions modify by reference and/or allow assignment. In addition a set of (standard evaluation) functions for fast selecting, replacing or adding data frame columns, including shortcuts to select and replace variables by data type. 
+\tab\tab \code{\link[=fselect]{fselect(<-)}}, \code{\link[=fsubset]{fsubset/ss}}, \code{\link{fsummarise}}, \code{\link{fmutate}}, \code{\link{across}}, \code{\link[=ftransform]{(f/set)transform(v)(<-)}}, \code{\link[=fcompute]{fcompute(v)}}, \code{\link[=roworder]{roworder(v)}}, \code{\link[=colorder]{colorder(v)}}, \code{\link{rowbind}}, \code{\link[=frename]{(f/set)rename}}, \code{\link[=relabel]{(set)relabel}}, \code{\link[=get_vars]{get_vars(<-)}}, \code{\link[=add_vars]{add_vars(<-)}}, \code{\link[=num_vars]{num_vars(<-)}}, \code{\link[=cat_vars]{cat_vars(<-)}}, \code{\link[=char_vars]{char_vars(<-)}}, \code{\link[=fact_vars]{fact_vars(<-)}}, \code{\link[=logi_vars]{logi_vars(<-)}}, \code{\link[=date_vars]{date_vars(<-)}} \cr \cr \cr \link[=quick-conversion]{Quick Data Conversion} \tab\tab Quick conversions: data.frame <> data.table <> tibble <> matrix (row- or column-wise) <> list | array > matrix, data.frame, data.table, tibble | vector > factor, matrix, data.frame, data.table, tibble; and converting factors / all factor columns. \tab\tab \code{qDF}, \code{qDT}, \code{qTBL}, \code{qM}, \code{qF}, \code{mrtl}, \code{mctl}, \code{as_numeric_factor}, \code{as_character_factor} \cr \cr \cr diff --git a/man/fast-data-manipulation.Rd b/man/fast-data-manipulation.Rd index a704a790..0460dc5c 100644 --- a/man/fast-data-manipulation.Rd +++ b/man/fast-data-manipulation.Rd @@ -12,6 +12,8 @@ \item \code{\link{add_vars}} efficiently adds new columns at any position within a data frame (default at the end). This can be done vie replacement (i.e. \code{add_vars(data) <- newdata}) or returning the appended data (i.e. \code{add_vars(data, newdata1, newdata2, \dots)}). Because of the latter, \code{add_vars} is also a more efficient alternative to \code{cbind.data.frame}. +\item \code{\link{rowbind}} efficiently combines data frames / lists row-wise. The implementation is derived from \code{data.table::rbindlist}, it is also a fast alternative to \code{rbind.data.frame}. 
+ \item \code{\link{fsubset}} is a much faster version of \code{\link{subset}} to efficiently subset vectors, matrices and data frames. If the non-standard evaluation offered by \code{\link{fsubset}} is not needed, the function \code{\link{ss}} is a much faster and also more secure alternative to \code{[.data.frame}. \item \code{\link{fsummarise}} is a much faster version of \code{dplyr::summarise} when used together with the \link[=fast-statistical-functions]{Fast Statistical Functions} and \code{\link{fgroup_by}}, with whom it also supports super fast weighted aggregation. diff --git a/man/rowbind.Rd b/man/rowbind.Rd new file mode 100644 index 00000000..3f954991 --- /dev/null +++ b/man/rowbind.Rd @@ -0,0 +1,58 @@ +\name{rowbind} +\alias{rowbind} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ +Row-Binding Lists / Data Frame-Like Objects +} +\description{ +\emph{collapse}'s version of \code{data.table::rbindlist} and \code{rbind.data.frame}. The core code is copied from \emph{data.table}, which deserves all credit for the implementation. \code{rowbind} only binds (non-nested) lists / data.frame's. For a more flexible recursive version see \code{\link{unlist2d}}. To combine lists column-wise see \code{\link{add_vars}} or \code{\link{ftransform}} (with replacement). +} +\usage{ +rowbind(\dots, idcol = NULL, use.names = TRUE, fill = FALSE, id.factor = TRUE, + return = c("as.first", "data.frame", "data.table", "tibble", "list")) +%- maybe also 'usage' for other objects documented here. +} +\arguments{ + \item{\dots}{a single list of list-like objects (data.frames) or comma separated objects (internally assembled using \code{list(\dots)}). Names can be supplied if \code{!is.null(idcol)}.} + + \item{idcol}{character. The name of an id-column to be generated identifying the source of rows in the final object. Using \code{idcol = TRUE} will set the name to \code{".id"}.
If the input list has names, these will form the content of the id column, otherwise integers are used. To save memory, it is advised to keep \code{id.factor = TRUE}.} + + \item{use.names}{logical. \code{TRUE} binds by matching column name, \code{FALSE} by position. } + + \item{fill}{logical. \code{TRUE} fills missing columns with NAs. When \code{TRUE}, \code{use.names} is set to \code{TRUE}.} + + \item{id.factor}{logical. If \code{TRUE}, \code{!is.null(idcol)}, and the input list is named, the id column is created as a factor instead of a character vector. It is also possible to specify \code{id.factor = "ordered"} to generate an ordered factor id. This is much more memory efficient than a character id, and thus enabled by default. } + +\item{return}{an integer or string specifying what to return. \code{1 - "as.first"} preserves the attributes of the first element of the list, \code{2/3/4 - "data.frame"/"data.table"/"tibble"} coerces to specific objects, and \code{5 - "list"} returns a (named) list. } + +} + +\value{ +a long list or data frame-like object formed by combining the rows / elements of the input objects. The \code{return} argument controls the exact format of the output. +} + + +\seealso{ +\code{\link{unlist2d}}, \code{\link{add_vars}}, \code{\link{ftransform}}, \link[=fast-data-manipulation]{Data Frame Manipulation}, \link[=collapse-documentation]{Collapse Overview} +} +\examples{ +# These are the same +rowbind(mtcars, mtcars) +rowbind(list(mtcars, mtcars)) + +# With id column +rowbind(mtcars, mtcars, idcol = "id") +rowbind(a = mtcars, b = mtcars, idcol = "id") # by default factor to save memory + +# Filling up columns +rowbind(mtcars, mtcars[2:8], fill = TRUE) +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory (show via RShowDoc("KEYWORDS")): +\keyword{manip} +% \keyword{ ~kwd2 } +% Use only one keyword per line.
+% For non-standard keywords, use \concept instead of \keyword: +% \concept{ ~cpt1 } +% \concept{ ~cpt2 } +% Use only one concept per line. diff --git a/man/select_replace_vars.Rd b/man/select_replace_vars.Rd index 23ca1bf8..0c74f869 100644 --- a/man/select_replace_vars.Rd +++ b/man/select_replace_vars.Rd @@ -115,7 +115,7 @@ Thus they can freely be applied to data.table's, grouped tibbles, panel data fra In many cases functions here only check the length of the first column, which is one of the reasons why they are so fast. When lists of unequal-length columns are offered as replacements this yields a malformed data frame (which will also print a warning in the console i.e. you will notice that). } \seealso{ -\code{\link{fsubset}}, \code{\link{ftransform}}, \link[=fast-data-manipulation]{Data Frame Manipulation}, \link[=collapse-documentation]{Collapse Overview} +\code{\link{fsubset}}, \code{\link{ftransform}}, \code{\link{rowbind}}, \link[=fast-data-manipulation]{Data Frame Manipulation}, \link[=collapse-documentation]{Collapse Overview} } \examples{ ## Wold Development Data diff --git a/tests/testthat/test-misc.R b/tests/testthat/test-misc.R index 66cfceff..e592f73e 100644 --- a/tests/testthat/test-misc.R +++ b/tests/testthat/test-misc.R @@ -356,3 +356,12 @@ test_that("fdist works properly", { set_collapse(oldopts) } }) + +test_that("rowbind", { + expect_equal(rowbind(mtcars, mtcars), setRownames(rbind(mtcars, mtcars), rep(attr(mtcars, "row.names"), 2))) + expect_equal(setRownames(rowbind(list(mtcars, mtcars))), setRownames(rbind(mtcars, mtcars))) + expect_equal(setRownames(rowbind(mtcars, mtcars)), unlist2d(list(mtcars, mtcars), idcols = FALSE)) + expect_equal(setRownames(rowbind(mtcars, mtcars, idcol = "id")), unlist2d(list(mtcars, mtcars), idcols = "id")) + expect_equal(setRownames(rowbind(a = mtcars, b = mtcars, idcol = "id")), unlist2d(list(a = mtcars, b = mtcars), idcols = "id", id.factor = TRUE)) + expect_equal(setRownames(rowbind(a = mtcars, b = mtcars, 
idcol = "id", id.factor = FALSE)), unlist2d(list(a = mtcars, b = mtcars), idcols = "id")) +}) From 7d8f4c4424a66e6c75a732ba563c314f534f68a8 Mon Sep 17 00:00:00 2001 From: Sebastian Krantz Date: Thu, 20 Jul 2023 14:13:51 +0200 Subject: [PATCH 4/5] Update date. --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 5317ec1d..2b8b314b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: collapse Title: Advanced and Fast Data Transformation Version: 1.9.6.9000 -Date: 2023-06-14 +Date: 2023-07-20 Authors@R: c( person("Sebastian", "Krantz", role = c("aut", "cre"), email = "sebastian.krantz@graduateinstitute.ch"), From fc170ab975243283e60e5dbb52b46cee300168a0 Mon Sep 17 00:00:00 2001 From: Sebastian Krantz Date: Thu, 20 Jul 2023 14:26:03 +0200 Subject: [PATCH 5/5] Removing empty line. --- man/rowbind.Rd | 1 - 1 file changed, 1 deletion(-) diff --git a/man/rowbind.Rd b/man/rowbind.Rd index 3f954991..e2e1a142 100644 --- a/man/rowbind.Rd +++ b/man/rowbind.Rd @@ -10,7 +10,6 @@ Row-Binding Lists / Data Frame-Like Objects \usage{ rowbind(\dots, idcol = NULL, use.names = TRUE, fill = FALSE, id.factor = TRUE, return = c("as.first", "data.frame", "data.table", "tibble", "list")) -%- maybe also 'usage' for other objects documented here. } \arguments{ \item{\dots}{a single list of list-like objects (data.frames) or comma separated objects (internally assembled using \code{list(\dots)}). Names can be supplied if \code{!is.null(idcol)}.}