Skip to content

Commit

Permalink
Merge pull request #437 from SebKrantz/development
Browse files Browse the repository at this point in the history
Development
  • Loading branch information
SebKrantz committed Jul 20, 2023
2 parents 7a3daf8 + fc170ab commit cbd32f3
Show file tree
Hide file tree
Showing 11 changed files with 115 additions and 6 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: collapse
Title: Advanced and Fast Data Transformation
Version: 1.9.6.9000
Date: 2023-06-14
Date: 2023-07-20
Authors@R: c(
person("Sebastian", "Krantz", role = c("aut", "cre"),
email = "[email protected]"),
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,7 @@ importFrom("stats", "as.formula", "complete.cases", "cor", "cov", "var", "pt",
export(fnlevels)
export(roworder)
export(roworderv)
export(rowbind)
export(frename)
export(rnm)
export(setrename)
Expand Down
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# collapse 1.9.6.9000

* Added `rowbind()`: a fast class-agnostic alternative to `rbind.data.frame()` and `data.table::rbindlist()`.

* Fixed a bug in the integer methods of `fsum()`, `fmean()` and `fprod()` that returned `NA` if and only if there was a single integer followed by `NA`'s, e.g. `fsum(c(1L, NA, NA))` erroneously gave `NA`. This was caused by a C-level shortcut that returned `NA` when the first element of the vector had been reached (moving from back to front) without encountering any non-NA-values. The bug consisted in the content of the first element not being evaluated in this case. Note that this bug did not occur with real numbers, and also not in grouped execution. Thanks @blset for reporting (#432).

# collapse 1.9.6
Expand Down
38 changes: 38 additions & 0 deletions R/unlist2d.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,42 @@

# rbind_list ??
# Row-bind a list of lists / data frame-like objects into one long object.
#
# Arguments:
#   ...        either a single list of list-like objects, or several such
#              objects separated by commas (then assembled with list(...)).
#   idcol      NULL (no id column), a string naming the id column, or TRUE
#              (the id column is named ".id"). FALSE is treated like NULL.
#   use.names  TRUE binds by column name, FALSE by position.
#   fill       TRUE fills missing columns; this forces matching by name.
#   id.factor  TRUE or "ordered": if an id column is requested and the input
#              list has names, the id column becomes an (ordered) factor whose
#              levels are those names. FALSE disables the conversion.
#   return     string (or position) selecting the output format: "as.first"
#              copies the attributes of the first input onto the result;
#              "data.frame" / "data.table" / "tibble" coerce via qDF / qDT /
#              qTBL; "list" returns the bound result unchanged.
#
# Value: the result of C_rbindlist, post-processed according to `return`.
rowbind <- function(..., idcol = NULL, use.names = TRUE, fill = FALSE, id.factor = TRUE,
                    return = c("as.first", "data.frame", "data.table", "tibble", "list")) {

  l <- if(...length() == 1L && is.list(..1)) unclass(..1) else list(...)

  # Normalise a logical idcol at the R level: TRUE means the documented default
  # name ".id"; FALSE means "no id column". Without this, FALSE has length 1
  # and would wrongly switch on the factor conversion of the first column.
  if(isTRUE(idcol)) idcol <- ".id" else if(isFALSE(idcol)) idcol <- NULL

  # A factor id is only built when an id column is requested, not disabled via
  # id.factor, and the input list carries names (used as factor levels).
  id_fact <- length(idcol) && !isFALSE(id.factor) && length(nam <- names(l))
  # Names are dropped before binding; the id column is then expected to hold
  # integer positions, which are turned into a factor below.
  if(id_fact) names(l) <- NULL
  res <- .Call(C_rbindlist, l, use.names || fill, fill, idcol)
  if(id_fact) {
    # NOTE(review): assumes the id column is the first column of res.
    attr(res[[1L]], "levels") <- nam
    # id.factor = TRUE is not a string, so switch() selects the first
    # alternative by position; "ordered" is matched by name.
    oldClass(res[[1L]]) <- switch(id.factor, `TRUE` = "factor", ordered = c("ordered", "factor"),
                                  stop('id.factor needs to be FALSE, TRUE or "ordered"'))
  }
  switch(return[1L],
    as.first = {
      # Guard against an empty input list, where .subset2(l, 1L) would fail.
      a1 <- if(length(l)) attributes(.subset2(l, 1L))
      if(is.null(a1)) return(res)
      n <- .Call(C_fnrow, res)
      if(any(a1$class == "data.frame")) {
        rn <- a1$row.names
        # isTRUE() protects against rn[1L] being NA (zero-length character
        # row names), which would otherwise make the condition NA and error.
        if(!(is.numeric(rn) || is.null(rn) || isTRUE(rn[1L] == "1"))) { # data.frame's
          # Character row names: concatenate the row names of all inputs and
          # use them only if they cover exactly the rows of the result.
          all_rn <- do.call(c, lapply(unattrib(l), attr, "row.names"))
          a1$row.names <- if(length(all_rn) == n) all_rn else .set_row_names(n)
        } else a1$row.names <- .set_row_names(n)
      }
      # The result may have more columns than the first input (id / filled).
      a1$names <- names(res)
      .Call(C_setattributes, res, a1)
      if(any(a1$class == "data.table")) return(alc(res))
      res
    },
    data.frame = qDF(res),
    data.table = qDT(res),
    tibble = qTBL(res),
    list = res,
    stop("Unknown return option: ", return[1L])
  )
}

unlist2d <- function(l, idcols = ".id", row.names = FALSE, recursive = TRUE, id.factor = FALSE, DT = FALSE) {

if (!is.list(l)) return(l) # stop("l is not a list")
Expand Down
2 changes: 1 addition & 1 deletion man/GRP.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ Creating a factor from a 'GRP' object using \code{as_factor_GRP} does not involv
}
}
\seealso{
\code{\link{radixorder}}, \code{\link{qF}}, \link[=fast-grouping-ordering]{Fast Grouping and Ordering}, \link[=collapse-documentation]{Collapse Overview}
\code{\link{radixorder}}, \code{\link{group}}, \code{\link{qF}}, \link[=fast-grouping-ordering]{Fast Grouping and Ordering}, \link[=collapse-documentation]{Collapse Overview}
}
\examples{
## default method
Expand Down
4 changes: 2 additions & 2 deletions man/collapse-documentation.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ The following table fully summarizes the contents of \emph{\link{collapse}}. The
% (speed about 2x '[' for selecting and 4x '[<-' for replacing). %, get data, variables names, variable indices
\link[=fast-data-manipulation]{Fast Data Manipulation} \tab\tab Fast and flexible select, subset, summarise, mutate/transform, sort/reorder, rename and relabel data. Some functions modify by reference and/or allow assignment. In addition a set of (standard evaluation) functions for fast selecting, replacing or adding data frame columns, including shortcuts to select and replace variables by data type.
\tab\tab \code{\link[=fselect]{fselect(<-)}}, \code{\link[=fsubset]{fsubset/ss}}, \code{\link{fsummarise}}, \code{\link{fmutate}}, \code{\link{across}}, \code{\link[=ftransform]{(f/set)transform(v)(<-)}}, \code{\link[=fcompute]{fcompute(v)}}, \code{\link[=roworder]{roworder(v)}}, \code{\link[=colorder]{colorder(v)}}, \code{\link[=frename]{(f/set)rename}}, \code{\link[=relabel]{(set)relabel}}, \code{\link[=get_vars]{get_vars(<-)}}, \code{\link[=add_vars]{add_vars(<-)}}, \code{\link[=num_vars]{num_vars(<-)}}, \code{\link[=cat_vars]{cat_vars(<-)}}, \code{\link[=char_vars]{char_vars(<-)}}, \code{\link[=fact_vars]{fact_vars(<-)}}, \code{\link[=logi_vars]{logi_vars(<-)}}, \code{\link[=date_vars]{date_vars(<-)}} \cr \cr \cr
\link[=fast-data-manipulation]{Fast Data Manipulation} \tab\tab Fast and flexible select, subset, summarise, mutate/transform, sort/reorder, combine, rename and relabel data. Some functions modify by reference and/or allow assignment. In addition a set of (standard evaluation) functions for fast selecting, replacing or adding data frame columns, including shortcuts to select and replace variables by data type.
\tab\tab \code{\link[=fselect]{fselect(<-)}}, \code{\link[=fsubset]{fsubset/ss}}, \code{\link{fsummarise}}, \code{\link{fmutate}}, \code{\link{across}}, \code{\link[=ftransform]{(f/set)transform(v)(<-)}}, \code{\link[=fcompute]{fcompute(v)}}, \code{\link[=roworder]{roworder(v)}}, \code{\link[=colorder]{colorder(v)}}, \code{\link{rowbind}}, \code{\link[=frename]{(f/set)rename}}, \code{\link[=relabel]{(set)relabel}}, \code{\link[=get_vars]{get_vars(<-)}}, \code{\link[=add_vars]{add_vars(<-)}}, \code{\link[=num_vars]{num_vars(<-)}}, \code{\link[=cat_vars]{cat_vars(<-)}}, \code{\link[=char_vars]{char_vars(<-)}}, \code{\link[=fact_vars]{fact_vars(<-)}}, \code{\link[=logi_vars]{logi_vars(<-)}}, \code{\link[=date_vars]{date_vars(<-)}} \cr \cr \cr
\link[=quick-conversion]{Quick Data Conversion} \tab\tab Quick conversions: data.frame <> data.table <> tibble <> matrix (row- or column-wise) <> list | array > matrix, data.frame, data.table, tibble | vector > factor, matrix, data.frame, data.table, tibble; and converting factors / all factor columns. \tab\tab \code{qDF}, \code{qDT}, \code{qTBL}, \code{qM}, \code{qF}, \code{mrtl}, \code{mctl}, \code{as_numeric_factor}, \code{as_character_factor} \cr \cr \cr
Expand Down
2 changes: 2 additions & 0 deletions man/fast-data-manipulation.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@

\item \code{\link{add_vars}} efficiently adds new columns at any position within a data frame (default at the end). This can be done via replacement (i.e. \code{add_vars(data) <- newdata}) or returning the appended data (i.e. \code{add_vars(data, newdata1, newdata2, \dots)}). Because of the latter, \code{add_vars} is also a more efficient alternative to \code{cbind.data.frame}.

\item \code{\link{rowbind}} efficiently combines data frames / lists row-wise. The implementation is derived from \code{data.table::rbindlist}; it is also a fast alternative to \code{rbind.data.frame}.

\item \code{\link{fsubset}} is a much faster version of \code{\link{subset}} to efficiently subset vectors, matrices and data frames. If the non-standard evaluation offered by \code{\link{fsubset}} is not needed, the function \code{\link{ss}} is a much faster and also more secure alternative to \code{[.data.frame}.

\item \code{\link{fsummarise}} is a much faster version of \code{dplyr::summarise} when used together with the \link[=fast-statistical-functions]{Fast Statistical Functions} and \code{\link{fgroup_by}}, with whom it also supports super fast weighted aggregation.
Expand Down
2 changes: 1 addition & 1 deletion man/psacf.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ psccf(x, y, \dots)
\details{
If \code{gscale = TRUE} data are standardized within each group (using \code{\link{fscale}}) such that the group-mean is 0 and the group-standard deviation is 1. This is strongly recommended for most panels to get rid of individual-specific heterogeneity which would corrupt the ACF computations.

After scaling, \code{psacf}, \code{pspacf} and \code{psccf} compute the ACF/CCF by creating a matrix of panel-lags of the series using \code{\link{flag}} and then correlating this matrix with the series (\code{x, y}) using \code{\link{cor}} and pairwise-complete observations. This may require a lot of memory on large data, but is done because passing a sequence of lags to \code{\link{flag}} and thus calling \code{\link{flag}} and \code{\link{cor}} one time is much faster than calling them \code{lag.max} times. The partial ACF is computed from the ACF using a Yule-Walker decomposition, in the same way as in \code{\link{pacf}}.
After scaling, \code{psacf}, \code{pspacf} and \code{psccf} compute the ACF/CCF by creating a matrix of panel-lags of the series using \code{\link{flag}} and then computing the covariance of this matrix with the series (\code{x, y}) using \code{\link{cov}} and pairwise-complete observations, and dividing by the variance (of \code{x, y}). Creating the lag matrix may require a lot of memory on large data, but passing a sequence of lags to \code{\link{flag}} and thus calling \code{\link{flag}} and \code{\link{cov}} one time is generally much faster than calling them \code{lag.max} times. The partial ACF is computed from the ACF using a Yule-Walker decomposition, in the same way as in \code{\link{pacf}}.
}
\value{
An object of class 'acf', see \code{\link{acf}}. The result is returned invisibly if \code{plot = TRUE}.}
Expand Down
57 changes: 57 additions & 0 deletions man/rowbind.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
\name{rowbind}
\alias{rowbind}
%- Also NEED an '\alias' for EACH other topic documented here.
\title{
Row-Binding Lists / Data Frame-Like Objects
}
\description{
\emph{collapse}'s version of \code{data.table::rbindlist} and \code{rbind.data.frame}. The core code is copied from \emph{data.table}, which deserves all credit for the implementation. \code{rowbind} only binds (non-nested) lists / data.frame's. For a more flexible recursive version see \code{\link{unlist2d}}. To combine lists column-wise see \code{\link{add_vars}} or \code{\link{ftransform}} (with replacement).
}
\usage{
rowbind(\dots, idcol = NULL, use.names = TRUE, fill = FALSE, id.factor = TRUE,
return = c("as.first", "data.frame", "data.table", "tibble", "list"))
}
\arguments{
\item{\dots}{a single list of list-like objects (data.frames) or comma separated objects (internally assembled using \code{list(\dots)}). Names can be supplied if \code{!is.null(idcol)}.}

\item{idcol}{character. The name of an id-column to be generated identifying the source of rows in the final object. Using \code{idcol = TRUE} will set the name to \code{".id"}. If the input list has names, these will form the content of the id column, otherwise integers are used. To save memory, it is advised to keep \code{id.factor = TRUE}.}

\item{use.names}{logical. \code{TRUE} binds by matching column name, \code{FALSE} by position. }

\item{fill}{logical. \code{TRUE} fills missing columns with NAs. When \code{TRUE}, \code{use.names} is set to \code{TRUE}.}

\item{id.factor}{logical. If \code{TRUE}, \code{!is.null(idcol)}, and the input list is named, create the id column as a factor instead of a character vector. It is also possible to specify \code{id.factor = "ordered"} to generate an ordered factor id. This is much more memory efficient than a character id, and thus enabled by default. }

\item{return}{an integer or string specifying what to return. \code{1 - "as.first"} preserves the attributes of the first element of the list, \code{2/3/4 - "data.frame"/"data.table"/"tibble"} coerces to specific objects, and \code{5 - "list"} returns a (named) list. }

}

\value{
a long list or data frame-like object formed by combining the rows / elements of the input objects. The \code{return} argument controls the exact format of the output.
}


\seealso{
\code{\link{unlist2d}}, \code{\link{add_vars}}, \code{\link{ftransform}}, \link[=fast-data-manipulation]{Data Frame Manipulation}, \link[=collapse-documentation]{Collapse Overview}
}
\examples{
# These are the same
rowbind(mtcars, mtcars)
rowbind(list(mtcars, mtcars))

# With id column
rowbind(mtcars, mtcars, idcol = "id")
rowbind(a = mtcars, b = mtcars, idcol = "id") # by default factor to save memory

# Filling up columns
rowbind(mtcars, mtcars[2:8], fill = TRUE)
}
% Add one or more standard keywords, see file 'KEYWORDS' in the
% R documentation directory (show via RShowDoc("KEYWORDS")):
\keyword{manip}
% \keyword{ ~kwd2 }
% Use only one keyword per line.
% For non-standard keywords, use \concept instead of \keyword:
% \concept{ ~cpt1 }
% \concept{ ~cpt2 }
% Use only one concept per line.
2 changes: 1 addition & 1 deletion man/select_replace_vars.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ Thus they can freely be applied to data.table's, grouped tibbles, panel data fra
In many cases functions here only check the length of the first column, which is one of the reasons why they are so fast. When lists of unequal-length columns are offered as replacements this yields a malformed data frame (which will also print a warning in the console i.e. you will notice that).
}
\seealso{
\code{\link{fsubset}}, \code{\link{ftransform}}, \link[=fast-data-manipulation]{Data Frame Manipulation}, \link[=collapse-documentation]{Collapse Overview}
\code{\link{fsubset}}, \code{\link{ftransform}}, \code{\link{rowbind}}, \link[=fast-data-manipulation]{Data Frame Manipulation}, \link[=collapse-documentation]{Collapse Overview}
}
\examples{
## World Development Data
Expand Down
9 changes: 9 additions & 0 deletions tests/testthat/test-misc.R
Original file line number Diff line number Diff line change
Expand Up @@ -356,3 +356,12 @@ test_that("fdist works properly", {
set_collapse(oldopts)
}
})

test_that("rowbind", {
  # Reference objects shared by the expectations below.
  mt_rn2    <- rep(attr(mtcars, "row.names"), 2)
  mt_rbind  <- rbind(mtcars, mtcars)
  l_plain   <- list(mtcars, mtcars)
  l_named   <- list(a = mtcars, b = mtcars)

  # Default return: row names of both inputs are kept (duplicated).
  expect_equal(
    rowbind(mtcars, mtcars),
    setRownames(mt_rbind, mt_rn2)
  )

  # A single list argument is equivalent to comma-separated inputs;
  # setRownames() is applied to both sides so row names do not matter.
  expect_equal(
    setRownames(rowbind(list(mtcars, mtcars))),
    setRownames(mt_rbind)
  )

  # Agreement with unlist2d(), without and with an id column.
  expect_equal(
    setRownames(rowbind(mtcars, mtcars)),
    unlist2d(l_plain, idcols = FALSE)
  )
  expect_equal(
    setRownames(rowbind(mtcars, mtcars, idcol = "id")),
    unlist2d(l_plain, idcols = "id")
  )

  # Named inputs: factor id by default, non-factor id with id.factor = FALSE.
  expect_equal(
    setRownames(rowbind(a = mtcars, b = mtcars, idcol = "id")),
    unlist2d(l_named, idcols = "id", id.factor = TRUE)
  )
  expect_equal(
    setRownames(rowbind(a = mtcars, b = mtcars, idcol = "id", id.factor = FALSE)),
    unlist2d(l_named, idcols = "id")
  )
})

0 comments on commit cbd32f3

Please sign in to comment.