Skip to content

Commit

Permalink
add table_format()
Browse files Browse the repository at this point in the history
improves #24
  • Loading branch information
DanChaltiel committed Apr 18, 2024
1 parent 062bc86 commit 0438b13
Show file tree
Hide file tree
Showing 3 changed files with 106 additions and 2 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ export(reset_manual_correction)
export(save_list)
export(save_plotly)
export(split_mixed_datasets)
export(table_format)
export(tibble)
export(unify)
export(waterfall_plot)
Expand Down
67 changes: 65 additions & 2 deletions R/split_mixed.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,67 @@


#' Identify if a dataframe has a long or a wide format
#'
#' A dataset is either in the wide format or in the long format ([link](https://towardsdatascience.com/long-and-wide-formats-in-data-explained-e48d7c9a06cb)).
#' This function identifies the format of a dataframe with respect to a subject ID.
#' If a dataframe has some wide and long columns, it is considered "mixed".
#'
#'
#' @param df a dataframe
#' @param id the identifying subject ID
#' @param ... not used
#' @param ignore_cols columns to ignore
#' @param na_rm whether to consider missing values
#' @param warn whether to warn if ID is not found
#'
#' @return a string value in `c("wide", "long", "mixed)`
#' @export
#'
#' @examples
#' tm = edc_example_mixed()
#' sapply(tm, table_format, warn=FALSE)
table_format = function(df, id=get_subjid_cols(), ...,
ignore_cols=getOption("edc_cols_crfname", "CRFNAME"),
na_rm=FALSE,
warn=TRUE){
mean_nval = .table_format(df=df, id=id, ignore_cols=ignore_cols, na_rm=na_rm, warn=warn)
if(is.null(mean_nval)) return(NULL)
if(all(mean_nval==1)) return("wide")
if(all(mean_nval>1)) return("long")
return("mixed")
}


#' @noRd
#' @keywords internal
.table_format = function(df, id=get_subjid_cols(), ...,
ignore_cols=getOption("edc_cols_crfname", "CRFNAME"),
na_rm=FALSE,
warn=TRUE){
check_dots_empty()
# .x = df
if(!is.data.frame(df)) return(NULL)
if(nrow(df)==0 || ncol(df)==0) return(NULL)

if(!any(tolower(id) %in% tolower(names(df)))){
if(isTRUE(warn)){
cli_warn("{.val {id}} was not found in {.arg df}. Returning {.val NULL}")
}
return(NULL)
}
f = if(isTRUE(na_rm)) na.omit else identity
mean_nval = df %>%
select(subjid=any_of2(id), everything(), -any_of2(ignore_cols)) %>%
summarise(across(everything(), ~length(unique(f(.x)))),
.by=subjid) %>%
select(-subjid) %>%
map_dbl(mean)

if(any(is.na(mean_nval))) cli_abort("This should not happen, code 263467")
mean_nval
}


#' Split mixed datasets
#'
#' Split mixed tables, i.e. tables that hold both long data (N values per patient) and short data (one value per patient, duplicated on N lines), into one long table and one short table.
Expand Down Expand Up @@ -33,7 +96,7 @@
#' @importFrom cli cli_bullets cli_warn
#' @importFrom dplyr across group_by select summarise summarise_all ungroup
#' @importFrom glue glue
#' @importFrom purrr discard imap keep keep_at list_flatten map_chr
#' @importFrom purrr discard discard_at imap keep keep_at list_flatten map_chr
#' @importFrom rlang check_dots_empty
#' @importFrom tibble lst
#' @importFrom tidyselect all_of everything
Expand All @@ -44,7 +107,7 @@ split_mixed_datasets = function(datasets=get_datasets(), id=get_subjid_cols(), .
verbose=TRUE){
check_dots_empty()
if(is.data.frame(datasets)) datasets = list(datasets)
datasets = datasets %>% keep(~is.data.frame(.x)) %>% keep_at(~.x!=".lookup")
datasets = datasets %>% keep(is.data.frame) %>% discard_at(".lookup")

dataset_mean_nval = datasets %>%
imap(~{
Expand Down
40 changes: 40 additions & 0 deletions man/table_format.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 0438b13

Please sign in to comment.