add table_format()

improves #24
DanChaltiel · Apr 18, 2024 · 0438b13 · 0438b13
1 parent 062bc86
commit 0438b13
Show file tree

Hide file tree

Showing 3 changed files with 106 additions and 2 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -42,6 +42,7 @@ export(reset_manual_correction)
 export(save_list)
 export(save_plotly)
 export(split_mixed_datasets)
+export(table_format)
 export(tibble)
 export(unify)
 export(waterfall_plot)

diff --git a/R/split_mixed.R b/R/split_mixed.R
@@ -1,4 +1,67 @@
 
+
+#' Identify if a dataframe has a long or a wide format
+#' 
+#' A dataset is either in the wide format or in the long format ([link](https://towardsdatascience.com/long-and-wide-formats-in-data-explained-e48d7c9a06cb)). 
+#' This function identifies the format of a dataframe with respect to a subject ID. 
+#' If a dataframe has some wide and long columns, it is considered "mixed".
+#' 
+#'
+#' @param df a dataframe
+#' @param id the identifying subject ID
+#' @param ... not used
+#' @param ignore_cols columns to ignore
+#' @param na_rm whether to consider missing values
+#' @param warn whether to warn if ID is not found
+#'
+#' @return a string value in `c("wide", "long", "mixed)`
+#' @export
+#' 
+#' @examples
+#' tm = edc_example_mixed()
+#' sapply(tm, table_format, warn=FALSE) 
+table_format = function(df, id=get_subjid_cols(), ..., 
+ ignore_cols=getOption("edc_cols_crfname", "CRFNAME"), 
+ na_rm=FALSE,
+ warn=TRUE){
+ mean_nval = .table_format(df=df, id=id, ignore_cols=ignore_cols, na_rm=na_rm, warn=warn)
+ if(is.null(mean_nval)) return(NULL)
+ if(all(mean_nval==1)) return("wide")
+ if(all(mean_nval>1)) return("long")
+ return("mixed")
+}
+
+
+#' @noRd
+#' @keywords internal
+.table_format = function(df, id=get_subjid_cols(), ..., 
+ ignore_cols=getOption("edc_cols_crfname", "CRFNAME"), 
+ na_rm=FALSE,
+ warn=TRUE){
+ check_dots_empty()
+ # .x = df
+ if(!is.data.frame(df)) return(NULL)
+ if(nrow(df)==0 || ncol(df)==0) return(NULL)
+
+ if(!any(tolower(id) %in% tolower(names(df)))){
+ if(isTRUE(warn)){
+ cli_warn("{.val {id}} was not found in {.arg df}. Returning {.val NULL}")
+ }
+ return(NULL)
+ }
+ f = if(isTRUE(na_rm)) na.omit else identity
+ mean_nval = df %>% 
+ select(subjid=any_of2(id), everything(), -any_of2(ignore_cols)) %>% 
+ summarise(across(everything(), ~length(unique(f(.x)))), 
+ .by=subjid) %>% 
+ select(-subjid) %>% 
+ map_dbl(mean)
+
+ if(any(is.na(mean_nval))) cli_abort("This should not happen, code 263467")
+ mean_nval
+}
+
+
 #' Split mixed datasets
 #' 
 #' Split mixed tables, i.e. tables that hold both long data (N values per patient) and short data (one value per patient, duplicated on N lines), into one long table and one short table.
@@ -33,7 +96,7 @@
 #' @importFrom cli cli_bullets cli_warn
 #' @importFrom dplyr across group_by select summarise summarise_all ungroup
 #' @importFrom glue glue
-#' @importFrom purrr discard imap keep keep_at list_flatten map_chr
+#' @importFrom purrr discard discard_at imap keep keep_at list_flatten map_chr
 #' @importFrom rlang check_dots_empty
 #' @importFrom tibble lst
 #' @importFrom tidyselect all_of everything
@@ -44,7 +107,7 @@ split_mixed_datasets = function(datasets=get_datasets(), id=get_subjid_cols(), .
  verbose=TRUE){
  check_dots_empty()
  if(is.data.frame(datasets)) datasets = list(datasets)
- datasets = datasets %>% keep(~is.data.frame(.x)) %>% keep_at(~.x!=".lookup")
+ datasets = datasets %>% keep(is.data.frame) %>% discard_at(".lookup")
 
  dataset_mean_nval = datasets %>% 
  imap(~{

diff --git a/man/table_format.Rd b/man/table_format.Rd