Commit c891e7c

Merge branch 'master' of github.com:stephenslab/dsc

gaow committed Dec 14, 2019
2 parents fb24bd4 + 64d1f71
Showing 7 changed files with 105 additions and 49 deletions.
49 changes: 25 additions & 24 deletions dscrutils/R/dscquery.R
@@ -39,7 +39,7 @@
#' for testing or debugging. Note that any module or module group
#' included in \code{module.output.files} must also be included in
#' \code{targets}.
#'
#' @param conditions Conditions used to filter DSC pipeline results;
#' rows in which one or more of the conditions evaluate to
#' \code{FALSE} or \code{NA} are removed from the output (removing
@@ -71,7 +71,7 @@
#' expert users for testing or to reproduce previous queries since the
#' \code{dsc-query} output file must exactly agree in the query
#' arguments, otherwise unexpected errors could occur.
#'
#' @param return.type If \code{return.type = "data.frame"}, the DSC
#' outputs are returned in a data frame; if \code{return.type =
#' "list"}, the DSC output a list. If \code{return.type = "auto"}, a
@@ -81,7 +81,7 @@
#' limitations) of each. Note that \code{return.type = "data.frame"}
#' cannot be used when one or more modules or module groups are named
#' in \code{module.output.files}.
#'
#' @param ignore.missing.files If \code{ignore.missing.files = TRUE},
#' all targets corresponding to DSC output files that cannot be found,
#' or cannot be read (e.g., because they are corrupted), will be
@@ -108,7 +108,7 @@
#' When \code{return.type = "list"}, the output is a list, with list
#' elements corresponding to the query targets. Each top-level list
#' element should have the same length.
#'
#' When \code{return.type = "auto"}, DSC outputs are extracted into
#' the columns of the data frame unless one or more outputs are large
#' or complex objects, in which case the return value is a list.
@@ -138,7 +138,7 @@
#' This function may not work in Windows.
#'
#' @seealso \code{\link{dscread}}
#'
#' @examples
#'
#' # Retrieve the number of samples ("simulate.n") and error summary
@@ -148,7 +148,7 @@
#' "dsc_result",package = "dscrutils")
#' dat1 <- dscquery(dsc.dir,
#' targets = c("simulate.n","analyze","score.error"))
#'
#' # Retrieve the results only for simulations in which the "mean" module
#' # was run. Because this is a condition for a module name, it is
#' # applied before loading the full set of results into R. Therefore,
@@ -174,7 +174,7 @@
#' targets = c("simulate.n","analyze","score.error"),
#' conditions = c("$(score.error) > 0.2",
#' "$(analyze) == 'median'"))
#'
#' # Retrieve some results from the "ash" DSC experiment. In this
#' # example, the beta estimates are vectors, so the results are
#' # extracted into a list by default.
@@ -185,7 +185,7 @@
#' targets = c("simulate.nsamp","simulate.g","shrink.mixcompdist",
#' "shrink.beta_est","shrink.pi0_est"),
#' conditions = "$(simulate.g)=='list(c(2/3,1/3),c(0,0),c(1,2))'")
#'
#' # This is the same as the previous example, but extracts the results
#' # into a data frame. Since the vectors cannot be stored in a data frame,
#' # the names of the files storing the vectors are returned instead.
@@ -197,7 +197,7 @@
#' return.type = "data.frame")
#'
#' # See also example("dscread").
#'
#' @importFrom data.table fread
#' @importFrom progress progress_bar
#'
@@ -215,7 +215,7 @@ dscquery <- function (dsc.outdir, targets = NULL, module.output.all = NULL,
# Check input argument "dsc.outdir".
if (!(is.character(dsc.outdir) & length(dsc.outdir) == 1))
stop("Argument \"dsc.outdir\" should be a character vector of length 1")

# Check and process input argument "targets".
if (!(is.character(targets) & is.vector(targets) & length(targets) > 0))
stop(paste("Argument \"targets\" should be a character vector with",
@@ -246,7 +246,7 @@ dscquery <- function (dsc.outdir, targets = NULL, module.output.all = NULL,
"\"module.output.files\" must also be included in",
"\"targets\""))
}

# Check input argument "conditions".
if (!is.null(conditions))
if (!(is.character(conditions) & is.vector(conditions) &
@@ -259,13 +259,13 @@ dscquery <- function (dsc.outdir, targets = NULL, module.output.all = NULL,
if (!(is.character(groups) & is.vector(groups) & length(groups) > 0))
stop(paste("Argument \"groups\" should be \"NULL\", or a character",
"vector with at least one element"))

# Check input argument "dsc.outfile".
if (!is.null(dsc.outfile))
if (!(is.character(dsc.outfile) & length(dsc.outfile) == 1))
stop(paste("Argument \"dsc.outfile\" should either be \"NULL\" or a",
"character vector of length 1"))

# Check and process input argument "return.type".
return.type <- match.arg(return.type)
if (return.type == "data.frame" & length(module.output.all) > 1)
@@ -276,7 +276,7 @@ dscquery <- function (dsc.outdir, targets = NULL, module.output.all = NULL,
# Check input argument "ignore.missing.files".
if (!(is.logical(ignore.missing.files) & length(ignore.missing.files) == 1))
stop("Argument \"ignore.missing.files\" should be TRUE or FALSE")

# Check input argument "exec".
if (!(is.character(exec) & length(exec) == 1))
stop("Argument \"exec\" should be a character vector of length 1")
@@ -327,7 +327,7 @@ dscquery <- function (dsc.outdir, targets = NULL, module.output.all = NULL,
"command failed (returned a non-zero exit status)"))
}
}

# IMPORT DSC QUERY RESULTS
# ------------------------
# As a safeguard, we check for any duplicated column (or list
@@ -336,7 +336,7 @@ dscquery <- function (dsc.outdir, targets = NULL, module.output.all = NULL,
class(dat) <- "data.frame"
if (any(duplicated(names(dat))))
stop("One or more names in dsc-query output are the same")

# PRE-FILTER BY CONDITIONS
# ------------------------
# Filter rows of the data frame by each condition. If one or more
@@ -440,7 +440,7 @@ dscquery <- function (dsc.outdir, targets = NULL, module.output.all = NULL,
}
dat <- as.data.frame(dat,check.names = FALSE,stringsAsFactors = FALSE)
} else if (return.type == "auto") {

# If all the outputs can be stored in a data frame, do so.
dat <- flatten.nested.list(dat)
if (all(!sapply(dat,is.list)))
@@ -458,8 +458,8 @@ dscquery <- function (dsc.outdir, targets = NULL, module.output.all = NULL,
"may be more convenient for analyzing these results"))
}
rm(dat.unextracted)

# POST-FILTER BY CONDITIONS
# -------------------------
# Filter rows of the data frame (or list) by each condition.
# This second filtering step is necessary to take care of any
@@ -570,7 +570,7 @@ read.dsc.outputs <- function (dat, dsc.outdir, ignore.missing.files, verbose) {

# Determine which columns contain names of files that should be
# read; these are columns of the form "module.variable:output". If
# there are no such columns, there is nothing to do here.
cols <- which(sapply(as.list(names(dat)),is.output.column))
if (length(cols) == 0)
return(dat)
@@ -601,7 +601,7 @@ read.dsc.outputs <- function (dat, dsc.outdir, ignore.missing.files, verbose) {
if (!is.na(j))
out[[j]][[x]] <- NA
}

# Extract the outputs.
if (verbose)
pb <-
@@ -618,7 +618,8 @@ read.dsc.outputs <- function (dat, dsc.outdir, ignore.missing.files, verbose) {
if (j == "DSC_TIME")
out[[i]][[j]] <- x$DSC_DEBUG$time$elapsed
else if (!is.element(j,names(x)))
- stop(sprintf("Variable \"%s\" unavailable in \"%s\"",j,i))
+ # https://github.com/stephenslab/dsc/issues/202
+ out[[i]][j] <- NA
else
out[[i]][j] <- list(x[[j]])
}
@@ -635,7 +636,7 @@ read.dsc.outputs <- function (dat, dsc.outdir, ignore.missing.files, verbose) {
dat[[i]][j] <- list(out[[file]][[v]])
}
}

return(dat)
}

@@ -656,7 +657,7 @@ import.dsc.output <- function (outfile, outdir, ignore.missing.files) {
flatten.nested.list <- function (x) {
n <- length(x)
for (i in 1:n)

# If all the list elements are atomic, not NULL, and scalar
# (i.e., length of 1), then the values can be "flattened" as a vector.
# If not, then there is nothing to be done.
2 changes: 1 addition & 1 deletion setup.py
@@ -53,7 +53,7 @@ def run(self):
cmdclass = cmdclass,
package_dir = {'dsc': 'src'},
install_requires = ['numpy', 'pandas>=0.24.1', 'sympy', 'numexpr',
-                    'sos>=0.20.11', 'sos-pbs>=0.20.1', 'h5py', 'PTable',
+                    'sos>=0.20.12', 'sos-pbs>=0.20.1', 'h5py', 'PTable',
'pyarrow>=0.5.0', 'sqlalchemy', 'tzlocal',
'msgpack-python']
)
19 changes: 10 additions & 9 deletions src/__main__.py
@@ -182,7 +182,8 @@ def execute(args, unknown_args):
env.logger.debug(f"Running command ``{' '.join(sys.argv)}``")
env.logger.info(f"Building execution graph & running DSC ...")
try:
- settings['error_mode'] = args.error_mode
+ if not args.error_mode == 'ignore-safe':
+     settings['error_mode'] = args.error_mode
settings['verbosity'] = args.verbosity if args.host else max(
0, args.verbosity - 1)
settings['output_dag'] = f'{db}.dot' if args.__dag__ else None
@@ -277,7 +278,7 @@ def error(self, message):
"strict": skips jobs whose input, output and code have not been changed since previous execution.
"lenient": skips jobs whose output timestamp are newer than their input.
It can be used to avoid re-run when nuisent changes are made to module scripts that should not impact results.
"existing": skips jobs whose output exists, and mark existing output as "up-to-date" for future re-runs.
"existing": skips jobs whose output exists, and mark existing output as "up-to-date" for future re-runs.
It can be used to avoid re-run completely even after file status cache have been deleted
(eg, after partially transferring output data from one location to another).
"all": skips all modules and only build meta-database required to run `dsc-query` command.
@@ -289,21 +290,21 @@ def error(self, message):
help=SUPPRESS)
mt.add_argument('-e',
metavar='option',
- choices=['default', 'ignore', 'abort'],
+ choices=['ignore-safe', 'ignore', 'abort'],
dest='error_mode',
default="default",
default="ignore-safe",
help='''How DSC responds to errors.
"abort": stop all modules immediately after an error occurs.
"ignore": ignore errors and try to complete the benchmark.
"default": stop modules with errors or has errors in their upstream modules.
"abort": stop all module instances immediately after an error occurs.
"ignore": ignore all errors and try to complete the benchmark.
"ignore-safe": only stop module instances with errors or has errors in their upstream modules.
''')
mt.add_argument('-d',
metavar="option",
choices=["obsolete", "replace", "all"],
dest='to_remove',
help='''How DSC deletes benchmark files.
Use option "all" to remove all output from the current benchmark.
"obsolete", when used without "--target", removes from output folder anything irrelevant
Use option "all" to remove all output from the current benchmark.
"obsolete", when used without "--target", removes from output folder anything irrelevant
to the most recent successful execution of the benchmark.
When used with "--target" it deletes specified files, or files from specified modules or module groups.
"replace", when used with "--target", deletes files as option "obsolete" does with "--target",
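
With this change, the new default error mode "ignore-safe" is no longer written into the runtime settings at all; only an explicit "ignore" or "abort" is passed through, so the underlying runtime presumably keeps its own default of stopping only the failing module instances and their downstream instances. A minimal sketch of that logic (the function and variable names are illustrative, not the actual settings object):

# Sketch: only a non-default error mode overrides the runtime settings.
def apply_error_mode(settings, error_mode):
    # 'ignore-safe' (the new default for -e) leaves the runtime default in place.
    if not error_mode == 'ignore-safe':
        settings['error_mode'] = error_mode
    return settings

print(apply_error_mode({}, 'ignore-safe'))  # {}
print(apply_error_mode({}, 'abort'))        # {'error_mode': 'abort'}
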
2 changes: 1 addition & 1 deletion src/dsc_translator.py
@@ -340,7 +340,7 @@ def get_input(self):
self.loop_string[1] = f'for __i__ in {self.input_vars}'
else:
if len(self.current_depends):
- self.input_string += "parameter: {0}_input_files = list\ninput: dynamic({0}_input_files)".\
+ self.input_string += "parameter: {0}_input_files = list\ninput: {0}_input_files".\
format(self.step.name)
self.input_option.append(
f'group_by = {len(self.current_depends)}')
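
The translator change drops the dynamic() wrapper around the list of input files in the generated SoS header. A small sketch of the string this branch now emits, for a hypothetical module named "analyze" (the module name is illustrative):

# Sketch of the generated input header after this change (hypothetical step name).
step_name = "analyze"
input_string = ("parameter: {0}_input_files = list\n"
                "input: {0}_input_files").format(step_name)
print(input_string)
# parameter: analyze_input_files = list
# input: analyze_input_files
# Before this change the second line read: input: dynamic(analyze_input_files)
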
4 changes: 2 additions & 2 deletions src/line.py
@@ -589,8 +589,8 @@ def expand_logic(string):
string = string.replace(' OR ', '|')
string = string.replace(' and ', '&')
string = string.replace(' AND ', '&')
- string = string.replace(' not ', '~')
- string = string.replace(' NOT ', '~')
+ string = string.replace('not ', '~')
+ string = string.replace('NOT ', '~')
quote_dict = dict()
for idx, m in enumerate(re.findall(r"\"[^\"]+\"|'[^']+'", string)):
# - Match either of the following options
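
This substitution change matters when a condition begins with "not" (or "not" follows an opening parenthesis), because the old pattern ' not ' required a leading space and left such occurrences untouched. A minimal illustration of just the operator rewriting, assuming the same chain of replacements as above (the full expand_logic() also protects quoted strings):

# Minimal sketch of the logical-operator rewriting; not the full expand_logic().
def to_pandas_ops(cond):
    for old, new in ((' or ', '|'), (' OR ', '|'),
                     (' and ', '&'), (' AND ', '&'),
                     ('not ', '~'), ('NOT ', '~')):
        cond = cond.replace(old, new)
    return cond

print(to_pandas_ops("not (method == 'ridge') and n > 100"))
# ~(method == 'ridge')&n > 100
# With the old pattern ' not ', the leading "not " would have survived.
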
35 changes: 31 additions & 4 deletions src/query_engine.py
@@ -65,13 +65,16 @@ def __init__(self, db, targets, condition=None, groups=None):
])
else:
self.depends = None
# https://github.com/stephenslab/dsc/issues/202
self.output_checklist = dict(valid={}, invalid={})
# 1. Check overlapping groups and fix the case when some module in the group has some parameter but others do not
# changes will be applied to self.data
self.groups.update(self.get_grouped_tables(groups))
self.check_overlapping_groups()
self.add_na_group_parameters()
# 2. Get query targets and conditions
self.target_tables = self.get_table_fields(self.targets)
self.check_output_variables()
self.condition, self.condition_tables = parse_filter(
condition, groups=self.groups)
# 3. only keep tables that do exist in database
@@ -146,12 +149,36 @@ def check_table_field(self, value, check_field=0):
raise DBError(f"Cannot find column ``{y}`` in table ``{k}``")
if y_low.startswith('output.'):
y_low = y_low[7:]
- if y_low not in [i.lower() for i in self.data[k]] and y_low not in [
-         i.lower() for i in self.data['.output'][k]
- ] and check_field == 1:
-     raise DBError(f"Cannot find column ``{y}`` in table ``{k}``")
+ if check_field == 1:
+     if y_low not in [i.lower() for i in self.data[k]] and y_low not in [
+             i.lower() for i in self.data['.output'][k]]:
+         try:
+             self.output_checklist['invalid'][y].append(k)
+         except Exception:
+             self.output_checklist['invalid'][y] = [k]
+     else:
+         try:
+             self.output_checklist['valid'][y].append(k)
+         except Exception:
+             self.output_checklist['valid'][y] = [k]
return

def check_output_variables(self):
    for k in self.output_checklist['invalid']:
        if k not in self.output_checklist['valid']:
            raise DBError(f"Cannot find variable ``{k}`` in module ``{', '.join(self.output_checklist['invalid'][k])}``")
        # check if the variable is in the same group
        # eg, {'valid': {'alpha': ['elastic_net'], 'beta': ['ridge', 'elastic_net']}, 'invalid': {'alpha': ['ridge']}}
        # is okay because of group {'fit': ['ridge', 'elastic_net']}
        for i in self.output_checklist['invalid'][k]:
            is_valid = []
            for j in self.output_checklist['valid'][k]:
                is_valid.extend([set([i, j]).issubset(set(s)) for g, s in self.groups.items()])
            if not any(is_valid):
                raise DBError(f"Cannot find variable ``{k}`` in module ``{i}``")
    return

@staticmethod
def get_grouped_tables(groups):
'''
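
The new check_output_variables() accepts a variable that is missing from some modules as long as each such module shares a group with a module that does provide it (the ridge/elastic_net case in the comment above). A standalone sketch of that group check, reusing the data layout from the comment (module and group names are illustrative):

# Sketch of the group-based validity check; mirrors the example in the diff comment.
groups = {'fit': ['ridge', 'elastic_net']}
checklist = {'valid':   {'alpha': ['elastic_net'], 'beta': ['ridge', 'elastic_net']},
             'invalid': {'alpha': ['ridge']}}

for var, bad_modules in checklist['invalid'].items():
    if var not in checklist['valid']:
        raise ValueError(f"Cannot find variable {var!r} in modules {bad_modules}")
    for bad in bad_modules:
        in_same_group = any(
            {bad, good} <= set(members)
            for good in checklist['valid'][var]
            for members in groups.values())
        if not in_same_group:
            raise ValueError(f"Cannot find variable {var!r} in module {bad!r}")
# No error is raised: 'alpha' is missing from 'ridge', but 'ridge' and
# 'elastic_net' share the group 'fit', so the query remains valid.
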