Commit c891e7c

Merge branch 'master' of github.com:stephenslab/dsc

gaow committed Dec 14, 2019
2 parents fb24bd4 + 64d1f71
Showing 7 changed files with 105 additions and 49 deletions.
49 changes: 25 additions & 24 deletions dscrutils/R/dscquery.R
@@ -39,7 +39,7 @@
#' for testing or debugging. Note that any module or module group
#' included in \code{module.output.files} must also be included in
#' \code{targets}.
#'
#' @param conditions Conditions used to filter DSC pipeline results;
#' rows in which one or more of the conditions evaluate to
#' \code{FALSE} or \code{NA} are removed from the output (removing
@@ -71,7 +71,7 @@
#' expert users for testing or to reproduce previous queries since the
#' \code{dsc-query} output file must exactly agree in the query
#' arguments, otherwise unexpected errors could occur.
#'
#' @param return.type If \code{return.type = "data.frame"}, the DSC
#' outputs are returned in a data frame; if \code{return.type =
#' "list"}, the DSC output a list. If \code{return.type = "auto"}, a
@@ -81,7 +81,7 @@
#' limitations) of each. Note that \code{return.type = "data.frame"}
#' cannot be used when one or more modules or module groups are named
#' in \code{module.output.files}.
#'
#' @param ignore.missing.files If \code{ignore.missing.files = TRUE},
#' all targets corresponding to DSC output files that cannot be found,
#' or cannot be read (e.g., because they are corrupted), will be
@@ -108,7 +108,7 @@
#' When \code{return.type = "list"}, the output is a list, with list
#' elements corresponding to the query targets. Each top-level list
#' element should have the same length.
#'
#' When \code{return.type = "auto"}, DSC outputs are extracted into
#' the columns of the data frame unless one or more outputs are large
#' or complex objects, in which case the return value is a list.
@@ -138,7 +138,7 @@
#' This function may not work in Windows.
#'
#' @seealso \code{\link{dscread}}
#'
#' @examples
#'
#' # Retrieve the number of samples ("simulate.n") and error summary
@@ -148,7 +148,7 @@
#' "dsc_result",package = "dscrutils")
#' dat1 <- dscquery(dsc.dir,
#' targets = c("simulate.n","analyze","score.error"))
#'
#' # Retrieve the results only for simulations in which the "mean" module
#' # was run. Because this is a condition for a module name, it is
#' # applied before loading the full set of results into R. Therefore,
@@ -174,7 +174,7 @@
#' targets = c("simulate.n","analyze","score.error"),
#' conditions = c("$(score.error) > 0.2",
#' "$(analyze) == 'median'"))
#'
#' # Retrieve some results from the "ash" DSC experiment. In this
#' # example, the beta estimates are vectors, so the results are
#' # extracted into a list by default.
@@ -185,7 +185,7 @@
#' targets = c("simulate.nsamp","simulate.g","shrink.mixcompdist",
#' "shrink.beta_est","shrink.pi0_est"),
#' conditions = "$(simulate.g)=='list(c(2/3,1/3),c(0,0),c(1,2))'")
#'
#' # This is the same as the previous example, but extracts the results
#' # into a data frame. Since the vectors cannot be stored in a data frame,
#' # the names of the files storing the vectors are returned instead.
@@ -197,7 +197,7 @@
#' return.type = "data.frame")
#'
#' # See also example("dscread").
#'
#' @importFrom data.table fread
#' @importFrom progress progress_bar
#'
@@ -215,7 +215,7 @@ dscquery <- function (dsc.outdir, targets = NULL, module.output.all = NULL,
# Check input argument "dsc.outdir".
if (!(is.character(dsc.outdir) & length(dsc.outdir) == 1))
stop("Argument \"dsc.outdir\" should be a character vector of length 1")

# Check and process input argument "targets".
if (!(is.character(targets) & is.vector(targets) & length(targets) > 0))
stop(paste("Argument \"targets\" should be a character vector with",
@@ -246,7 +246,7 @@ dscquery <- function (dsc.outdir, targets = NULL, module.output.all = NULL,
"\"module.output.files\" must also be included in",
"\"targets\""))
}

# Check input argument "conditions".
if (!is.null(conditions))
if (!(is.character(conditions) & is.vector(conditions) &
@@ -259,13 +259,13 @@ dscquery <- function (dsc.outdir, targets = NULL, module.output.all = NULL,
if (!(is.character(groups) & is.vector(groups) & length(groups) > 0))
stop(paste("Argument \"groups\" should be \"NULL\", or a character",
"vector with at least one element"))

# Check input argument "dsc.outfile".
if (!is.null(dsc.outfile))
if (!(is.character(dsc.outfile) & length(dsc.outfile) == 1))
stop(paste("Argument \"dsc.outfile\" should either be \"NULL\" or a",
"character vector of length 1"))

# Check and process input argument "return.type".
return.type <- match.arg(return.type)
if (return.type == "data.frame" & length(module.output.all) > 1)
@@ -276,7 +276,7 @@ dscquery <- function (dsc.outdir, targets = NULL, module.output.all = NULL,
# Check input argument "ignore.missing.files".
if (!(is.logical(ignore.missing.files) & length(ignore.missing.files) == 1))
stop("Argument \"ignore.missing.files\" should be TRUE or FALSE")

# Check input argument "exec".
if (!(is.character(exec) & length(exec) == 1))
stop("Argument \"exec\" should be a character vector of length 1")
@@ -327,7 +327,7 @@ dscquery <- function (dsc.outdir, targets = NULL, module.output.all = NULL,
"command failed (returned a non-zero exit status)"))
}
}

# IMPORT DSC QUERY RESULTS
# ------------------------
# As a safeguard, we check for any duplicated column (or list
@@ -336,7 +336,7 @@ dscquery <- function (dsc.outdir, targets = NULL, module.output.all = NULL,
class(dat) <- "data.frame"
if (any(duplicated(names(dat))))
stop("One or more names in dsc-query output are the same")

# PRE-FILTER BY CONDITIONS
# ------------------------
# Filter rows of the data frame by each condition. If one or more
@@ -440,7 +440,7 @@ dscquery <- function (dsc.outdir, targets = NULL, module.output.all = NULL,
}
dat <- as.data.frame(dat,check.names = FALSE,stringsAsFactors = FALSE)
} else if (return.type == "auto") {

# If all the outputs can be stored in a data frame, do so.
dat <- flatten.nested.list(dat)
if (all(!sapply(dat,is.list)))
@@ -458,8 +458,8 @@ dscquery <- function (dsc.outdir, targets = NULL, module.output.all = NULL,
"may be more convenient for analyzing these results"))
}
rm(dat.unextracted)

# POST-FILTER BY CONDITIONS
# -------------------------
# Filter rows of the data frame (or list) by each condition.
# This second filtering step is necessary to take care of any
@@ -570,7 +570,7 @@ read.dsc.outputs <- function (dat, dsc.outdir, ignore.missing.files, verbose) {

# Determine which columns contain names of files that should be
# read; these are columns of the form "module.variable:output". If
# there are no such columns, there is nothing to do here.
cols <- which(sapply(as.list(names(dat)),is.output.column))
if (length(cols) == 0)
return(dat)
@@ -601,7 +601,7 @@ read.dsc.outputs <- function (dat, dsc.outdir, ignore.missing.files, verbose) {
if (!is.na(j))
out[[j]][[x]] <- NA
}

# Extract the outputs.
if (verbose)
pb <-
@@ -618,7 +618,8 @@ read.dsc.outputs <- function (dat, dsc.outdir, ignore.missing.files, verbose) {
if (j == "DSC_TIME")
out[[i]][[j]] <- x$DSC_DEBUG$time$elapsed
else if (!is.element(j,names(x)))
- stop(sprintf("Variable \"%s\" unavailable in \"%s\"",j,i))
+ # https://github.com/stephenslab/dsc/issues/202
+ out[[i]][j] <- NA
else
out[[i]][j] <- list(x[[j]])
}
@@ -635,7 +636,7 @@ read.dsc.outputs <- function (dat, dsc.outdir, ignore.missing.files, verbose) {
dat[[i]][j] <- list(out[[file]][[v]])
}
}

return(dat)
}

@@ -656,7 +657,7 @@ import.dsc.output <- function (outfile, outdir, ignore.missing.files) {
flatten.nested.list <- function (x) {
n <- length(x)
for (i in 1:n)

# If all the list elements are atomic, not NULL, and scalar
# (i.e., length of 1), then the values can be "flattened" as a vector.
# If not, then there is nothing to be done.
2 changes: 1 addition & 1 deletion setup.py
@@ -53,7 +53,7 @@ def run(self):
cmdclass = cmdclass,
package_dir = {'dsc': 'src'},
install_requires = ['numpy', 'pandas>=0.24.1', 'sympy', 'numexpr',
-                    'sos>=0.20.11', 'sos-pbs>=0.20.1', 'h5py', 'PTable',
+                    'sos>=0.20.12', 'sos-pbs>=0.20.1', 'h5py', 'PTable',
'pyarrow>=0.5.0', 'sqlalchemy', 'tzlocal',
'msgpack-python']
)
19 changes: 10 additions & 9 deletions src/__main__.py
@@ -182,7 +182,8 @@ def execute(args, unknown_args):
env.logger.debug(f"Running command ``{' '.join(sys.argv)}``")
env.logger.info(f"Building execution graph & running DSC ...")
try:
- settings['error_mode'] = args.error_mode
+ if not args.error_mode == 'ignore-safe':
+     settings['error_mode'] = args.error_mode
settings['verbosity'] = args.verbosity if args.host else max(
0, args.verbosity - 1)
settings['output_dag'] = f'{db}.dot' if args.__dag__ else None
@@ -277,7 +278,7 @@ def error(self, message):
"strict": skips jobs whose input, output and code have not been changed since previous execution.
"lenient": skips jobs whose output timestamp are newer than their input.
It can be used to avoid re-run when nuisent changes are made to module scripts that should not impact results.
"existing": skips jobs whose output exists, and mark existing output as "up-to-date" for future re-runs.
"existing": skips jobs whose output exists, and mark existing output as "up-to-date" for future re-runs.
It can be used to avoid re-run completely even after file status cache have been deleted
(eg, after partially transferring output data from one location to another).
"all": skips all modules and only build meta-database required to run `dsc-query` command.
@@ -289,21 +290,21 @@ def error(self, message):
help=SUPPRESS)
mt.add_argument('-e',
metavar='option',
- choices=['default', 'ignore', 'abort'],
+ choices=['ignore-safe', 'ignore', 'abort'],
dest='error_mode',
default="default",
default="ignore-safe",
help='''How DSC responds to errors.
"abort": stop all modules immediately after an error occurs.
"ignore": ignore errors and try to complete the benchmark.
"default": stop modules with errors or has errors in their upstream modules.
"abort": stop all module instances immediately after an error occurs.
"ignore": ignore all errors and try to complete the benchmark.
"ignore-safe": only stop module instances with errors or has errors in their upstream modules.
''')
mt.add_argument('-d',
metavar="option",
choices=["obsolete", "replace", "all"],
dest='to_remove',
help='''How DSC deletes benchmark files.
Use option "all" to remove all output from the current benchmark.
"obsolete", when used without "--target", removes from output folder anything irrelevant
Use option "all" to remove all output from the current benchmark.
"obsolete", when used without "--target", removes from output folder anything irrelevant
to the most recent successful execution of the benchmark.
When used with "--target" it deletes specified files, or files from specified modules or module groups.
"replace", when used with "--target", deletes files as option "obsolete" does with "--target",
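
With this change, the new default error mode "ignore-safe" is no longer written into the runtime settings at all; only an explicit "ignore" or "abort" is passed through, so the underlying runtime presumably keeps its own default of stopping only the failing module instances and their downstream instances. A minimal sketch of that logic (the function and variable names are illustrative, not the actual settings object):

# Sketch: only a non-default error mode overrides the runtime settings.
def apply_error_mode(settings, error_mode):
    # 'ignore-safe' (the new default for -e) leaves the runtime default in place.
    if not error_mode == 'ignore-safe':
        settings['error_mode'] = error_mode
    return settings

print(apply_error_mode({}, 'ignore-safe'))  # {}
print(apply_error_mode({}, 'abort'))        # {'error_mode': 'abort'}
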
2 changes: 1 addition & 1 deletion src/dsc_translator.py
@@ -340,7 +340,7 @@ def get_input(self):
self.loop_string[1] = f'for __i__ in {self.input_vars}'
else:
if len(self.current_depends):
- self.input_string += "parameter: {0}_input_files = list\ninput: dynamic({0}_input_files)".\
+ self.input_string += "parameter: {0}_input_files = list\ninput: {0}_input_files".\
format(self.step.name)
self.input_option.append(
f'group_by = {len(self.current_depends)}')
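
The translator change drops the dynamic() wrapper around the list of input files in the generated SoS header. A small sketch of the string this branch now emits, for a hypothetical module named "analyze" (the module name is illustrative):

# Sketch of the generated input header after this change (hypothetical step name).
step_name = "analyze"
input_string = ("parameter: {0}_input_files = list\n"
                "input: {0}_input_files").format(step_name)
print(input_string)
# parameter: analyze_input_files = list
# input: analyze_input_files
# Before this change the second line read: input: dynamic(analyze_input_files)
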
4 changes: 2 additions & 2 deletions src/line.py
@@ -589,8 +589,8 @@ def expand_logic(string):
string = string.replace(' OR ', '|')
string = string.replace(' and ', '&')
string = string.replace(' AND ', '&')
- string = string.replace(' not ', '~')
- string = string.replace(' NOT ', '~')
+ string = string.replace('not ', '~')
+ string = string.replace('NOT ', '~')
quote_dict = dict()
for idx, m in enumerate(re.findall(r"\"[^\"]+\"|'[^']+'", string)):
# - Match either of the following options
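
This substitution change matters when a condition begins with "not" (or "not" follows an opening parenthesis), because the old pattern ' not ' required a leading space and left such occurrences untouched. A minimal illustration of just the operator rewriting, assuming the same chain of replacements as above (the full expand_logic() also protects quoted strings):

# Minimal sketch of the logical-operator rewriting; not the full expand_logic().
def to_pandas_ops(cond):
    for old, new in ((' or ', '|'), (' OR ', '|'),
                     (' and ', '&'), (' AND ', '&'),
                     ('not ', '~'), ('NOT ', '~')):
        cond = cond.replace(old, new)
    return cond

print(to_pandas_ops("not (method == 'ridge') and n > 100"))
# ~(method == 'ridge')&n > 100
# With the old pattern ' not ', the leading "not " would have survived.
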
35 changes: 31 additions & 4 deletions src/query_engine.py
@@ -65,13 +65,16 @@ def __init__(self, db, targets, condition=None, groups=None):
])
else:
self.depends = None
# https://github.com/stephenslab/dsc/issues/202
self.output_checklist = dict(valid={}, invalid={})
# 1. Check overlapping groups and fix the case when some module in the group has some parameter but others do not
# changes will be applied to self.data
self.groups.update(self.get_grouped_tables(groups))
self.check_overlapping_groups()
self.add_na_group_parameters()
# 2. Get query targets and conditions
self.target_tables = self.get_table_fields(self.targets)
self.check_output_variables()
self.condition, self.condition_tables = parse_filter(
condition, groups=self.groups)
# 3. only keep tables that do exist in database
@@ -146,12 +149,36 @@ def check_table_field(self, value, check_field=0):
raise DBError(f"Cannot find column ``{y}`` in table ``{k}``")
if y_low.startswith('output.'):
y_low = y_low[7:]
- if y_low not in [i.lower() for i in self.data[k]] and y_low not in [
-         i.lower() for i in self.data['.output'][k]
- ] and check_field == 1:
-     raise DBError(f"Cannot find column ``{y}`` in table ``{k}``")
+ if check_field == 1:
+     if y_low not in [i.lower() for i in self.data[k]] and y_low not in [
+             i.lower() for i in self.data['.output'][k]]:
+         try:
+             self.output_checklist['invalid'][y].append(k)
+         except Exception:
+             self.output_checklist['invalid'][y] = [k]
+     else:
+         try:
+             self.output_checklist['valid'][y].append(k)
+         except Exception:
+             self.output_checklist['valid'][y] = [k]
return

def check_output_variables(self):
    for k in self.output_checklist['invalid']:
        if k not in self.output_checklist['valid']:
            raise DBError(f"Cannot find variable ``{k}`` in module ``{', '.join(self.output_checklist['invalid'][k])}``")
        # check if the variable is in the same group
        # eg, {'valid': {'alpha': ['elastic_net'], 'beta': ['ridge', 'elastic_net']}, 'invalid': {'alpha': ['ridge']}}
        # is okay because of group {'fit': ['ridge', 'elastic_net']}
        for i in self.output_checklist['invalid'][k]:
            is_valid = []
            for j in self.output_checklist['valid'][k]:
                is_valid.extend([set([i, j]).issubset(set(s)) for g, s in self.groups.items()])
            if not any(is_valid):
                raise DBError(f"Cannot find variable ``{k}`` in module ``{i}``")
    return

@staticmethod
def get_grouped_tables(groups):
'''
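
The new check_output_variables() accepts a variable that is missing from some modules as long as each such module shares a group with a module that does provide it (the ridge/elastic_net case in the comment above). A standalone sketch of that group check, reusing the data layout from the comment (module and group names are illustrative):

# Sketch of the group-based validity check; mirrors the example in the diff comment.
groups = {'fit': ['ridge', 'elastic_net']}
checklist = {'valid':   {'alpha': ['elastic_net'], 'beta': ['ridge', 'elastic_net']},
             'invalid': {'alpha': ['ridge']}}

for var, bad_modules in checklist['invalid'].items():
    if var not in checklist['valid']:
        raise ValueError(f"Cannot find variable {var!r} in modules {bad_modules}")
    for bad in bad_modules:
        in_same_group = any(
            {bad, good} <= set(members)
            for good in checklist['valid'][var]
            for members in groups.values())
        if not in_same_group:
            raise ValueError(f"Cannot find variable {var!r} in module {bad!r}")
# No error is raised: 'alpha' is missing from 'ridge', but 'ridge' and
# 'elastic_net' share the group 'fit', so the query remains valid.
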