diff --git a/Cargo.lock b/Cargo.lock index 1e7b5d6..2b69ffe 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2015,7 +2015,7 @@ dependencies = [ [[package]] name = "scidataflow" -version = "0.8.2" +version = "0.8.4" dependencies = [ "anyhow", "chrono", diff --git a/Cargo.toml b/Cargo.toml index 1714f2a..35287e9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "scidataflow" -version = "0.8.3" +version = "0.8.4" edition = "2021" exclude = ["logo.png", "tests/test_data/**"] license = "MIT" diff --git a/README.md b/README.md index b62bbca..32cc1cd 100644 --- a/README.md +++ b/README.md @@ -306,6 +306,23 @@ kept in the Data Manifest. You can set this manually with: $ sdf metadata --title "genomics_analysis" --description "A re-analysis of Joan's data." ``` +## SciDataFlow Assets + +Good scientific workflows should create shareable **Scientific Assets** that +are *trivial* to download and build upon in your own scientific work. +SciDataFlow makes this possible, since in essence each `data_manifest.yml` file +is like a minimal recipe that also specifies how to *retrieve* data. The +`sdf asset` command simply downloads a `data_manifest.yml` from +SciDataFlow-Assets, another GitHub repository, or a URL. After this is +downloaded, all files can be retrieved in one line: + + $ sdf asset nygc_gatk_1000G_highcov + $ sdf pull --all + +The idea of SciDataFlow-Assets is to have an open, user-curated collection of +these recipes at https://github.com/scidataflow-assets. Please contribute +an Asset when you release new data with a paper! 
+ ## SciDataFlow's Vision The larger vision of SciDataFlow is to change how data flows through scientific diff --git a/src/lib.rs b/src/lib.rs index 187fbe8..17fefee 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -11,6 +11,7 @@ pub mod lib { pub mod macros; pub mod remote; pub mod utils; + pub mod assets; pub mod test_utilities; } diff --git a/src/main.rs b/src/main.rs index 6eeb0a8..6fb159e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,7 +1,11 @@ +use std::path::Path; + use clap::{Parser, Subcommand}; -use anyhow::Result; +use anyhow::{Result, anyhow}; #[allow(unused_imports)] use log::{info, trace, debug}; +use scidataflow::lib::assets::GitHubRepo; +use scidataflow::lib::download::Downloads; use tokio::runtime::Builder; @@ -10,6 +14,8 @@ use scidataflow::logging_setup::setup; pub mod logging_setup; +const SDF_ASSET_URL: &str = "https://github.com/scidataflow-assets"; + const INFO: &str = "\ SciDataFlow: Manage and Share Scientific Data usage: sdf [--help] @@ -142,7 +148,17 @@ enum Commands { #[arg(required = true)] filenames: Vec, }, - + /// Retrieve a SciDataFlow Asset + Asset { + /// A GitHub link + #[arg(long)] + github: Option, + /// A URL to a data_manifest.yml file + #[arg(long)] + url: Option, + /// A SciDataFlow Asset name + asset: Option + }, /// Link a directory to a remote storage solution. Link { /// Directory to link to remote storage. 
@@ -310,6 +326,33 @@ async fn run() -> Result<()> { let mut proj = Project::new()?; proj.set_metadata(title, description) }, + Some(Commands::Asset { github, url, asset }) => { + if Path::new("data_manifest.yml").exists() { + return Err(anyhow!("data_manifest.yml already exists in the current directory; delete it manually first to use sdf asset.")); + } + let msg = "Set either --github, --url, or specify an SciDataFlow Asset name."; + let url = match (github, url, asset) { + (Some(gh), None, None) => { + let gh = GitHubRepo::new(&gh).map_err(|e| { + anyhow!("GitHubRepo initialization failed: {}", e) + })?; + gh.url("data_manifest.yml") + }, + (None, None, Some(asset)) => { + let url = format!("{}/{}", SDF_ASSET_URL, asset); + let gh = GitHubRepo::new(&url).expect("Internal Error: invalid Asset URL; please report."); + gh.url("data_manifest.yml") + }, + (None, Some(url), None) => { + url.to_string() + }, + _ => return Err(anyhow!(msg)) + }; + let mut downloads = Downloads::new(); + downloads.add(url.clone(), None, false)?; + downloads.retrieve(None, None, false).await?; + Ok(()) + }, None => { println!("{}\n", INFO); std::process::exit(1);