From e5d1a6dc8b700b705ffaad529fd9a5daab834277 Mon Sep 17 00:00:00 2001 From: Vince Buffalo Date: Tue, 5 Sep 2023 12:17:23 -0700 Subject: [PATCH] new pull --all features, fixed bug with downloads not being done if dir structure didn't exist --- README.md | 22 ++++++++++++++-- src/lib/api/figshare.rs | 7 +++-- src/lib/api/zenodo.rs | 7 +++-- src/lib/data.rs | 36 ++++++++++++++++++++++++-- src/lib/download.rs | 54 +++++++++++++++++++++++++++++---------- src/lib/project.rs | 16 +++++++++--- src/lib/test_utilities.rs | 2 +- src/lib/utils.rs | 8 +++--- src/main.rs | 24 ++++++++++++++--- 9 files changed, 139 insertions(+), 37 deletions(-) diff --git a/README.md b/README.md index a96cd95..a447a5b 100644 --- a/README.md +++ b/README.md @@ -245,13 +245,13 @@ cds https://ftp.ensembl.org/pub/release-110/fasta/homo_sapiens/cds/Homo_sapiens. Note that this has a header, and the URLs are in the second column. To get this data, we'd use: ```console -$ sdf bulk human_annotation.tsv --column 1 --header +$ sdf bulk human_annotation.tsv --column 2 --header ⠁ [ ] 0/2 (0%) eta 00:00:00 ⠉ [====> ] 9071693/78889691 (11%) eta 00:01:22 ⠐ [=========> ] 13503693/54514783 (25%) eta 00:00:35 ``` -**Columns indices are zero-indexed** and `sdf bulk` assumes no headers by +**Columns indices are one-indexed** and `sdf bulk` assumes no headers by default. Note that in this example, only two files are downloading — this is because `sdf` detected the CDS file already existed. SciDataFlow tells you this with a little message at the end: @@ -263,6 +263,24 @@ $ sdf bulk human_annotation.tsv --column 1 --header 1 files were skipped because they existed (and --overwrite was no specified). ``` +Note that one can also download files from URLs that are in the Data Manifest. +Suppose that you clone a repository that has no remotes, but each file entry +has a URL set. 
Those can be retrieved with: + +```console +$ sdf pull --urls # if you want to overwrite any local files, use --overwrite +``` + +These may or may not be `tracked`; tracking only indicates whether to *also* +manage them with a remote like Zenodo or FigShare. In cases where the data file +can be reliably retrieved from a steady source (e.g. a website like the UCSC +Genome Browser or Ensembl) you may not want to duplicate it by also tracking +it. If you want to pull in *everything*, use: + +```console +$ sdf pull --all +``` + ## Adding Metadata Some data repository services like Zenodo allow data depositions to be diff --git a/src/lib/api/figshare.rs b/src/lib/api/figshare.rs index 96b548b..4947e9c 100644 --- a/src/lib/api/figshare.rs +++ b/src/lib/api/figshare.rs @@ -379,13 +379,12 @@ impl FigShareAPI { let matches_found: Vec<_> = articles.into_iter().filter(|a| a.title == self.name).collect(); if !matches_found.is_empty() { if matches_found.len() > 1 { - return Err(anyhow!("Found multiple FigShare Articles with the \ - title '{}'", self.name)); + Err(anyhow!("Found multiple FigShare Articles with the title '{}'", self.name)) } else { - return Ok(Some(matches_found[0].clone())); + Ok(Some(matches_found[0].clone())) } } else { - return Ok(None); + Ok(None) } } diff --git a/src/lib/api/zenodo.rs b/src/lib/api/zenodo.rs index 2356f22..9fca56f 100644 --- a/src/lib/api/zenodo.rs +++ b/src/lib/api/zenodo.rs @@ -268,8 +268,7 @@ impl ZenodoAPI { let mut matches_found: Vec<_> = depositions.into_iter().filter(|a| a.title == self.name).collect(); if !matches_found.is_empty() { if matches_found.len() > 1 { - return Err(anyhow!("Found multiple Zenodo Depositions with the \ - title '{}'", self.name)); + Err(anyhow!("Found multiple Zenodo Depositions with the title '{}'", self.name)) } else { // We need to do one more API call, to get the full listing // with the bucket URL. 
@@ -277,10 +276,10 @@ impl ZenodoAPI { let url = format!("deposit/depositions/{}", partial_deposition.id); let response = self.issue_request::>(Method::GET, &url, None, None).await?; let deposition: ZenodoDeposition = response.json().await?; - return Ok(Some(deposition)); + Ok(Some(deposition)) } } else { - return Ok(None); + Ok(None) } } diff --git a/src/lib/data.rs b/src/lib/data.rs index e7afaac..1ee1148 100644 --- a/src/lib/data.rs +++ b/src/lib/data.rs @@ -655,7 +655,7 @@ impl DataCollection { Some(data_file) => { // check that the file isn't empty // (this is why a path_context is needed) - let file_size = data_file.get_size(&path_context)?; + let file_size = data_file.get_size(path_context)?; if file_size == 0 { return Err(anyhow!("Cannot track an empty file, and '{}' has a file size of 0.", filepath)); } @@ -938,6 +938,38 @@ impl DataCollection { Ok(()) } + pub async fn pull_urls(&mut self, path_context: &Path, overwrite: bool) -> Result<()> { + let mut downloads = Downloads::new(); + let mut filepaths = Vec::new(); + let mut skipped = Vec::new(); + let mut num_downloaded = 0; + for data_file in self.files.values() { + if let Some(url) = &data_file.url { + let full_path = data_file.full_path(path_context)?; + let download = downloads.add(url.clone(), Some(&full_path.to_string_lossy()), overwrite)?; + if let Some(dl) = download { + let filepath = dl.filename.clone(); + filepaths.push(filepath); + num_downloaded += 1; + } else { + skipped.push(url.clone()); + } + } + } + + if num_downloaded > 0 { + println!("Downloaded:"); + } + // grab all the files + downloads.retrieve(Some(" - {}"), None, false).await?; + + let num_skipped = skipped.len(); + println!("{} files were downloaded.\n\ + {} files were skipped because they existed (and --overwrite was not specified).", + num_downloaded, num_skipped); + Ok(()) + } + // Download all files // // TODO: code redundancy with the push method's tracking of @@ -1000,7 +1032,7 @@ impl DataCollection { if do_download { if 
let Some(remote) = self.remotes.get(dir) { let download = remote.get_download_info(merged_file, path_context, overwrite)?; - downloads.list.push(download); + downloads.queue.push(download); } } } diff --git a/src/lib/download.rs b/src/lib/download.rs index e4cc209..7dd4e7f 100644 --- a/src/lib/download.rs +++ b/src/lib/download.rs @@ -1,4 +1,5 @@ use anyhow::{anyhow,Result,Context}; +use std::fs; use std::path::PathBuf; use reqwest::Url; @@ -9,10 +10,9 @@ use crate::lib::progress::{DEFAULT_PROGRESS_STYLE, DEFAULT_PROGRESS_INC}; use crate::lib::utils::pluralize; pub struct Downloads { - pub list: Vec, + pub queue: Vec, } - pub trait Downloadable { fn to_url(self) -> Result; } @@ -30,10 +30,16 @@ impl Downloadable for Url { } } +impl Default for Downloads { + fn default() -> Self { + Self::new() + } +} + impl Downloads { pub fn new() -> Self { - let list = Vec::new(); - Downloads { list } + let queue = Vec::new(); + Downloads { queue } } pub fn add(&mut self, item: T, filename: Option<&str>, @@ -55,10 +61,10 @@ impl Downloads { if file_path.exists() && !overwrite { return Ok(None); } - + let download = Download { url, filename: resolved_filename }; - self.list.push(download); - Ok(Some(self.list.last().ok_or(anyhow::anyhow!("Failed to add download"))?)) + self.queue.push(download); + Ok(Some(self.queue.last().ok_or(anyhow::anyhow!("Failed to add download"))?)) } pub fn default_style(&self) -> Result { @@ -72,17 +78,41 @@ impl Downloads { } + // Retrieve all files in the download queue. + // + // Note: if the file is in the queue, at this point it is considered *overwrite safe*. + // This is because overwrite-safety is checked at Downloads::add(), per-file. + // The trauma crate does not overwrite files; delete must be done manually here + // first if it exists. 
pub async fn retrieve(&self, success_status: Option<&str>, no_downloads_message: Option<&str>, show_total: bool) -> Result<()> { - let downloads = &self.list; + let downloads = &self.queue; let total_files = downloads.len(); if !downloads.is_empty() { + + // Let's handle the file operations: + // 1) Delete the files if they exist, since if it's in the queue, it's + // overwrite-safe. + // 2) Create the directory structure if it does not exist. + for file in downloads { + let path = PathBuf::from(&file.filename); + if path.exists() { + fs::remove_file(&path)?; + } + + if let Some(parent_dir) = path.parent() { + if !parent_dir.exists() { + fs::create_dir_all(parent_dir)?; + } + } + } + let downloader = DownloaderBuilder::new() .style_options(self.default_style()?) .build(); - downloader.download(&downloads).await; + downloader.download(downloads).await; if show_total { let punc = if total_files > 0 { "." } else { ":" }; println!("Downloaded {}{}", pluralize(total_files as u64, "file"), punc); @@ -95,10 +125,8 @@ impl Downloads { println!("{}", msg.replace("{}", &name_str.to_string_lossy())); } } - } else { - if no_downloads_message.is_some() { - println!("{}", no_downloads_message.unwrap_or("")); - } + } else if no_downloads_message.is_some() { + println!("{}", no_downloads_message.unwrap_or("")); } Ok(()) } diff --git a/src/lib/project.rs b/src/lib/project.rs index 03c321d..1813975 100644 --- a/src/lib/project.rs +++ b/src/lib/project.rs @@ -434,7 +434,8 @@ impl Project { .has_headers(header) .from_reader(file); - let column = column.unwrap_or(0) as usize; + // convert 1-indexed column to 0-indexed; first column is default + let column = column.unwrap_or(1) as usize - 1; let mut downloads = Downloads::new(); let mut filepaths = Vec::new(); @@ -495,8 +496,17 @@ impl Project { self.save() } - pub async fn pull(&mut self, overwrite: bool) -> Result<()> { - self.data.pull(&self.path_context(), overwrite).await } + pub async fn pull(&mut self, overwrite: bool, url: bool, all: bool) -> 
Result<()> { + let path_context = self.path_context(); + if all { + self.data.pull_urls(&path_context, overwrite).await?; + return self.data.pull(&path_context, overwrite).await; + } + if url { + return self.data.pull_urls(&path_context, overwrite).await; + } + self.data.pull(&path_context, overwrite).await + } pub async fn push(&mut self, overwrite: bool) -> Result<()> { self.data.push(&self.path_context(), overwrite).await diff --git a/src/lib/test_utilities.rs b/src/lib/test_utilities.rs index f60aea7..f9af536 100644 --- a/src/lib/test_utilities.rs +++ b/src/lib/test_utilities.rs @@ -2,7 +2,7 @@ use anyhow::Result; pub fn check_error(result: Result, pattern: &str) { match result { - Ok(_) => assert!(false, "Expected an error, but got Ok"), + Ok(_) => panic!("Expected an error, but got Ok"), Err(err) => { assert!(err.to_string().contains(pattern), "Unexpected error {:?} containing pattern \"{:?}\" ", diff --git a/src/lib/utils.rs b/src/lib/utils.rs index 6ed2a35..b8e4957 100644 --- a/src/lib/utils.rs +++ b/src/lib/utils.rs @@ -265,11 +265,11 @@ pub fn print_status(rows: BTreeMap>, remote: Option<&Has println!("{}", "Project data status:".bold()); let counts = get_counts(&rows).expect("Internal Error: get_counts() panicked."); println!("{} local and tracked by a remote ({} only local, {} only remote), {} total.\n", - pluralize(counts.both as u64, "file"), - pluralize(counts.local as u64, "file"), - pluralize(counts.remote as u64, "file"), + pluralize(counts.both, "file"), + pluralize(counts.local, "file"), + pluralize(counts.remote, "file"), //pluralize(counts.messy as u64, "file"), - pluralize(counts.total as u64, "file")); + pluralize(counts.total, "file")); // this brings the remote name (if there is a corresponding remote) into // the key, so the linked remote can be displayed in the status diff --git a/src/main.rs b/src/main.rs index 60d748f..e36cfc2 100644 --- a/src/main.rs +++ b/src/main.rs @@ -33,8 +33,14 @@ Some examples: Pull in data (you may want 
--overwrite): $ sdf pull + Pull in data from the URLs in the manifest only (you may want --overwrite) + $ sdf pull --urls + + Pull in data from URLs and remotes + $ sdf pull --all + Push data to a remote (you may want --overwrite): - $ sdf pull + $ sdf push Download a file from a URL and register it in the Data Manifest: $ sdf get https://ftp.ensembl.org/some/path/to/large/data.fa.gz @@ -186,12 +192,22 @@ enum Commands { }, #[structopt(name = "pull")] - /// Pull in all tracked files from the remote. + /// Pull in all tracked files from the remote. If --urls is set, + /// this will (re)-download all files (tracked or not) in that manifest + /// from their URLs. Pull { /// Overwrite local files if they exit. #[structopt(long)] overwrite: bool, + /// Pull in files from the URLs, not remotes. + #[structopt(long)] + urls: bool, + + /// Pull in files from remotes and URLs. + #[structopt(long)] + all: bool, + // multiple optional directories //directories: Vec, }, @@ -289,9 +305,9 @@ async fn run() -> Result<()> { let mut proj = Project::new()?; proj.push(*overwrite).await }, - Some(Commands::Pull { overwrite }) => { + Some(Commands::Pull { overwrite, urls, all }) => { let mut proj = Project::new()?; - proj.pull(*overwrite).await + proj.pull(*overwrite, *urls, *all).await }, Some(Commands::Metadata { title, description }) => { let mut proj = Project::new()?;