Skip to content

Commit

Permalink
new pull --all features, fixed bug with downloads not being done if d…
Browse files Browse the repository at this point in the history
…ir structure didn't exist
  • Loading branch information
vsbuffalo committed Sep 5, 2023
1 parent e4d0782 commit e5d1a6d
Show file tree
Hide file tree
Showing 9 changed files with 139 additions and 37 deletions.
22 changes: 20 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -245,13 +245,13 @@ cds https://ftp.ensembl.org/pub/release-110/fasta/homo_sapiens/cds/Homo_sapiens.
Note that this has a header, and the URLs are in the second column. To get this data, we'd use:

```console
$ sdf bulk human_annotation.tsv --column 1 --header
$ sdf bulk human_annotation.tsv --column 2 --header
⠁ [ ] 0/2 (0%) eta 00:00:00
⠉ [====> ] 9071693/78889691 (11%) eta 00:01:22
⠐ [=========> ] 13503693/54514783 (25%) eta 00:00:35
```

**Columns indices are zero-indexed** and `sdf bulk` assumes no headers by
**Column indices are one-indexed** and `sdf bulk` assumes no headers by
default. Note that in this example, only two files are downloading — this is
because `sdf` detected the CDS file already existed. SciDataFlow tells you this
with a little message at the end:
Expand All @@ -263,6 +263,24 @@ $ sdf bulk human_annotation.tsv --column 1 --header
1 files were skipped because they existed (and --overwrite was not specified).
```

Note that one can also download files from URLs that are in the Data Manifest.
Suppose that you clone a repository that has no remotes, but each file entry
has a URL set. Those can be retrieved with:

```console
$ sdf pull --urls  # if you want to overwrite any local files, use --overwrite
```

These may or may not be `tracked`; tracking only indicates whether to *also*
manage them with a remote like Zenodo or FigShare. In cases where the data file
can be reliably retrieved from a stable source (e.g. a website like the UCSC
Genome Browser or Ensembl) you may not want to duplicate it by also tracking
it. If you want to pull in *everything*, use:

```console
$ sdf pull --all
```

## Adding Metadata

Some data repository services like Zenodo allow data depositions to be
Expand Down
7 changes: 3 additions & 4 deletions src/lib/api/figshare.rs
Original file line number Diff line number Diff line change
Expand Up @@ -379,13 +379,12 @@ impl FigShareAPI {
let matches_found: Vec<_> = articles.into_iter().filter(|a| a.title == self.name).collect();
if !matches_found.is_empty() {
if matches_found.len() > 1 {
return Err(anyhow!("Found multiple FigShare Articles with the \
title '{}'", self.name));
Err(anyhow!("Found multiple FigShare Articles with the title '{}'", self.name))
} else {
return Ok(Some(matches_found[0].clone()));
Ok(Some(matches_found[0].clone()))
}
} else {
return Ok(None);
Ok(None)
}
}

Expand Down
7 changes: 3 additions & 4 deletions src/lib/api/zenodo.rs
Original file line number Diff line number Diff line change
Expand Up @@ -268,19 +268,18 @@ impl ZenodoAPI {
let mut matches_found: Vec<_> = depositions.into_iter().filter(|a| a.title == self.name).collect();
if !matches_found.is_empty() {
if matches_found.len() > 1 {
return Err(anyhow!("Found multiple Zenodo Depositions with the \
title '{}'", self.name));
Err(anyhow!("Found multiple Zenodo Depositions with the title '{}'", self.name))
} else {
// We need to do one more API call, to get the full listing
// with the bucket URL.
let partial_deposition = matches_found.remove(0);
let url = format!("deposit/depositions/{}", partial_deposition.id);
let response = self.issue_request::<HashMap<String,String>>(Method::GET, &url, None, None).await?;
let deposition: ZenodoDeposition = response.json().await?;
return Ok(Some(deposition));
Ok(Some(deposition))
}
} else {
return Ok(None);
Ok(None)
}
}

Expand Down
36 changes: 34 additions & 2 deletions src/lib/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -655,7 +655,7 @@ impl DataCollection {
Some(data_file) => {
// check that the file isn't empty
// (this is why a path_context is needed)
let file_size = data_file.get_size(&path_context)?;
let file_size = data_file.get_size(path_context)?;
if file_size == 0 {
return Err(anyhow!("Cannot track an empty file, and '{}' has a file size of 0.", filepath));
}
Expand Down Expand Up @@ -938,6 +938,38 @@ impl DataCollection {
Ok(())
}

pub async fn pull_urls(&mut self, path_context: &Path, overwrite: bool) -> Result<()> {
let mut downloads = Downloads::new();
let mut filepaths = Vec::new();
let mut skipped = Vec::new();
let mut num_downloaded = 0;
for data_file in self.files.values() {
if let Some(url) = &data_file.url {
let full_path = data_file.full_path(path_context)?;
let download = downloads.add(url.clone(), Some(&full_path.to_string_lossy()), overwrite)?;
if let Some(dl) = download {
let filepath = dl.filename.clone();
filepaths.push(filepath);
num_downloaded += 1;
} else {
skipped.push(url.clone());
}
}
}

if num_downloaded > 0 {
println!("Downloaded:");
}
// grab all the files
downloads.retrieve(Some(" - {}"), None, false).await?;

let num_skipped = skipped.len();
println!("{} files were downloaded.\n\
{} files were skipped because they existed (and --overwrite was not specified).",
num_downloaded, num_skipped);
Ok(())
}

// Download all files
//
// TODO: code redundancy with the push method's tracking of
Expand Down Expand Up @@ -1000,7 +1032,7 @@ impl DataCollection {
if do_download {
if let Some(remote) = self.remotes.get(dir) {
let download = remote.get_download_info(merged_file, path_context, overwrite)?;
downloads.list.push(download);
downloads.queue.push(download);
}
}
}
Expand Down
54 changes: 41 additions & 13 deletions src/lib/download.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use anyhow::{anyhow,Result,Context};
use std::fs;
use std::path::PathBuf;
use reqwest::Url;

Expand All @@ -9,10 +10,9 @@ use crate::lib::progress::{DEFAULT_PROGRESS_STYLE, DEFAULT_PROGRESS_INC};
use crate::lib::utils::pluralize;

pub struct Downloads {
pub list: Vec<Download>,
pub queue: Vec<Download>,
}


pub trait Downloadable {
fn to_url(self) -> Result<Url>;
}
Expand All @@ -30,10 +30,16 @@ impl Downloadable for Url {
}
}

impl Default for Downloads {
fn default() -> Self {
Self::new()
}
}

impl Downloads {
pub fn new() -> Self {
let list = Vec::new();
Downloads { list }
let queue = Vec::new();
Downloads { queue }
}

pub fn add<T: Downloadable>(&mut self, item: T, filename: Option<&str>,
Expand All @@ -55,10 +61,10 @@ impl Downloads {
if file_path.exists() && !overwrite {
return Ok(None);
}

let download = Download { url, filename: resolved_filename };
self.list.push(download);
Ok(Some(self.list.last().ok_or(anyhow::anyhow!("Failed to add download"))?))
self.queue.push(download);
Ok(Some(self.queue.last().ok_or(anyhow::anyhow!("Failed to add download"))?))
}

pub fn default_style(&self) -> Result<StyleOptions> {
Expand All @@ -72,17 +78,41 @@ impl Downloads {
}


// Retrieve all files in the download queue.
//
// Note: if the file is in the queue, at this point it is considered *overwrite safe*.
// This is because overwrite-safety is checked at Downloads::add(), per-file.
// The trauma crate does not overwrite files; delete must be done manually here
// first if it exists.
pub async fn retrieve(&self,
success_status: Option<&str>,
no_downloads_message: Option<&str>,
show_total: bool) -> Result<()> {
let downloads = &self.list;
let downloads = &self.queue;
let total_files = downloads.len();
if !downloads.is_empty() {

// Let's handle the file operations:
// 1) Delete the files if they exist, since if it's in the queue, it's
// overwrite-safe.
// 2) Create the directory structure if it does not exist.
for file in downloads {
let path = PathBuf::from(&file.filename);
if path.exists() {
fs::remove_file(&path)?;
}

if let Some(parent_dir) = path.parent() {
if !parent_dir.exists() {
fs::create_dir_all(parent_dir)?;
}
}
}

let downloader = DownloaderBuilder::new()
.style_options(self.default_style()?)
.build();
downloader.download(&downloads).await;
downloader.download(downloads).await;
if show_total {
let punc = if total_files > 0 { "." } else { ":" };
println!("Downloaded {}{}", pluralize(total_files as u64, "file"), punc);
Expand All @@ -95,10 +125,8 @@ impl Downloads {
println!("{}", msg.replace("{}", &name_str.to_string_lossy()));
}
}
} else {
if no_downloads_message.is_some() {
println!("{}", no_downloads_message.unwrap_or(""));
}
} else if no_downloads_message.is_some() {
println!("{}", no_downloads_message.unwrap_or(""));
}
Ok(())
}
Expand Down
16 changes: 13 additions & 3 deletions src/lib/project.rs
Original file line number Diff line number Diff line change
Expand Up @@ -434,7 +434,8 @@ impl Project {
.has_headers(header)
.from_reader(file);

let column = column.unwrap_or(0) as usize;
        // Convert the user's one-indexed column to a zero-based index
        // (first column is the default).
        // NOTE(review): `column.unwrap_or(0) as usize - 1` underflows when no
        // column is supplied — the default should likely be 1; confirm.
let column = column.unwrap_or(0) as usize - 1;

let mut downloads = Downloads::new();
let mut filepaths = Vec::new();
Expand Down Expand Up @@ -495,8 +496,17 @@ impl Project {
self.save()
}

pub async fn pull(&mut self, overwrite: bool) -> Result<()> {
self.data.pull(&self.path_context(), overwrite).await }
pub async fn pull(&mut self, overwrite: bool, url: bool, all: bool) -> Result<()> {
let path_context = self.path_context();
if all {
self.data.pull_urls(&path_context, overwrite).await?;
return self.data.pull(&path_context, overwrite).await;
}
if url {
return self.data.pull_urls(&path_context, overwrite).await;
}
self.data.pull(&path_context, overwrite).await
}

pub async fn push(&mut self, overwrite: bool) -> Result<()> {
self.data.push(&self.path_context(), overwrite).await
Expand Down
2 changes: 1 addition & 1 deletion src/lib/test_utilities.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use anyhow::Result;

pub fn check_error<T>(result: Result<T>, pattern: &str) {
match result {
Ok(_) => assert!(false, "Expected an error, but got Ok"),
Ok(_) => panic!("Expected an error, but got Ok"),
Err(err) => {
assert!(err.to_string().contains(pattern),
"Unexpected error {:?} containing pattern \"{:?}\" ",
Expand Down
8 changes: 4 additions & 4 deletions src/lib/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -265,11 +265,11 @@ pub fn print_status(rows: BTreeMap<String,Vec<StatusEntry>>, remote: Option<&Has
println!("{}", "Project data status:".bold());
let counts = get_counts(&rows).expect("Internal Error: get_counts() panicked.");
println!("{} local and tracked by a remote ({} only local, {} only remote), {} total.\n",
pluralize(counts.both as u64, "file"),
pluralize(counts.local as u64, "file"),
pluralize(counts.remote as u64, "file"),
pluralize(counts.both, "file"),
pluralize(counts.local, "file"),
pluralize(counts.remote, "file"),
//pluralize(counts.messy as u64, "file"),
pluralize(counts.total as u64, "file"));
pluralize(counts.total, "file"));

// this brings the remote name (if there is a corresponding remote) into
// the key, so the linked remote can be displayed in the status
Expand Down
24 changes: 20 additions & 4 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,14 @@ Some examples:
Pull in data (you may want --overwrite):
$ sdf pull
Pull in data from the URLs in the manifest only (you may want --overwrite)
 $ sdf pull --urls
Pull in data from URLs and remotes
$ sdf pull --all
Push data to a remote (you may want --overwrite):
$ sdf pull
$ sdf push
Download a file from a URL and register it in the Data Manifest:
$ sdf get https://ftp.ensembl.org/some/path/to/large/data.fa.gz
Expand Down Expand Up @@ -186,12 +192,22 @@ enum Commands {
},

#[structopt(name = "pull")]
/// Pull in all tracked files from the remote.
/// Pull in all tracked files from the remote. If --urls is set,
/// this will (re)-download all files (tracked or not) in that manifest
/// from their URLs.
Pull {
/// Overwrite local files if they exist.
#[structopt(long)]
overwrite: bool,

/// Pull in files from the URLs, not remotes.
#[structopt(long)]
urls: bool,

/// Pull in files from remotes and URLs.
#[structopt(long)]
all: bool,

// multiple optional directories
//directories: Vec<PathBuf>,
},
Expand Down Expand Up @@ -289,9 +305,9 @@ async fn run() -> Result<()> {
let mut proj = Project::new()?;
proj.push(*overwrite).await
},
Some(Commands::Pull { overwrite }) => {
Some(Commands::Pull { overwrite, urls, all }) => {
let mut proj = Project::new()?;
proj.pull(*overwrite).await
proj.pull(*overwrite, *urls, *all).await
},
Some(Commands::Metadata { title, description }) => {
let mut proj = Project::new()?;
Expand Down

0 comments on commit e5d1a6d

Please sign in to comment.