From e5d1a6dc8b700b705ffaad529fd9a5daab834277 Mon Sep 17 00:00:00 2001 From: Vince Buffalo Date: Tue, 5 Sep 2023 12:17:23 -0700 Subject: [PATCH] new pull --all features, fixed bug with downloads not being done if dir structure didn't exist --- README.md | 22 ++++++++++++++-- src/lib/api/figshare.rs | 7 +++-- src/lib/api/zenodo.rs | 7 +++-- src/lib/data.rs | 36 ++++++++++++++++++++++++-- src/lib/download.rs | 54 +++++++++++++++++++++++++++++---------- src/lib/project.rs | 16 +++++++++--- src/lib/test_utilities.rs | 2 +- src/lib/utils.rs | 8 +++--- src/main.rs | 24 ++++++++++++++--- 9 files changed, 139 insertions(+), 37 deletions(-) diff --git a/README.md b/README.md index a96cd95..a447a5b 100644 --- a/README.md +++ b/README.md @@ -245,13 +245,13 @@ cds https://ftp.ensembl.org/pub/release-110/fasta/homo_sapiens/cds/Homo_sapiens. Note that this has a header, and the URLs are in the second column. To get this data, we'd use: ```console -$ sdf bulk human_annotation.tsv --column 1 --header +$ sdf bulk human_annotation.tsv --column 2 --header ⠁ [ ] 0/2 (0%) eta 00:00:00 ⠉ [====> ] 9071693/78889691 (11%) eta 00:01:22 ⠐ [=========> ] 13503693/54514783 (25%) eta 00:00:35 ``` -**Columns indices are zero-indexed** and `sdf bulk` assumes no headers by +**Columns indices are one-indexed** and `sdf bulk` assumes no headers by default. Note that in this example, only two files are downloading — this is because `sdf` detected the CDS file already existed. SciDataFlow tells you this with a little message at the end: @@ -263,6 +263,24 @@ $ sdf bulk human_annotation.tsv --column 1 --header 1 files were skipped because they existed (and --overwrite was no specified). ``` +Note that one can also download files from URLs that are in the Data Manifest. +Suppose that you clone a repository that has no remotes, but each file entry +has a URL set. 
Those can be retrieved with: + +```console +$ sdf pull --urls # if you want to overwrite any local files, use --overwrite +``` + +These may or may not be `tracked`; tracking only indicates whether to *also* +manage them with a remote like Zenodo or FigShare. In cases where the data file +can be reliably retrieved from a steady source (e.g. a website like the UCSC +Genome Browser or Ensembl) you may not want to duplicate it by also tracking +it. If you want to pull in *everything*, use: + +```console +$ sdf pull --all +``` + ## Adding Metadata Some data repository services like Zenodo allow data depositions to be diff --git a/src/lib/api/figshare.rs b/src/lib/api/figshare.rs index 96b548b..4947e9c 100644 --- a/src/lib/api/figshare.rs +++ b/src/lib/api/figshare.rs @@ -379,13 +379,12 @@ impl FigShareAPI { let matches_found: Vec<_> = articles.into_iter().filter(|a| a.title == self.name).collect(); if !matches_found.is_empty() { if matches_found.len() > 1 { - return Err(anyhow!("Found multiple FigShare Articles with the \ - title '{}'", self.name)); + Err(anyhow!("Found multiple FigShare Articles with the title '{}'", self.name)) } else { - return Ok(Some(matches_found[0].clone())); + Ok(Some(matches_found[0].clone())) } } else { - return Ok(None); + Ok(None) } } diff --git a/src/lib/api/zenodo.rs b/src/lib/api/zenodo.rs index 2356f22..9fca56f 100644 --- a/src/lib/api/zenodo.rs +++ b/src/lib/api/zenodo.rs @@ -268,8 +268,7 @@ impl ZenodoAPI { let mut matches_found: Vec<_> = depositions.into_iter().filter(|a| a.title == self.name).collect(); if !matches_found.is_empty() { if matches_found.len() > 1 { - return Err(anyhow!("Found multiple Zenodo Depositions with the \ - title '{}'", self.name)); + Err(anyhow!("Found multiple Zenodo Depositions with the title '{}'", self.name)) } else { // We need to do one more API call, to get the full listing // with the bucket URL. 
@@ -277,10 +276,10 @@ impl ZenodoAPI { let url = format!("deposit/depositions/{}", partial_deposition.id); let response = self.issue_request::>(Method::GET, &url, None, None).await?; let deposition: ZenodoDeposition = response.json().await?; - return Ok(Some(deposition)); + Ok(Some(deposition)) } } else { - return Ok(None); + Ok(None) } } diff --git a/src/lib/data.rs b/src/lib/data.rs index e7afaac..1ee1148 100644 --- a/src/lib/data.rs +++ b/src/lib/data.rs @@ -655,7 +655,7 @@ impl DataCollection { Some(data_file) => { // check that the file isn't empty // (this is why a path_context is needed) - let file_size = data_file.get_size(&path_context)?; + let file_size = data_file.get_size(path_context)?; if file_size == 0 { return Err(anyhow!("Cannot track an empty file, and '{}' has a file size of 0.", filepath)); } @@ -938,6 +938,38 @@ impl DataCollection { Ok(()) } + pub async fn pull_urls(&mut self, path_context: &Path, overwrite: bool) -> Result<()> { + let mut downloads = Downloads::new(); + let mut filepaths = Vec::new(); + let mut skipped = Vec::new(); + let mut num_downloaded = 0; + for data_file in self.files.values() { + if let Some(url) = &data_file.url { + let full_path = data_file.full_path(path_context)?; + let download = downloads.add(url.clone(), Some(&full_path.to_string_lossy()), overwrite)?; + if let Some(dl) = download { + let filepath = dl.filename.clone(); + filepaths.push(filepath); + num_downloaded += 1; + } else { + skipped.push(url.clone()); + } + } + } + + if num_downloaded > 0 { + println!("Downloaded:"); + } + // grab all the files + downloads.retrieve(Some(" - {}"), None, false).await?; + + let num_skipped = skipped.len(); + println!("{} files were downloaded.\n\ + {} files were skipped because they existed (and --overwrite was not specified).", + num_downloaded, num_skipped); + Ok(()) + } + // Download all files // // TODO: code redundancy with the push method's tracking of @@ -1000,7 +1032,7 @@ impl DataCollection { if do_download { if 
let Some(remote) = self.remotes.get(dir) { let download = remote.get_download_info(merged_file, path_context, overwrite)?; - downloads.list.push(download); + downloads.queue.push(download); } } } diff --git a/src/lib/download.rs b/src/lib/download.rs index e4cc209..7dd4e7f 100644 --- a/src/lib/download.rs +++ b/src/lib/download.rs @@ -1,4 +1,5 @@ use anyhow::{anyhow,Result,Context}; +use std::fs; use std::path::PathBuf; use reqwest::Url; @@ -9,10 +10,9 @@ use crate::lib::progress::{DEFAULT_PROGRESS_STYLE, DEFAULT_PROGRESS_INC}; use crate::lib::utils::pluralize; pub struct Downloads { - pub list: Vec, + pub queue: Vec, } - pub trait Downloadable { fn to_url(self) -> Result; } @@ -30,10 +30,16 @@ impl Downloadable for Url { } } +impl Default for Downloads { + fn default() -> Self { + Self::new() + } +} + impl Downloads { pub fn new() -> Self { - let list = Vec::new(); - Downloads { list } + let queue = Vec::new(); + Downloads { queue } } pub fn add(&mut self, item: T, filename: Option<&str>, @@ -55,10 +61,10 @@ impl Downloads { if file_path.exists() && !overwrite { return Ok(None); } - + let download = Download { url, filename: resolved_filename }; - self.list.push(download); - Ok(Some(self.list.last().ok_or(anyhow::anyhow!("Failed to add download"))?)) + self.queue.push(download); + Ok(Some(self.queue.last().ok_or(anyhow::anyhow!("Failed to add download"))?)) } pub fn default_style(&self) -> Result { @@ -72,17 +78,41 @@ impl Downloads { } + // Retrieve all files in the download queue. + // + // Note: if the file is in the queue, at this point it is considered *overwrite safe*. + // This is because overwrite-safety is checked at Downloads::add(), per-file. + // The trauma crate does not overwrite files; delete must be done manually here + // first if it exists. 
pub async fn retrieve(&self, success_status: Option<&str>, no_downloads_message: Option<&str>, show_total: bool) -> Result<()> { - let downloads = &self.list; + let downloads = &self.queue; let total_files = downloads.len(); if !downloads.is_empty() { + + // Let's handle the file operations: + // 1) Delete the files if they exist, since if it's in the queue, it's + // overwrite-safe. + // 2) Create the directory structure if it does not exist. + for file in downloads { + let path = PathBuf::from(&file.filename); + if path.exists() { + fs::remove_file(&path)?; + } + + if let Some(parent_dir) = path.parent() { + if !parent_dir.exists() { + fs::create_dir_all(parent_dir)?; + } + } + } + let downloader = DownloaderBuilder::new() .style_options(self.default_style()?) .build(); - downloader.download(&downloads).await; + downloader.download(downloads).await; if show_total { let punc = if total_files > 0 { "." } else { ":" }; println!("Downloaded {}{}", pluralize(total_files as u64, "file"), punc); @@ -95,10 +125,8 @@ impl Downloads { println!("{}", msg.replace("{}", &name_str.to_string_lossy())); } } - } else { - if no_downloads_message.is_some() { - println!("{}", no_downloads_message.unwrap_or("")); - } + } else if no_downloads_message.is_some() { + println!("{}", no_downloads_message.unwrap_or("")); } Ok(()) } diff --git a/src/lib/project.rs b/src/lib/project.rs index 03c321d..1813975 100644 --- a/src/lib/project.rs +++ b/src/lib/project.rs @@ -434,7 +434,8 @@ impl Project { .has_headers(header) .from_reader(file); - let column = column.unwrap_or(0) as usize; + // convert 1-indexed column to 0-indexed; first column is default + let column = column.unwrap_or(1) as usize - 1; let mut downloads = Downloads::new(); let mut filepaths = Vec::new(); @@ -495,8 +496,17 @@ impl Project { self.save() } - pub async fn pull(&mut self, overwrite: bool) -> Result<()> { - self.data.pull(&self.path_context(), overwrite).await } + pub async fn pull(&mut self, overwrite: bool, url: bool, all: bool) -> 
Result<()> { + let path_context = self.path_context(); + if all { + self.data.pull_urls(&path_context, overwrite).await?; + return self.data.pull(&path_context, overwrite).await; + } + if url { + return self.data.pull_urls(&path_context, overwrite).await; + } + self.data.pull(&path_context, overwrite).await + } pub async fn push(&mut self, overwrite: bool) -> Result<()> { self.data.push(&self.path_context(), overwrite).await diff --git a/src/lib/test_utilities.rs b/src/lib/test_utilities.rs index f60aea7..f9af536 100644 --- a/src/lib/test_utilities.rs +++ b/src/lib/test_utilities.rs @@ -2,7 +2,7 @@ use anyhow::Result; pub fn check_error(result: Result, pattern: &str) { match result { - Ok(_) => assert!(false, "Expected an error, but got Ok"), + Ok(_) => panic!("Expected an error, but got Ok"), Err(err) => { assert!(err.to_string().contains(pattern), "Unexpected error {:?} containing pattern \"{:?}\" ", diff --git a/src/lib/utils.rs b/src/lib/utils.rs index 6ed2a35..b8e4957 100644 --- a/src/lib/utils.rs +++ b/src/lib/utils.rs @@ -265,11 +265,11 @@ pub fn print_status(rows: BTreeMap>, remote: Option<&Has println!("{}", "Project data status:".bold()); let counts = get_counts(&rows).expect("Internal Error: get_counts() panicked."); println!("{} local and tracked by a remote ({} only local, {} only remote), {} total.\n", - pluralize(counts.both as u64, "file"), - pluralize(counts.local as u64, "file"), - pluralize(counts.remote as u64, "file"), + pluralize(counts.both, "file"), + pluralize(counts.local, "file"), + pluralize(counts.remote, "file"), //pluralize(counts.messy as u64, "file"), - pluralize(counts.total as u64, "file")); + pluralize(counts.total, "file")); // this brings the remote name (if there is a corresponding remote) into // the key, so the linked remote can be displayed in the status diff --git a/src/main.rs b/src/main.rs index 60d748f..e36cfc2 100644 --- a/src/main.rs +++ b/src/main.rs @@ -33,8 +33,14 @@ Some examples: Pull in data (you may want 
--overwrite): $ sdf pull + Pull in data from the URLs in the manifest only (you may want --overwrite) + $ sdf pull --urls + + Pull in data from URLs and remotes + $ sdf pull --all + Push data to a remote (you may want --overwrite): - $ sdf pull + $ sdf push Download a file from a URL and register it in the Data Manifest: $ sdf get https://ftp.ensembl.org/some/path/to/large/data.fa.gz @@ -186,12 +192,22 @@ enum Commands { }, #[structopt(name = "pull")] - /// Pull in all tracked files from the remote. + /// Pull in all tracked files from the remote. If --urls is set, + /// this will (re)-download all files (tracked or not) in that manifest + /// from their URLs. Pull { /// Overwrite local files if they exit. #[structopt(long)] overwrite: bool, + /// Pull in files from the URLs, not remotes. + #[structopt(long)] + urls: bool, + + /// Pull in files from remotes and URLs. + #[structopt(long)] + all: bool, + // multiple optional directories //directories: Vec, }, @@ -289,9 +305,9 @@ async fn run() -> Result<()> { let mut proj = Project::new()?; proj.push(*overwrite).await }, - Some(Commands::Pull { overwrite }) => { + Some(Commands::Pull { overwrite, urls, all }) => { let mut proj = Project::new()?; - proj.pull(*overwrite).await + proj.pull(*overwrite, *urls, *all).await }, Some(Commands::Metadata { title, description }) => { let mut proj = Project::new()?;