From 2c4439f32986e03077e224369be9bb2b449c0181 Mon Sep 17 00:00:00 2001 From: Alexis Praga Date: Sat, 15 Jun 2024 16:58:16 +0200 Subject: [PATCH 1/3] Pull only a single file or a whole folder. This is set using -l or --limit flag. If a single file is given, it should be the full path (from the repo) otherwise it won't match. If a directory is given, match all files and subdirectories. If a subdirectory is given, it should have all parents from the repo root. Split the generation of a download queue in `pull` function into a separate function for readability. --- src/lib/data.rs | 99 +++++++++++++++++++++++++++++++++------------- src/lib/project.rs | 16 +++++--- src/main.rs | 11 ++++-- 3 files changed, 89 insertions(+), 37 deletions(-) diff --git a/src/lib/data.rs b/src/lib/data.rs index 41a0767..c6d8324 100644 --- a/src/lib/data.rs +++ b/src/lib/data.rs @@ -397,11 +397,11 @@ impl DataFile { pub fn directory(&self) -> Result { let path = std::path::Path::new(&self.path); Ok(path - .parent() - .unwrap_or(path) - .to_str() - .unwrap_or("") - .to_string()) + .parent() + .unwrap_or(path) + .to_str() + .unwrap_or("") + .to_string()) } pub async fn get_md5(&self, path_context: &Path) -> Result> { @@ -587,7 +587,7 @@ impl DataCollection { } else { Err(anyhow!( "File '{}' is already registered in the data manifest.\n\ - If you wish to update the MD5 or metadata, use: sdf update FILE", + If you wish to update the MD5 or metadata, use: sdf update FILE", &data_file.path )) } @@ -719,7 +719,7 @@ impl DataCollection { match data_file { None => Err(anyhow!( "Data file '{}' is not in the data manifest. 
Add it first using:\n \ - $ sdf track {}\n", + $ sdf track {}\n", filepath, filepath )), @@ -742,7 +742,7 @@ impl DataCollection { match data_file { None => Err(anyhow!( "Cannot untrack data file '{}' since it was never added to\ - the data manifest.", + the data manifest.", filepath )), Some(file) => file.set_untracked(), @@ -794,7 +794,7 @@ impl DataCollection { match result { Ok((key, value)) => { pb.bar - .set_message(format!("Fetching remote files... {} done.", key.0)); + .set_message(format!("Fetching remote files... {} done.", key.0)); all_remote_files.insert(key, value); pb.bar.inc(1); } @@ -913,7 +913,7 @@ impl DataCollection { while let Some(result) = statuses_futures.next().await { if let Ok((key, value)) = result { pb.bar - .set_message(format!("Calculating MD5s... {} done.", &value.name)); + .set_message(format!("Calculating MD5s... {} done.", &value.name)); statuses.entry(key).or_insert_with(Vec::new).push(value); pb.bar.inc(1); } else { @@ -1059,13 +1059,40 @@ impl DataCollection { Ok(()) } - pub async fn pull_urls(&mut self, path_context: &Path, overwrite: bool) -> Result<()> { + // Compare a local path `local` to a user request `request` (either a path or a directory) + // - if request is a path, the full paths of both must match exactly + // - otherwise request is a directory and one of the ancestor must match + // Exemple + // - "a/test.txt" would not match "test.text" (first case) + // - "a/b/c/test.txt" would match "a/b/c" or "a/b" (second case) + // - "a/b/c/test.txt" would not match "b/c" + fn match_user_file(local: &String, request: &Option) -> bool { + let p = PathBuf::from(local); + if let Some(req) = request { + p == *req || Self::has_common_ancestor(p, req) + } + else { + false + } + } + + // Common ancestor starting from root between a filepath and a directory + // - "a/b/c/test.txt" and "a/b/c" have one + // - "a/b/c/test.txt" and "b/c" and don't + fn has_common_ancestor(file: PathBuf, dir: &PathBuf) -> bool { + 
file.ancestors().filter(|x| !x.as_os_str().is_empty() && x == dir).count() > 0 + } + + pub async fn pull_urls(&mut self, path_context: &Path, overwrite: bool, limit: &Option) -> Result<()> { let mut downloads = Downloads::new(); let mut filepaths = Vec::new(); let mut skipped = Vec::new(); let mut num_downloaded = 0; for data_file in self.files.values() { if let Some(url) = &data_file.url { + if !Self::match_user_file(&data_file.path, limit) { + continue; + } let full_path = data_file.full_path(path_context)?; let download = downloads.add(url.clone(), Some(&full_path.to_string_lossy()), overwrite)?; @@ -1088,35 +1115,33 @@ impl DataCollection { let num_skipped = skipped.len(); println!( "{} files were downloaded.\n\ - {} files were skipped because they existed (and --overwrite was not specified).", + {} files were skipped because they existed (and --overwrite was not specified).", num_downloaded, num_skipped ); Ok(()) } - // Download all files - // - // TODO: code redundancy with the push method's tracking of - // why stuff is skipped; split out info enum, etc. - pub async fn pull(&mut self, path_context: &Path, overwrite: bool) -> Result<()> { - let all_files = self.merge(true).await?; - - let mut downloads = Downloads::new(); - - let mut current_skipped = Vec::new(); - let mut messy_skipped = Vec::new(); - let mut overwrite_skipped = Vec::new(); - + async fn pull_get_downloads(&mut self, all_files: &HashMap>, path_context: &Path, + overwrite: bool, + request: &Option, + current_skipped: &mut Vec, + messy_skipped: &mut Vec, + overwrite_skipped: &mut Vec, + downloads: &mut Downloads) -> Result<()> { for (dir, merged_files) in all_files.iter() { - // can_download() is true only if local and remote are not None. + // 1. Skip non matching files if needed + // 2. can_download() is true only if local and remote are not None. 
// (local file can be deleted, but will only be None if not in manifest also) - for merged_file in merged_files.values().filter(|f| f.can_download()) { + let filtered = merged_files.iter().filter(|(k,v)| Self::match_user_file(k, request) + && v.can_download()); + for (_, merged_file) in filtered { let path = merged_file.name()?; + println!("filtered {:?}", merged_file); let do_download = match merged_file.status(path_context).await? { RemoteStatusCode::NoLocal => { return Err(anyhow!("Internal error: execution should not have reached this point, please report.\n\ - 'sdf pull' filtered by MergedFile.can_download() but found a RemoteStatusCode::NoLocal status.")); + 'sdf pull' filtered by MergedFile.can_download() but found a RemoteStatusCode::NoLocal status.")); } RemoteStatusCode::Current => { current_skipped.push(path); @@ -1159,6 +1184,24 @@ impl DataCollection { } } } + Ok(()) + } + + // Download all files, or a set of matching files + // + // TODO: code redundancy with the push method's tracking of + // why stuff is skipped; split out info enum, etc. + pub async fn pull(&mut self, path_context: &Path, overwrite: bool, limit: &Option) -> Result<()> { + let all_files = self.merge(true).await?; + + let mut current_skipped = Vec::new(); + let mut messy_skipped = Vec::new(); + let mut overwrite_skipped = Vec::new(); + + let mut downloads = Downloads::new(); + self.pull_get_downloads(&all_files, path_context, overwrite, limit, + &mut current_skipped, &mut messy_skipped, + &mut overwrite_skipped, &mut downloads).await?; // now retrieve all the files in the queue. 
downloads diff --git a/src/lib/project.rs b/src/lib/project.rs index 3402f94..07eab2d 100644 --- a/src/lib/project.rs +++ b/src/lib/project.rs @@ -634,16 +634,22 @@ impl Project { self.save() } - pub async fn pull(&mut self, overwrite: bool, url: bool, all: bool) -> Result<()> { + pub async fn pull( + &mut self, + overwrite: bool, + url: bool, + all: bool, + limit: &Option, + ) -> Result<()> { let path_context = self.path_context(); if all { - self.data.pull_urls(&path_context, overwrite).await?; - return self.data.pull(&path_context, overwrite).await; + self.data.pull_urls(&path_context, overwrite, limit).await?; + return self.data.pull(&path_context, overwrite, limit).await; } if url { - return self.data.pull_urls(&path_context, overwrite).await; + return self.data.pull_urls(&path_context, overwrite, limit).await; } - self.data.pull(&path_context, overwrite).await + self.data.pull(&path_context, overwrite, limit).await } pub async fn push(&mut self, overwrite: bool) -> Result<()> { diff --git a/src/main.rs b/src/main.rs index 56c4ef2..ad8d952 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,4 +1,4 @@ -use std::path::Path; +use std::path::{Path, PathBuf}; use anyhow::{anyhow, Result}; use clap::{Parser, Subcommand}; @@ -216,8 +216,10 @@ enum Commands { /// Pull in files from remotes and URLs. #[arg(long)] all: bool, - // multiple optional directories - //directories: Vec, + + /// Pull in matching file or directory + #[arg(short, long, value_name = "FILE/DIRECTORY")] + limit: Option, }, /// Change the project metadata. 
Metadata { @@ -343,9 +345,10 @@ async fn run() -> Result<()> { overwrite, urls, all, + limit, }) => { let mut proj = Project::new()?; - proj.pull(*overwrite, *urls, *all).await + proj.pull(*overwrite, *urls, *all, limit).await } Some(Commands::Metadata { title, description }) => { let mut proj = Project::new()?; From 6fe2feb23cb542018fc3292a7ce6b3ede945309f Mon Sep 17 00:00:00 2001 From: Alexis Praga Date: Sun, 16 Jun 2024 15:29:34 +0200 Subject: [PATCH 2/3] Running `cargo fmt` for cleaner code --- src/lib/data.rs | 73 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 49 insertions(+), 24 deletions(-) diff --git a/src/lib/data.rs b/src/lib/data.rs index c6d8324..63321b1 100644 --- a/src/lib/data.rs +++ b/src/lib/data.rs @@ -397,11 +397,11 @@ impl DataFile { pub fn directory(&self) -> Result { let path = std::path::Path::new(&self.path); Ok(path - .parent() - .unwrap_or(path) - .to_str() - .unwrap_or("") - .to_string()) + .parent() + .unwrap_or(path) + .to_str() + .unwrap_or("") + .to_string()) } pub async fn get_md5(&self, path_context: &Path) -> Result> { @@ -794,7 +794,7 @@ impl DataCollection { match result { Ok((key, value)) => { pb.bar - .set_message(format!("Fetching remote files... {} done.", key.0)); + .set_message(format!("Fetching remote files... {} done.", key.0)); all_remote_files.insert(key, value); pb.bar.inc(1); } @@ -913,7 +913,7 @@ impl DataCollection { while let Some(result) = statuses_futures.next().await { if let Ok((key, value)) = result { pb.bar - .set_message(format!("Calculating MD5s... 
{} done.", &value.name)); statuses.entry(key).or_insert_with(Vec::new).push(value); pb.bar.inc(1); } else { @@ -1070,8 +1070,7 @@ impl DataCollection { let p = PathBuf::from(local); if let Some(req) = request { p == *req || Self::has_common_ancestor(p, req) - } - else { + } else { false } } @@ -1080,10 +1079,18 @@ impl DataCollection { // - "a/b/c/test.txt" and "a/b/c" have one // - "a/b/c/test.txt" and "b/c" and don't fn has_common_ancestor(file: PathBuf, dir: &PathBuf) -> bool { - file.ancestors().filter(|x| !x.as_os_str().is_empty() && x == dir).count() > 0 + file.ancestors() + .filter(|x| !x.as_os_str().is_empty() && x == dir) + .count() + > 0 } - pub async fn pull_urls(&mut self, path_context: &Path, overwrite: bool, limit: &Option) -> Result<()> { + pub async fn pull_urls( + &mut self, + path_context: &Path, + overwrite: bool, + limit: &Option, + ) -> Result<()> { let mut downloads = Downloads::new(); let mut filepaths = Vec::new(); let mut skipped = Vec::new(); @@ -1121,19 +1128,24 @@ impl DataCollection { Ok(()) } - async fn pull_get_downloads(&mut self, all_files: &HashMap>, path_context: &Path, - overwrite: bool, - request: &Option, - current_skipped: &mut Vec, - messy_skipped: &mut Vec, - overwrite_skipped: &mut Vec, - downloads: &mut Downloads) -> Result<()> { + async fn pull_get_downloads( + &mut self, + all_files: &HashMap>, + path_context: &Path, + overwrite: bool, + request: &Option, + current_skipped: &mut Vec, + messy_skipped: &mut Vec, + overwrite_skipped: &mut Vec, + downloads: &mut Downloads, + ) -> Result<()> { for (dir, merged_files) in all_files.iter() { // 1. Skip non matching files if needed // 2. can_download() is true only if local and remote are not None. 
// (local file can be deleted, but will only be None if not in manifest also) - let filtered = merged_files.iter().filter(|(k,v)| Self::match_user_file(k, request) - && v.can_download()); + let filtered = merged_files + .iter() + .filter(|(k, v)| Self::match_user_file(k, request) && v.can_download()); for (_, merged_file) in filtered { let path = merged_file.name()?; println!("filtered {:?}", merged_file); @@ -1191,7 +1203,12 @@ impl DataCollection { // // TODO: code redundancy with the push method's tracking of // why stuff is skipped; split out info enum, etc. - pub async fn pull(&mut self, path_context: &Path, overwrite: bool, limit: &Option) -> Result<()> { + pub async fn pull( + &mut self, + path_context: &Path, + overwrite: bool, + limit: &Option, + ) -> Result<()> { let all_files = self.merge(true).await?; let mut current_skipped = Vec::new(); @@ -1199,9 +1216,17 @@ impl DataCollection { let mut overwrite_skipped = Vec::new(); let mut downloads = Downloads::new(); - self.pull_get_downloads(&all_files, path_context, overwrite, limit, - &mut current_skipped, &mut messy_skipped, - &mut overwrite_skipped, &mut downloads).await?; + self.pull_get_downloads( + &all_files, + path_context, + overwrite, + limit, + &mut current_skipped, + &mut messy_skipped, + &mut overwrite_skipped, + &mut downloads, + ) + .await?; // now retrieve all the files in the queue. 
downloads From af0a06c31156790660fe6f71d283cc834f561943 Mon Sep 17 00:00:00 2001 From: Alexis Praga Date: Sun, 16 Jun 2024 16:12:40 +0200 Subject: [PATCH 3/3] Testing file comparison for partial pull (unit tests) --- src/lib/data.rs | 92 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 63 insertions(+), 29 deletions(-) diff --git a/src/lib/data.rs b/src/lib/data.rs index 63321b1..05f5e88 100644 --- a/src/lib/data.rs +++ b/src/lib/data.rs @@ -1059,32 +1059,6 @@ impl DataCollection { Ok(()) } - // Compare a local path `local` to a user request `request` (either a path or a directory) - // - if request is a path, the full paths of both must match exactly - // - otherwise request is a directory and one of the ancestor must match - // Exemple - // - "a/test.txt" would not match "test.text" (first case) - // - "a/b/c/test.txt" would match "a/b/c" or "a/b" (second case) - // - "a/b/c/test.txt" would not match "b/c" - fn match_user_file(local: &String, request: &Option) -> bool { - let p = PathBuf::from(local); - if let Some(req) = request { - p == *req || Self::has_common_ancestor(p, req) - } else { - false - } - } - - // Common ancestor starting from root between a filepath and a directory - // - "a/b/c/test.txt" and "a/b/c" have one - // - "a/b/c/test.txt" and "b/c" and don't - fn has_common_ancestor(file: PathBuf, dir: &PathBuf) -> bool { - file.ancestors() - .filter(|x| !x.as_os_str().is_empty() && x == dir) - .count() - > 0 - } - pub async fn pull_urls( &mut self, path_context: &Path, @@ -1097,7 +1071,7 @@ impl DataCollection { let mut num_downloaded = 0; for data_file in self.files.values() { if let Some(url) = &data_file.url { - if !Self::match_user_file(&data_file.path, limit) { + if !match_user_file(&data_file.path, limit) { continue; } let full_path = data_file.full_path(path_context)?; @@ -1145,7 +1119,7 @@ impl DataCollection { // (local file can be deleted, but will only be None if not in manifest also) let filtered = merged_files .iter() - 
.filter(|(k, v)| Self::match_user_file(k, request) && v.can_download()); + .filter(|(k, v)| match_user_file(k, request) && v.can_download()); for (_, merged_file) in filtered { let path = merged_file.name()?; println!("filtered {:?}", merged_file); @@ -1266,18 +1240,52 @@ impl DataCollection { Ok(()) } + +} + + +// Common ancestor starting from root between a filepath and a directory +// - "a/b/c/test.txt" and "a/b/c" have one +// - "a/b/c/test.txt" and "b/c" and don't +fn has_common_ancestor(file: PathBuf, dir: &PathBuf) -> bool { + for x in file.ancestors() { + println!("ancestor for {:?} = {:?}", file, x); + + } + file.ancestors() + .filter(|x| !x.as_os_str().is_empty() && x == dir) + .count() + > 0 +} + +// Compare a local path `local` to a user request `request` (either a path or a directory) +// - if request is a path, the full paths of both must match exactly +// - otherwise request is a directory and one of the ancestor must match +// Exemple +// - "a/test.txt" would not match "test.text" (first case) +// - "a/b/c/test.txt" would match "a/b/c" or "a/b" (second case) +// - "a/b/c/test.txt" would not match "b/c" +fn match_user_file(local: &String, request: &Option) -> bool { + let p = PathBuf::from(local); + if let Some(req) = request { + p == *req || has_common_ancestor(p, req) + } else { + false + } } + #[cfg(test)] mod tests { use crate::lib::api::figshare::{FigShareAPI, FIGSHARE_BASE_URL}; use crate::lib::remote::Remote; use crate::lib::test_utilities::check_error; - use super::{DataCollection, DataFile}; + use super::{DataCollection, DataFile, match_user_file}; use std::io::Write; use std::path::Path; use tempfile::NamedTempFile; + use std::path::{PathBuf}; fn mock_data_file() -> NamedTempFile { let temp_file = NamedTempFile::new().unwrap(); @@ -1422,4 +1430,30 @@ mod tests { let result = dc.register_remote(&dir, Remote::FigShareAPI(figshare)); check_error(result, "already tracked"); } + + #[test] + fn test_common_ancestor() { + assert!( + 
match_user_file(&String::from("a/test.txt"), &Some(PathBuf::from("a"))), + "File and user directory should match" + ); + assert!( + match_user_file(&String::from("a/test.txt"), &Some(PathBuf::from("a/test.txt"))), + "File and user file should match" + ); + assert!( + !match_user_file(&String::from("a/test.txt"), &Some(PathBuf::from("b/test.txt"))), + "File and user file on directory should not match (different directory)" + ); + assert!( + !match_user_file(&String::from("a/test.txt"), &Some(PathBuf::from("a/test2.txt"))), + "File and user file on directory should not match (different user file)" + ); + assert!( + match_user_file(&String::from("a/b/c.txt"), &Some(PathBuf::from("a/b"))), + "File and user file on directory should not match (different user file)" + ); + + } + }