From d266de268f43684018aa6fff37da443d9bbf2816 Mon Sep 17 00:00:00 2001
From: Ben Mares
Date: Sat, 26 Oct 2024 23:26:21 +0200
Subject: [PATCH 01/28] Move cache constants to top

---
 conda_lock/lookup.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/conda_lock/lookup.py b/conda_lock/lookup.py
index 1f0e86787..f55b85bed 100644
--- a/conda_lock/lookup.py
+++ b/conda_lock/lookup.py
@@ -19,6 +19,12 @@

 DEFAULT_MAPPING_URL = "https://raw.githubusercontent.com/regro/cf-graph-countyfair/master/mappings/pypi/grayskull_pypi_mapping.json"

+CLEAR_CACHE_AFTER_SECONDS = 60 * 60 * 24 * 2  # 2 days
+"""Cached files older than this will be deleted."""
+
+DONT_CHECK_IF_NEWER_THAN_SECONDS = 60 * 5  # 5 minutes
+"""If the cached file is newer than this, just use it without checking for updates."""
+

 class MappingEntry(TypedDict):
     conda_name: str
@@ -109,8 +115,6 @@ def cached_download_file(url: str) -> bytes:

     Protect against multiple processes downloading the same file.
     """
-    CLEAR_CACHE_AFTER_SECONDS = 60 * 60 * 24 * 2  # 2 days
-    DONT_CHECK_IF_NEWER_THAN_SECONDS = 60 * 5  # 5 minutes
     current_time = time.time()
     cache = user_cache_path("conda-lock", appauthor=False)
     cache.mkdir(parents=True, exist_ok=True)

From 96b5efbd0037445663bf1114a55b961ea88cc06f Mon Sep 17 00:00:00 2001
From: Ben Mares
Date: Sat, 26 Oct 2024 23:34:27 +0200
Subject: [PATCH 02/28] Disambiguate canonicalize_name

---
 conda_lock/lookup.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/conda_lock/lookup.py b/conda_lock/lookup.py
index f55b85bed..10caf4067 100644
--- a/conda_lock/lookup.py
+++ b/conda_lock/lookup.py
@@ -10,7 +10,7 @@
 import requests

 from filelock import FileLock, Timeout
-from packaging.utils import NormalizedName, canonicalize_name
+from packaging.utils import NormalizedName
 from packaging.utils import canonicalize_name as canonicalize_pypi_name
 from platformdirs import user_cache_path

@@ -57,9 +57,9 @@ def _get_pypi_lookup(mapping_url: str) -> Dict[NormalizedName, MappingEntry]:
     logger.debug(f"Loaded {len(lookup)} entries in {load_duration:.2f}s")
     # lowercase and kebabcase the pypi names
     assert lookup is not None
-    lookup = {canonicalize_name(k): v for k, v in lookup.items()}
+    lookup = {canonicalize_pypi_name(k): v for k, v in lookup.items()}
     for v in lookup.values():
-        v["pypi_name"] = canonicalize_name(v["pypi_name"])
+        v["pypi_name"] = canonicalize_pypi_name(v["pypi_name"])
     return lookup

@@ -102,7 +102,7 @@ def _get_conda_lookup(mapping_url: str) -> Dict[str, MappingEntry]:
 def conda_name_to_pypi_name(name: str, mapping_url: str) -> NormalizedName:
     """return the pypi name for a conda package"""
     lookup = _get_conda_lookup(mapping_url=mapping_url)
-    cname = canonicalize_name(name)
+    cname = canonicalize_pypi_name(name)
     return lookup.get(cname, {"pypi_name": cname})["pypi_name"]
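Patch 02 relies on the two imported names being the same function; the alias simply signals that PyPI's PEP 503 normalization rules are intended. A minimal sketch of what that normalization does (the package names here are made up for illustration):

    from packaging.utils import canonicalize_name as canonicalize_pypi_name

    # PEP 503: lowercase, and collapse runs of "-", "_", and "." into "-"
    assert canonicalize_pypi_name("Foo_Bar") == "foo-bar"
    assert canonicalize_pypi_name("ruamel.yaml") == "ruamel-yaml"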
lookup.get("conda_forge") + lookup = _get_pypi_lookup(mapping_url) + if cname in lookup: + entry = lookup[cname] + res = entry.get("conda_name") or entry.get("conda_forge") if res is not None: return res - else: - logging.warning( - f"Could not find conda name for {cname}. Assuming identity." - ) - return cname - else: - return cname + + logger.warning(f"Could not find conda name for {cname}. Assuming identity.") + return cname @lru_cache(maxsize=None) From 3916a447fbeaf7f050f69f6b843db6fa94e54644 Mon Sep 17 00:00:00 2001 From: Ben Mares Date: Sat, 26 Oct 2024 23:46:16 +0200 Subject: [PATCH 04/28] Generate filename in helper function --- conda_lock/lookup.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/conda_lock/lookup.py b/conda_lock/lookup.py index 74b6cb61a..e76f7dcff 100644 --- a/conda_lock/lookup.py +++ b/conda_lock/lookup.py @@ -125,8 +125,7 @@ def cached_download_file(url: str) -> bytes: logger.debug("Removing old cache file %s", file) file.unlink() - url_hash = hashlib.sha256(url.encode()).hexdigest()[:4] - destination_mapping = cache / f"pypi-mapping-{url_hash}.yaml" + destination_mapping = cache / cached_filename_for_url(url) destination_etag = destination_mapping.with_suffix(".etag") destination_lock = destination_mapping.with_suffix(".lock") @@ -184,3 +183,9 @@ def cached_download_file(url: str) -> bytes: f"Failed to acquire lock on {destination_lock}, it is likely " f"being downloaded by another process. Retrying..." ) + + +def cached_filename_for_url(url: str) -> str: + url_hash = hashlib.sha256(url.encode()).hexdigest()[:4] + extension = "yaml" if url.endswith(".yaml") else "json" + return f"pypi-mapping-{url_hash}.{extension}" From 04b5dbdc7fbdb91c1fa5532725990c3ec73a2a1b Mon Sep 17 00:00:00 2001 From: Ben Mares Date: Sat, 26 Oct 2024 23:52:15 +0200 Subject: [PATCH 05/28] Use immediate time when checking age This makes refactoring easier --- conda_lock/lookup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda_lock/lookup.py b/conda_lock/lookup.py index e76f7dcff..deb3b1072 100644 --- a/conda_lock/lookup.py +++ b/conda_lock/lookup.py @@ -138,7 +138,7 @@ def cached_download_file(url: str) -> bytes: # Return the contents immediately if the file is fresh try: mtime = destination_mapping.stat().st_mtime - age = current_time - mtime + age = time.time() - mtime if age < DONT_CHECK_IF_NEWER_THAN_SECONDS: contents = destination_mapping.read_bytes() logger.debug( From e354a58c33eff50057770dc3c762840bb53a7c37 Mon Sep 17 00:00:00 2001 From: Ben Mares Date: Sun, 27 Oct 2024 00:05:53 +0200 Subject: [PATCH 06/28] Recompute destination filename later This helps refactoring --- conda_lock/lookup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/conda_lock/lookup.py b/conda_lock/lookup.py index deb3b1072..32c347136 100644 --- a/conda_lock/lookup.py +++ b/conda_lock/lookup.py @@ -125,9 +125,7 @@ def cached_download_file(url: str) -> bytes: logger.debug("Removing old cache file %s", file) file.unlink() - destination_mapping = cache / cached_filename_for_url(url) - destination_etag = destination_mapping.with_suffix(".etag") - destination_lock = destination_mapping.with_suffix(".lock") + destination_lock = (cache / cached_filename_for_url(url)).with_suffix(".lock") # Wait for any other process to finish downloading the file. # Use the ETag to avoid downloading the file if it hasn't changed. 
From e354a58c33eff50057770dc3c762840bb53a7c37 Mon Sep 17 00:00:00 2001
From: Ben Mares
Date: Sun, 27 Oct 2024 00:05:53 +0200
Subject: [PATCH 06/28] Recompute destination filename later

This helps refactoring
---
 conda_lock/lookup.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/conda_lock/lookup.py b/conda_lock/lookup.py
index deb3b1072..32c347136 100644
--- a/conda_lock/lookup.py
+++ b/conda_lock/lookup.py
@@ -125,9 +125,7 @@ def cached_download_file(url: str) -> bytes:
             logger.debug("Removing old cache file %s", file)
             file.unlink()

-    destination_mapping = cache / cached_filename_for_url(url)
-    destination_etag = destination_mapping.with_suffix(".etag")
-    destination_lock = destination_mapping.with_suffix(".lock")
+    destination_lock = (cache / cached_filename_for_url(url)).with_suffix(".lock")

     # Wait for any other process to finish downloading the file.
     # Use the ETag to avoid downloading the file if it hasn't changed.
@@ -135,6 +133,8 @@ def cached_download_file(url: str) -> bytes:
     while True:
         try:
             with FileLock(destination_lock, timeout=5):
+                destination_mapping = cache / cached_filename_for_url(url)
+                destination_etag = destination_mapping.with_suffix(".etag")
                 # Return the contents immediately if the file is fresh
                 try:
                     mtime = destination_mapping.stat().st_mtime

From 265846666f1eb6d684efcb97cf126cc8a9484be5 Mon Sep 17 00:00:00 2001
From: Ben Mares
Date: Sun, 27 Oct 2024 00:12:11 +0200
Subject: [PATCH 07/28] Split out download_to_or_read_from_cache

---
 conda_lock/lookup.py | 93 +++++++++++++++++++++++---------------------
 1 file changed, 48 insertions(+), 45 deletions(-)

diff --git a/conda_lock/lookup.py b/conda_lock/lookup.py
index 32c347136..f5737940e 100644
--- a/conda_lock/lookup.py
+++ b/conda_lock/lookup.py
@@ -133,51 +133,7 @@ def cached_download_file(url: str) -> bytes:
     while True:
         try:
             with FileLock(destination_lock, timeout=5):
-                destination_mapping = cache / cached_filename_for_url(url)
-                destination_etag = destination_mapping.with_suffix(".etag")
-                # Return the contents immediately if the file is fresh
-                try:
-                    mtime = destination_mapping.stat().st_mtime
-                    age = time.time() - mtime
-                    if age < DONT_CHECK_IF_NEWER_THAN_SECONDS:
-                        contents = destination_mapping.read_bytes()
-                        logger.debug(
-                            f"Using cached mapping {destination_mapping} without "
-                            f"checking for updates"
-                        )
-                        return contents
-                except FileNotFoundError:
-                    pass
-                # Get the ETag from the last download, if it exists
-                if destination_mapping.exists() and destination_etag.exists():
-                    logger.debug(f"Old ETag found at {destination_etag}")
-                    try:
-                        old_etag = destination_etag.read_text().strip()
-                        headers = {"If-None-Match": old_etag}
-                    except FileNotFoundError:
-                        logger.warning("Failed to read ETag")
-                        headers = {}
-                else:
-                    headers = {}
-                # Download the file and cache the result.
-                logger.debug(f"Requesting {url}")
-                res = requests.get(url, headers=headers)
-                if res.status_code == 304:
-                    logger.debug(
-                        f"{url} has not changed since last download, "
-                        f"using {destination_mapping}"
-                    )
-                else:
-                    res.raise_for_status()
-                    time.sleep(10)
-                    destination_mapping.write_bytes(res.content)
-                    if "ETag" in res.headers:
-                        destination_etag.write_text(res.headers["ETag"])
-                    else:
-                        logger.warning("No ETag in response headers")
-                logger.debug(f"Downloaded {url} to {destination_mapping}")
-                return destination_mapping.read_bytes()
-
+                return download_to_or_read_from_cache(url, cache)
         except Timeout:
             logger.warning(
                 f"Failed to acquire lock on {destination_lock}, it is likely "
@@ -185,6 +141,53 @@ def cached_download_file(url: str) -> bytes:
             )


+def download_to_or_read_from_cache(url: str, cache: Path) -> bytes:
+    destination_mapping = cache / cached_filename_for_url(url)
+    destination_etag = destination_mapping.with_suffix(".etag")
+    # Return the contents immediately if the file is fresh
+    try:
+        mtime = destination_mapping.stat().st_mtime
+        age = time.time() - mtime
+        if age < DONT_CHECK_IF_NEWER_THAN_SECONDS:
+            contents = destination_mapping.read_bytes()
+            logger.debug(
+                f"Using cached mapping {destination_mapping} without "
+                f"checking for updates"
+            )
+            return contents
+    except FileNotFoundError:
+        pass
+    # Get the ETag from the last download, if it exists
+    if destination_mapping.exists() and destination_etag.exists():
+        logger.debug(f"Old ETag found at {destination_etag}")
+        try:
+            old_etag = destination_etag.read_text().strip()
+            headers = {"If-None-Match": old_etag}
+        except FileNotFoundError:
+            logger.warning("Failed to read ETag")
+            headers = {}
+    else:
+        headers = {}
+    # Download the file and cache the result.
+    logger.debug(f"Requesting {url}")
+    res = requests.get(url, headers=headers)
+    if res.status_code == 304:
+        logger.debug(
+            f"{url} has not changed since last download, "
+            f"using {destination_mapping}"
+        )
+    else:
+        res.raise_for_status()
+        time.sleep(10)
+        destination_mapping.write_bytes(res.content)
+        if "ETag" in res.headers:
+            destination_etag.write_text(res.headers["ETag"])
+        else:
+            logger.warning("No ETag in response headers")
+    logger.debug(f"Downloaded {url} to {destination_mapping}")
+    return destination_mapping.read_bytes()
+
+
 def cached_filename_for_url(url: str) -> str:
     url_hash = hashlib.sha256(url.encode()).hexdigest()[:4]
     extension = "yaml" if url.endswith(".yaml") else "json"

From d431851245b9ca6d7a5f6e9a6fec2f5f1ecf3a0b Mon Sep 17 00:00:00 2001
From: Ben Mares
Date: Sun, 27 Oct 2024 00:14:49 +0200
Subject: [PATCH 08/28] Remove sleep statement

This was accidentally left over from debugging
---
 conda_lock/lookup.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/conda_lock/lookup.py b/conda_lock/lookup.py
index f5737940e..6906315a0 100644
--- a/conda_lock/lookup.py
+++ b/conda_lock/lookup.py
@@ -178,7 +178,6 @@ def download_to_or_read_from_cache(url: str, cache: Path) -> bytes:
         )
     else:
         res.raise_for_status()
-        time.sleep(10)
         destination_mapping.write_bytes(res.content)
         if "ETag" in res.headers:
             destination_etag.write_text(res.headers["ETag"])

From 344d899dc0f1ae2ad9d84c1540a8312883023e49 Mon Sep 17 00:00:00 2001
From: Ben Mares
Date: Sun, 27 Oct 2024 00:19:58 +0200
Subject: [PATCH 09/28] Simplify header logic

---
 conda_lock/lookup.py | 23 +++++++----------------
 1 file changed, 7 insertions(+), 16 deletions(-)

diff --git a/conda_lock/lookup.py b/conda_lock/lookup.py
index 6906315a0..d032ee908 100644
--- a/conda_lock/lookup.py
+++ b/conda_lock/lookup.py
@@ -144,33 +144,24 @@ def cached_download_file(url: str) -> bytes:
 def download_to_or_read_from_cache(url: str, cache: Path) -> bytes:
     destination_mapping = cache / cached_filename_for_url(url)
     destination_etag = destination_mapping.with_suffix(".etag")
+    request_headers = {}
     # Return the contents immediately if the file is fresh
-    try:
+    if destination_mapping.is_file():
         mtime = destination_mapping.stat().st_mtime
         age = time.time() - mtime
         if age < DONT_CHECK_IF_NEWER_THAN_SECONDS:
-            contents = destination_mapping.read_bytes()
             logger.debug(
                 f"Using cached mapping {destination_mapping} without "
                 f"checking for updates"
             )
-            return contents
-    except FileNotFoundError:
-        pass
-    # Get the ETag from the last download, if it exists
-    if destination_mapping.exists() and destination_etag.exists():
-        logger.debug(f"Old ETag found at {destination_etag}")
-        try:
+            return destination_mapping.read_bytes()
+    # Get the ETag from the last download, if it exists
+    if destination_etag.is_file():
         old_etag = destination_etag.read_text().strip()
-            headers = {"If-None-Match": old_etag}
-        except FileNotFoundError:
-            logger.warning("Failed to read ETag")
-            headers = {}
-    else:
-        headers = {}
+        request_headers["If-None-Match"] = old_etag
     # Download the file and cache the result.
     logger.debug(f"Requesting {url}")
-    res = requests.get(url, headers=headers)
+    res = requests.get(url, headers=request_headers)
     if res.status_code == 304:
         logger.debug(
             f"{url} has not changed since last download, "
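The header logic above is an instance of the standard HTTP conditional-request pattern. In isolation it looks roughly like this (a sketch, not conda-lock code; the URL is hypothetical):

    import requests

    url = "https://example.com/mapping.json"  # hypothetical
    first = requests.get(url)
    first.raise_for_status()
    etag = first.headers.get("ETag")

    # Resend with If-None-Match: an unchanged resource yields 304 Not Modified
    # with an empty body, so the previously cached bytes must be reused.
    headers = {"If-None-Match": etag} if etag else {}
    second = requests.get(url, headers=headers)
    if second.status_code == 304:
        content = first.content  # reuse the cached copy
    else:
        second.raise_for_status()
        content = second.content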
From f2837542699e8576c7461e1f957579c2aba6e5d7 Mon Sep 17 00:00:00 2001
From: Ben Mares
Date: Sun, 27 Oct 2024 00:23:47 +0200
Subject: =?UTF-8?q?Rename=20destination=5Fmapping=20?=
 =?UTF-8?q?=E2=86=92=20destination?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 conda_lock/lookup.py | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/conda_lock/lookup.py b/conda_lock/lookup.py
index d032ee908..f55cc8a6d 100644
--- a/conda_lock/lookup.py
+++ b/conda_lock/lookup.py
@@ -142,19 +142,18 @@ def cached_download_file(url: str) -> bytes:


 def download_to_or_read_from_cache(url: str, cache: Path) -> bytes:
-    destination_mapping = cache / cached_filename_for_url(url)
-    destination_etag = destination_mapping.with_suffix(".etag")
+    destination = cache / cached_filename_for_url(url)
+    destination_etag = destination.with_suffix(".etag")
     request_headers = {}
     # Return the contents immediately if the file is fresh
-    if destination_mapping.is_file():
-        mtime = destination_mapping.stat().st_mtime
+    if destination.is_file():
+        mtime = destination.stat().st_mtime
         age = time.time() - mtime
         if age < DONT_CHECK_IF_NEWER_THAN_SECONDS:
             logger.debug(
-                f"Using cached mapping {destination_mapping} without "
-                f"checking for updates"
+                f"Using cached mapping {destination} without checking for updates"
             )
-            return destination_mapping.read_bytes()
+            return destination.read_bytes()
     # Get the ETag from the last download, if it exists
     if destination_etag.is_file():
         old_etag = destination_etag.read_text().strip()
@@ -163,19 +162,16 @@ def download_to_or_read_from_cache(url: str, cache: Path) -> bytes:
     logger.debug(f"Requesting {url}")
     res = requests.get(url, headers=request_headers)
     if res.status_code == 304:
-        logger.debug(
-            f"{url} has not changed since last download, "
-            f"using {destination_mapping}"
-        )
+        logger.debug(f"{url} has not changed since last download, using {destination}")
     else:
         res.raise_for_status()
-        destination_mapping.write_bytes(res.content)
+        destination.write_bytes(res.content)
         if "ETag" in res.headers:
             destination_etag.write_text(res.headers["ETag"])
         else:
             logger.warning("No ETag in response headers")
-    logger.debug(f"Downloaded {url} to {destination_mapping}")
-    return destination_mapping.read_bytes()
+    logger.debug(f"Downloaded {url} to {destination}")
+    return destination.read_bytes()

From 06b036411def37134a9ca65eab82596e19266cc2 Mon Sep 17 00:00:00 2001
From: Ben Mares
Date: Sun, 27 Oct 2024 08:52:09 +0100
Subject: [PATCH 11/28] Compute current time on-demand

This simplifies the variable scope
---
 conda_lock/lookup.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/conda_lock/lookup.py b/conda_lock/lookup.py
index f55cc8a6d..68020edc5 100644
--- a/conda_lock/lookup.py
+++ b/conda_lock/lookup.py
@@ -112,7 +112,6 @@ def cached_download_file(url: str) -> bytes:

     Protect against multiple processes downloading the same file.
     """
-    current_time = time.time()
     cache = user_cache_path("conda-lock", appauthor=False)
     cache.mkdir(parents=True, exist_ok=True)

@@ -120,7 +119,7 @@ def cached_download_file(url: str) -> bytes:
     for file in cache.iterdir():
         if file.name.startswith("pypi-mapping-"):
             mtime = file.stat().st_mtime
-            age = current_time - mtime
+            age = time.time() - mtime
             if age < 0 or age > CLEAR_CACHE_AFTER_SECONDS:
                 logger.debug("Removing old cache file %s", file)
                 file.unlink()

From 3bf44ce392270190a41bc796a4b1df1e9df789f4 Mon Sep 17 00:00:00 2001
From: Ben Mares
Date: Sun, 27 Oct 2024 08:56:08 +0100
Subject: [PATCH 12/28] Eliminate cache prefix in favor of subdirectory

---
 conda_lock/lookup.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/conda_lock/lookup.py b/conda_lock/lookup.py
index 68020edc5..e526e9209 100644
--- a/conda_lock/lookup.py
+++ b/conda_lock/lookup.py
@@ -112,17 +112,16 @@ def cached_download_file(url: str) -> bytes:

     Protect against multiple processes downloading the same file.
     """
-    cache = user_cache_path("conda-lock", appauthor=False)
+    cache = user_cache_path("conda-lock", appauthor=False) / "pypi-mapping"
     cache.mkdir(parents=True, exist_ok=True)

     # clear out old cache files
     for file in cache.iterdir():
-        if file.name.startswith("pypi-mapping-"):
-            mtime = file.stat().st_mtime
-            age = time.time() - mtime
-            if age < 0 or age > CLEAR_CACHE_AFTER_SECONDS:
-                logger.debug("Removing old cache file %s", file)
-                file.unlink()
+        mtime = file.stat().st_mtime
+        age = time.time() - mtime
+        if age < 0 or age > CLEAR_CACHE_AFTER_SECONDS:
+            logger.debug("Removing old cache file %s", file)
+            file.unlink()

     destination_lock = (cache / cached_filename_for_url(url)).with_suffix(".lock")

@@ -176,4 +175,4 @@ def download_to_or_read_from_cache(url: str, cache: Path) -> bytes:
 def cached_filename_for_url(url: str) -> str:
     url_hash = hashlib.sha256(url.encode()).hexdigest()[:4]
     extension = "yaml" if url.endswith(".yaml") else "json"
-    return f"pypi-mapping-{url_hash}.{extension}"
+    return f"{url_hash}.{extension}"
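For orientation, after patch 12 the cache directory resolves along these lines (a sketch; the concrete path is platform-dependent):

    from platformdirs import user_cache_path

    cache = user_cache_path("conda-lock", appauthor=False) / "pypi-mapping"
    # e.g. ~/.cache/conda-lock/pypi-mapping on Linux
    # or ~/Library/Caches/conda-lock/pypi-mapping on macOS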
From f6d6ef8f0ab2b82a200398a3da355ebd1d5417e5 Mon Sep 17 00:00:00 2001
From: Ben Mares
Date: Sun, 27 Oct 2024 08:59:44 +0100
Subject: [PATCH 13/28] Add "cache" to the parent directory of cache directory
 as safety

This way we can easily verify that a directory really is a cache before
we start deleting files from it.
---
 conda_lock/lookup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conda_lock/lookup.py b/conda_lock/lookup.py
index e526e9209..dc4880b13 100644
--- a/conda_lock/lookup.py
+++ b/conda_lock/lookup.py
@@ -112,7 +112,7 @@ def cached_download_file(url: str) -> bytes:

     Protect against multiple processes downloading the same file.
     """
-    cache = user_cache_path("conda-lock", appauthor=False) / "pypi-mapping"
+    cache = user_cache_path("conda-lock", appauthor=False) / "cache" / "pypi-mapping"
     cache.mkdir(parents=True, exist_ok=True)

     # clear out old cache files

From 2886a3f7e3502ccd20ed21704d58bf09d2473083 Mon Sep 17 00:00:00 2001
From: Ben Mares
Date: Sun, 27 Oct 2024 09:11:52 +0100
Subject: [PATCH 14/28] Split cache clearing functionality into a separate
 function

---
 conda_lock/lookup.py | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/conda_lock/lookup.py b/conda_lock/lookup.py
index dc4880b13..d2db90c1a 100644
--- a/conda_lock/lookup.py
+++ b/conda_lock/lookup.py
@@ -114,14 +114,7 @@ def cached_download_file(url: str) -> bytes:
     """
     cache = user_cache_path("conda-lock", appauthor=False) / "cache" / "pypi-mapping"
     cache.mkdir(parents=True, exist_ok=True)
-
-    # clear out old cache files
-    for file in cache.iterdir():
-        mtime = file.stat().st_mtime
-        age = time.time() - mtime
-        if age < 0 or age > CLEAR_CACHE_AFTER_SECONDS:
-            logger.debug("Removing old cache file %s", file)
-            file.unlink()
+    clear_old_files_from_cache(cache, max_age=CLEAR_CACHE_AFTER_SECONDS)

     destination_lock = (cache / cached_filename_for_url(url)).with_suffix(".lock")

@@ -176,3 +169,24 @@ def cached_filename_for_url(url: str) -> str:
     url_hash = hashlib.sha256(url.encode()).hexdigest()[:4]
     extension = "yaml" if url.endswith(".yaml") else "json"
     return f"{url_hash}.{extension}"
+
+
+def clear_old_files_from_cache(cache: Path, *, max_age: float) -> None:
+    """Remove files in the cache directory older than max_age seconds.
+
+    Also removes any files that somehow have a modification time in the future.
+
+    For safety, this raises an error if `cache` is not a subdirectory of
+    a directory named `"cache"`.
+    """
+    if not cache.parent.name == "cache":
+        raise ValueError(
+            f"Expected cache directory, got {cache}. Parent should be 'cache' ",
+            f"not '{cache.parent.name}'",
+        )
+    for file in cache.iterdir():
+        mtime = file.stat().st_mtime
+        age = time.time() - mtime
+        if age < 0 or age > max_age:
+            logger.debug("Removing old cache file %s", file)
+            file.unlink()
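With the guard from patch 14, a wrongly configured path fails loudly rather than silently deleting files. A sketch of both outcomes (the paths are hypothetical):

    from pathlib import Path

    # Accepted: the parent directory is named "cache"
    clear_old_files_from_cache(
        Path.home() / ".cache" / "conda-lock" / "cache" / "pypi-mapping",
        max_age=60 * 60 * 24 * 2,
    )

    # Rejected with ValueError: the parent is not named "cache"
    clear_old_files_from_cache(Path.home() / "documents", max_age=60 * 60 * 24 * 2)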
From 428a0972d8efac92dad2be8aacfc94d37d8d4c47 Mon Sep 17 00:00:00 2001
From: Ben Mares
Date: Sun, 27 Oct 2024 09:21:51 +0100
Subject: [PATCH 15/28] Update comments and docstrings

---
 conda_lock/lookup.py | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/conda_lock/lookup.py b/conda_lock/lookup.py
index d2db90c1a..22390b2e3 100644
--- a/conda_lock/lookup.py
+++ b/conda_lock/lookup.py
@@ -119,8 +119,8 @@ def cached_download_file(url: str) -> bytes:
     destination_lock = (cache / cached_filename_for_url(url)).with_suffix(".lock")

     # Wait for any other process to finish downloading the file.
-    # Use the ETag to avoid downloading the file if it hasn't changed.
-    # Otherwise, download the file and cache the contents and ETag.
+    # This way we can use the result from the current download without
+    # spawning multiple concurrent downloads.
     while True:
         try:
             with FileLock(destination_lock, timeout=5):
@@ -133,6 +133,12 @@ def cached_download_file(url: str) -> bytes:


 def download_to_or_read_from_cache(url: str, cache: Path) -> bytes:
+    """Download a file to the cache directory and return the contents.
+
+    If the file is newer than DONT_CHECK_IF_NEWER_THAN_SECONDS, then immediately
+    return the cached contents. Otherwise we pass the ETag from the last download
+    in the headers to avoid downloading the file if it hasn't changed remotely.
+    """
     destination = cache / cached_filename_for_url(url)
     destination_etag = destination.with_suffix(".etag")
     request_headers = {}
@@ -145,7 +151,9 @@ def download_to_or_read_from_cache(url: str, cache: Path) -> bytes:
             f"Using cached mapping {destination} without checking for updates"
         )
         return destination.read_bytes()
-    # Get the ETag from the last download, if it exists
+    # Add the ETag from the last download, if it exists, to the headers.
+    # The ETag is used to avoid downloading the file if it hasn't changed remotely.
+    # Otherwise, download the file and cache the contents and ETag.
     if destination_etag.is_file():
         old_etag = destination_etag.read_text().strip()
         request_headers["If-None-Match"] = old_etag
@@ -166,6 +174,20 @@ def download_to_or_read_from_cache(url: str, cache: Path) -> bytes:


 def cached_filename_for_url(url: str) -> str:
+    """Return a filename for a URL that is probably unique to the URL.
+
+    The filename is a 4-character hash of the URL, followed by the extension.
+    If the extension is not alphanumeric or too long, it is omitted.
+
+    >>> cached_filename_for_url("https://example.com/foo.json")
+    'a5d7.json'
+    >>> cached_filename_for_url("https://example.com/foo")
+    '5ea6'
+    >>> cached_filename_for_url("https://example.com/foo.bär")
+    '2191'
+    >>> cached_filename_for_url("https://example.com/foo.baaaaaar")
+    '1861'
+    """
     url_hash = hashlib.sha256(url.encode()).hexdigest()[:4]
     extension = "yaml" if url.endswith(".yaml") else "json"
     return f"{url_hash}.{extension}"

From e796ff7d84ca39f8b8d9762a2724ed6c211c9d74 Mon Sep 17 00:00:00 2001
From: Ben Mares
Date: Sun, 27 Oct 2024 09:28:33 +0100
Subject: [PATCH 16/28] Handle URL file extensions more robustly

---
 conda_lock/lookup.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/conda_lock/lookup.py b/conda_lock/lookup.py
index 22390b2e3..ef391ca20 100644
--- a/conda_lock/lookup.py
+++ b/conda_lock/lookup.py
@@ -1,6 +1,7 @@
 import hashlib
 import json
 import logging
+import re
 import time

 from functools import lru_cache
@@ -189,8 +190,11 @@ def cached_filename_for_url(url: str) -> str:
     '1861'
     """
     url_hash = hashlib.sha256(url.encode()).hexdigest()[:4]
-    extension = "yaml" if url.endswith(".yaml") else "json"
-    return f"{url_hash}.{extension}"
+    extension = url.split(".")[-1]
+    if len(extension) <= 6 and re.match("^[a-zA-Z0-9]+$", extension):
+        return f"{url_hash}.{extension}"
+    else:
+        return f"{url_hash}"
From 2e4ff24efa468416ef483e7699d2b183f45ee49d Mon Sep 17 00:00:00 2001
From: Ben Mares
Date: Sun, 27 Oct 2024 09:42:32 +0100
Subject: [PATCH 17/28] Verify that age is nonnegative before using cached
 value

---
 conda_lock/lookup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conda_lock/lookup.py b/conda_lock/lookup.py
index ef391ca20..84b6f4dbc 100644
--- a/conda_lock/lookup.py
+++ b/conda_lock/lookup.py
@@ -147,7 +147,7 @@ def download_to_or_read_from_cache(url: str, cache: Path) -> bytes:
     if destination.is_file():
         mtime = destination.stat().st_mtime
         age = time.time() - mtime
-        if age < DONT_CHECK_IF_NEWER_THAN_SECONDS:
+        if 0 <= age <= DONT_CHECK_IF_NEWER_THAN_SECONDS:
             logger.debug(
                 f"Using cached mapping {destination} without checking for updates"
             )

From 2b9710295586be4629f8a4b74eceb248de4f86a4 Mon Sep 17 00:00:00 2001
From: Ben Mares
Date: Sun, 27 Oct 2024 09:48:12 +0100
Subject: [PATCH 18/28] Refactor _download_to_or_read_from_cache

---
 conda_lock/lookup.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/conda_lock/lookup.py b/conda_lock/lookup.py
index 84b6f4dbc..0b2cd0318 100644
--- a/conda_lock/lookup.py
+++ b/conda_lock/lookup.py
@@ -125,7 +125,11 @@ def cached_download_file(url: str) -> bytes:
     while True:
         try:
             with FileLock(destination_lock, timeout=5):
-                return download_to_or_read_from_cache(url, cache)
+                return _download_to_or_read_from_cache(
+                    url,
+                    cache=cache,
+                    dont_check_if_newer_than_seconds=DONT_CHECK_IF_NEWER_THAN_SECONDS,
+                )
         except Timeout:
             logger.warning(
                 f"Failed to acquire lock on {destination_lock}, it is likely "
@@ -133,10 +137,15 @@ def cached_download_file(url: str) -> bytes:
             )


-def download_to_or_read_from_cache(url: str, cache: Path) -> bytes:
+def _download_to_or_read_from_cache(
+    url: str, *, cache: Path, dont_check_if_newer_than_seconds: float
+) -> bytes:
     """Download a file to the cache directory and return the contents.

-    If the file is newer than DONT_CHECK_IF_NEWER_THAN_SECONDS, then immediately
+    This function is designed to be called from within a FileLock context to avoid
+    multiple processes downloading the same file.
+
+    If the file is newer than `dont_check_if_newer_than_seconds`, then immediately
     return the cached contents. Otherwise we pass the ETag from the last download
     in the headers to avoid downloading the file if it hasn't changed remotely.
     """
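The lock dance that `_download_to_or_read_from_cache` now assumes is worth seeing on its own. A reduced sketch of the acquire-or-retry loop (the lock path is hypothetical):

    from filelock import FileLock, Timeout

    lock_path = "/tmp/example.lock"  # hypothetical
    while True:
        try:
            with FileLock(lock_path, timeout=5):
                # Critical section: exactly one process downloads; the others
                # block here and then read the freshly written cache file.
                break
        except Timeout:
            # Another process holds the lock; warn and try again.
            continue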
""" - cache = user_cache_path("conda-lock", appauthor=False) / "cache" / "pypi-mapping" + cache = user_cache_path("conda-lock", appauthor=False) / "cache" / cache_subdir_name cache.mkdir(parents=True, exist_ok=True) - clear_old_files_from_cache(cache, max_age=CLEAR_CACHE_AFTER_SECONDS) + clear_old_files_from_cache(cache, max_age_seconds=max_age_seconds) destination_lock = (cache / cached_filename_for_url(url)).with_suffix(".lock") @@ -128,7 +134,7 @@ def cached_download_file(url: str) -> bytes: return _download_to_or_read_from_cache( url, cache=cache, - dont_check_if_newer_than_seconds=DONT_CHECK_IF_NEWER_THAN_SECONDS, + dont_check_if_newer_than_seconds=dont_check_if_newer_than_seconds, ) except Timeout: logger.warning( @@ -206,8 +212,8 @@ def cached_filename_for_url(url: str) -> str: return f"{url_hash}" -def clear_old_files_from_cache(cache: Path, *, max_age: float) -> None: - """Remove files in the cache directory older than max_age seconds. +def clear_old_files_from_cache(cache: Path, *, max_age_seconds: float) -> None: + """Remove files in the cache directory older than `max_age_seconds`. Also removes any files that somehow have a modification time in the future. @@ -222,6 +228,6 @@ def clear_old_files_from_cache(cache: Path, *, max_age: float) -> None: for file in cache.iterdir(): mtime = file.stat().st_mtime age = time.time() - mtime - if age < 0 or age > max_age: + if age < 0 or age > max_age_seconds: logger.debug("Removing old cache file %s", file) file.unlink() From 8d3041052629ea0c86b51d4fbbb90a42b2cb16ad Mon Sep 17 00:00:00 2001 From: Ben Mares Date: Sun, 27 Oct 2024 10:01:05 +0100 Subject: [PATCH 20/28] Move caching-related functionality to a separate module --- conda_lock/lookup.py | 143 +---------------------------------- conda_lock/lookup_cache.py | 150 +++++++++++++++++++++++++++++++++++++ 2 files changed, 152 insertions(+), 141 deletions(-) create mode 100644 conda_lock/lookup_cache.py diff --git a/conda_lock/lookup.py b/conda_lock/lookup.py index 2a3525ed0..81746f162 100644 --- a/conda_lock/lookup.py +++ b/conda_lock/lookup.py @@ -1,31 +1,21 @@ -import hashlib import json import logging -import re import time from functools import lru_cache from pathlib import Path from typing import Dict, TypedDict -import requests - -from filelock import FileLock, Timeout from packaging.utils import NormalizedName from packaging.utils import canonicalize_name as canonicalize_pypi_name -from platformdirs import user_cache_path + +from conda_lock.lookup_cache import cached_download_file logger = logging.getLogger(__name__) DEFAULT_MAPPING_URL = "https://raw.githubusercontent.com/regro/cf-graph-countyfair/master/mappings/pypi/grayskull_pypi_mapping.json" -CLEAR_CACHE_AFTER_SECONDS = 60 * 60 * 24 * 2 # 2 days -"""Cached files older than this will be deleted.""" - -DONT_CHECK_IF_NEWER_THAN_SECONDS = 60 * 5 # 5 minutes -"""If the cached file is newer than this, just use it without checking for updates.""" - class MappingEntry(TypedDict): conda_name: str @@ -102,132 +92,3 @@ def conda_name_to_pypi_name(name: str, mapping_url: str) -> NormalizedName: lookup = _get_conda_lookup(mapping_url=mapping_url) cname = canonicalize_pypi_name(name) return lookup.get(cname, {"pypi_name": cname})["pypi_name"] - - -def cached_download_file( - url: str, - *, - cache_subdir_name: str, - max_age_seconds: float = CLEAR_CACHE_AFTER_SECONDS, - dont_check_if_newer_than_seconds: float = DONT_CHECK_IF_NEWER_THAN_SECONDS, -) -> bytes: - """Download a file and cache it in the user cache directory. 
- - If the file is already cached, return the cached contents. - If the file is not cached, download it and cache the contents - and the ETag. - - Protect against multiple processes downloading the same file. - """ - cache = user_cache_path("conda-lock", appauthor=False) / "cache" / cache_subdir_name - cache.mkdir(parents=True, exist_ok=True) - clear_old_files_from_cache(cache, max_age_seconds=max_age_seconds) - - destination_lock = (cache / cached_filename_for_url(url)).with_suffix(".lock") - - # Wait for any other process to finish downloading the file. - # This way we can use the result from the current download without - # spawning multiple concurrent downloads. - while True: - try: - with FileLock(destination_lock, timeout=5): - return _download_to_or_read_from_cache( - url, - cache=cache, - dont_check_if_newer_than_seconds=dont_check_if_newer_than_seconds, - ) - except Timeout: - logger.warning( - f"Failed to acquire lock on {destination_lock}, it is likely " - f"being downloaded by another process. Retrying..." - ) - - -def _download_to_or_read_from_cache( - url: str, *, cache: Path, dont_check_if_newer_than_seconds: float -) -> bytes: - """Download a file to the cache directory and return the contents. - - This function is designed to be called from within a FileLock context to avoid - multiple processes downloading the same file. - - If the file is newer than `dont_check_if_newer_than_seconds`, then immediately - return the cached contents. Otherwise we pass the ETag from the last download - in the headers to avoid downloading the file if it hasn't changed remotely. - """ - destination = cache / cached_filename_for_url(url) - destination_etag = destination.with_suffix(".etag") - request_headers = {} - # Return the contents immediately if the file is fresh - if destination.is_file(): - mtime = destination.stat().st_mtime - age = time.time() - mtime - if 0 <= age <= DONT_CHECK_IF_NEWER_THAN_SECONDS: - logger.debug( - f"Using cached mapping {destination} without checking for updates" - ) - return destination.read_bytes() - # Add the ETag from the last download, if it exists, to the headers. - # The ETag is used to avoid downloading the file if it hasn't changed remotely. - # Otherwise, download the file and cache the contents and ETag. - if destination_etag.is_file(): - old_etag = destination_etag.read_text().strip() - request_headers["If-None-Match"] = old_etag - # Download the file and cache the result. - logger.debug(f"Requesting {url}") - res = requests.get(url, headers=request_headers) - if res.status_code == 304: - logger.debug(f"{url} has not changed since last download, using {destination}") - else: - res.raise_for_status() - destination.write_bytes(res.content) - if "ETag" in res.headers: - destination_etag.write_text(res.headers["ETag"]) - else: - logger.warning("No ETag in response headers") - logger.debug(f"Downloaded {url} to {destination}") - return destination.read_bytes() - - -def cached_filename_for_url(url: str) -> str: - """Return a filename for a URL that is probably unique to the URL. - - The filename is a 4-character hash of the URL, followed by the extension. - If the extension is not alphanumeric or too long, it is omitted. 
- - >>> cached_filename_for_url("https://example.com/foo.json") - 'a5d7.json' - >>> cached_filename_for_url("https://example.com/foo") - '5ea6' - >>> cached_filename_for_url("https://example.com/foo.bär") - '2191' - >>> cached_filename_for_url("https://example.com/foo.baaaaaar") - '1861' - """ - url_hash = hashlib.sha256(url.encode()).hexdigest()[:4] - extension = url.split(".")[-1] - if len(extension) <= 6 and re.match("^[a-zA-Z0-9]+$", extension): - return f"{url_hash}.{extension}" - else: - return f"{url_hash}" - - -def clear_old_files_from_cache(cache: Path, *, max_age_seconds: float) -> None: - """Remove files in the cache directory older than `max_age_seconds`. - - Also removes any files that somehow have a modification time in the future. - - For safety, this raises an error if `cache` is not a subdirectory of - a directory named `"cache"`. - """ - if not cache.parent.name == "cache": - raise ValueError( - f"Expected cache directory, got {cache}. Parent should be 'cache' ", - f"not '{cache.parent.name}'", - ) - for file in cache.iterdir(): - mtime = file.stat().st_mtime - age = time.time() - mtime - if age < 0 or age > max_age_seconds: - logger.debug("Removing old cache file %s", file) - file.unlink() diff --git a/conda_lock/lookup_cache.py b/conda_lock/lookup_cache.py new file mode 100644 index 000000000..36f25339f --- /dev/null +++ b/conda_lock/lookup_cache.py @@ -0,0 +1,150 @@ +import hashlib +import logging +import re +import time + +from pathlib import Path + +import requests + +from filelock import FileLock, Timeout +from platformdirs import user_cache_path + + +logger = logging.getLogger(__name__) + + +CLEAR_CACHE_AFTER_SECONDS = 60 * 60 * 24 * 2 # 2 days +"""Cached files older than this will be deleted.""" + +DONT_CHECK_IF_NEWER_THAN_SECONDS = 60 * 5 # 5 minutes +"""If the cached file is newer than this, just use it without checking for updates.""" + + +def cached_download_file( + url: str, + *, + cache_subdir_name: str, + max_age_seconds: float = CLEAR_CACHE_AFTER_SECONDS, + dont_check_if_newer_than_seconds: float = DONT_CHECK_IF_NEWER_THAN_SECONDS, +) -> bytes: + """Download a file and cache it in the user cache directory. + + If the file is already cached, return the cached contents. + If the file is not cached, download it and cache the contents + and the ETag. + + Protect against multiple processes downloading the same file. + """ + cache = user_cache_path("conda-lock", appauthor=False) / "cache" / cache_subdir_name + cache.mkdir(parents=True, exist_ok=True) + clear_old_files_from_cache(cache, max_age_seconds=max_age_seconds) + + destination_lock = (cache / cached_filename_for_url(url)).with_suffix(".lock") + + # Wait for any other process to finish downloading the file. + # This way we can use the result from the current download without + # spawning multiple concurrent downloads. + while True: + try: + with FileLock(destination_lock, timeout=5): + return _download_to_or_read_from_cache( + url, + cache=cache, + dont_check_if_newer_than_seconds=dont_check_if_newer_than_seconds, + ) + except Timeout: + logger.warning( + f"Failed to acquire lock on {destination_lock}, it is likely " + f"being downloaded by another process. Retrying..." + ) + + +def _download_to_or_read_from_cache( + url: str, *, cache: Path, dont_check_if_newer_than_seconds: float +) -> bytes: + """Download a file to the cache directory and return the contents. + + This function is designed to be called from within a FileLock context to avoid + multiple processes downloading the same file. 
+ + If the file is newer than `dont_check_if_newer_than_seconds`, then immediately + return the cached contents. Otherwise we pass the ETag from the last download + in the headers to avoid downloading the file if it hasn't changed remotely. + """ + destination = cache / cached_filename_for_url(url) + destination_etag = destination.with_suffix(".etag") + request_headers = {} + # Return the contents immediately if the file is fresh + if destination.is_file(): + mtime = destination.stat().st_mtime + age = time.time() - mtime + if 0 <= age <= DONT_CHECK_IF_NEWER_THAN_SECONDS: + logger.debug( + f"Using cached mapping {destination} without checking for updates" + ) + return destination.read_bytes() + # Add the ETag from the last download, if it exists, to the headers. + # The ETag is used to avoid downloading the file if it hasn't changed remotely. + # Otherwise, download the file and cache the contents and ETag. + if destination_etag.is_file(): + old_etag = destination_etag.read_text().strip() + request_headers["If-None-Match"] = old_etag + # Download the file and cache the result. + logger.debug(f"Requesting {url}") + res = requests.get(url, headers=request_headers) + if res.status_code == 304: + logger.debug(f"{url} has not changed since last download, using {destination}") + else: + res.raise_for_status() + destination.write_bytes(res.content) + if "ETag" in res.headers: + destination_etag.write_text(res.headers["ETag"]) + else: + logger.warning("No ETag in response headers") + logger.debug(f"Downloaded {url} to {destination}") + return destination.read_bytes() + + +def cached_filename_for_url(url: str) -> str: + """Return a filename for a URL that is probably unique to the URL. + + The filename is a 4-character hash of the URL, followed by the extension. + If the extension is not alphanumeric or too long, it is omitted. + + >>> cached_filename_for_url("https://example.com/foo.json") + 'a5d7.json' + >>> cached_filename_for_url("https://example.com/foo") + '5ea6' + >>> cached_filename_for_url("https://example.com/foo.bär") + '2191' + >>> cached_filename_for_url("https://example.com/foo.baaaaaar") + '1861' + """ + url_hash = hashlib.sha256(url.encode()).hexdigest()[:4] + extension = url.split(".")[-1] + if len(extension) <= 6 and re.match("^[a-zA-Z0-9]+$", extension): + return f"{url_hash}.{extension}" + else: + return f"{url_hash}" + + +def clear_old_files_from_cache(cache: Path, *, max_age_seconds: float) -> None: + """Remove files in the cache directory older than `max_age_seconds`. + + Also removes any files that somehow have a modification time in the future. + + For safety, this raises an error if `cache` is not a subdirectory of + a directory named `"cache"`. + """ + if not cache.parent.name == "cache": + raise ValueError( + f"Expected cache directory, got {cache}. 
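After the move in patch 20, callers import the download helper from the new module. A minimal usage sketch (the URL and subdirectory name are illustrative):

    from conda_lock.lookup_cache import cached_download_file

    content = cached_download_file(
        "https://example.com/mapping.json",  # hypothetical URL
        cache_subdir_name="pypi-mapping",
    )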
From e7ad77e2d7466471641619bb956c2b547616f8eb Mon Sep 17 00:00:00 2001
From: Ben Mares
Date: Sun, 27 Oct 2024 20:00:13 +0100
Subject: [PATCH 21/28] Fix ignored dont_check_if_newer_than_seconds parameter

---
 conda_lock/lookup_cache.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conda_lock/lookup_cache.py b/conda_lock/lookup_cache.py
index 36f25339f..99d57d6c4 100644
--- a/conda_lock/lookup_cache.py
+++ b/conda_lock/lookup_cache.py
@@ -79,7 +79,7 @@ def _download_to_or_read_from_cache(
     if destination.is_file():
         mtime = destination.stat().st_mtime
         age = time.time() - mtime
-        if 0 <= age <= DONT_CHECK_IF_NEWER_THAN_SECONDS:
+        if 0 <= age <= dont_check_if_newer_than_seconds:
             logger.debug(
                 f"Using cached mapping {destination} without checking for updates"
             )

From de2d2ffa5329809640eab47a8ad8438ce7e71a18 Mon Sep 17 00:00:00 2001
From: Ben Mares
Date: Sun, 27 Oct 2024 10:03:09 +0100
Subject: [PATCH 22/28] Add uncached_download_file

---
 conda_lock/lookup_cache.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/conda_lock/lookup_cache.py b/conda_lock/lookup_cache.py
index 99d57d6c4..6aa594829 100644
--- a/conda_lock/lookup_cache.py
+++ b/conda_lock/lookup_cache.py
@@ -21,6 +21,13 @@
 """If the cached file is newer than this, just use it without checking for updates."""


+def uncached_download_file(url: str) -> bytes:
+    """The simple equivalent to cached_download_file."""
+    res = requests.get(url)
+    res.raise_for_status()
+    return res.content
+
+
 def cached_download_file(
     url: str,
     *,

From 1e7e8438e09fb8776c2312b5fc1b2504f0776e09 Mon Sep 17 00:00:00 2001
From: Ben Mares
Date: Sun, 27 Oct 2024 11:12:02 +0100
Subject: [PATCH 23/28] Replace time.time() with datetime.now().timestamp()

---
 conda_lock/lookup_cache.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/conda_lock/lookup_cache.py b/conda_lock/lookup_cache.py
index 6aa594829..72c02f5ac 100644
--- a/conda_lock/lookup_cache.py
+++ b/conda_lock/lookup_cache.py
@@ -1,8 +1,8 @@
 import hashlib
 import logging
 import re
-import time

+from datetime import datetime
 from pathlib import Path

 import requests
@@ -85,7 +85,7 @@ def _download_to_or_read_from_cache(
     # Return the contents immediately if the file is fresh
     if destination.is_file():
         mtime = destination.stat().st_mtime
-        age = time.time() - mtime
+        age = datetime.now().timestamp() - mtime
         if 0 <= age <= dont_check_if_newer_than_seconds:
             logger.debug(
                 f"Using cached mapping {destination} without checking for updates"
@@ -151,7 +151,7 @@ def clear_old_files_from_cache(cache: Path, *, max_age_seconds: float) -> None:
     for file in cache.iterdir():
         mtime = file.stat().st_mtime
-        age = time.time() - mtime
+        age = datetime.now().timestamp() - mtime
         if age < 0 or age > max_age_seconds:
             logger.debug("Removing old cache file %s", file)
             file.unlink()

From dd95d1f1ee72f93f94835028f9f37e1d45c38e23 Mon Sep 17 00:00:00 2001
From: Ben Mares
Date: Sun, 27 Oct 2024 19:57:45 +0100
Subject: [PATCH 24/28] Add User-Agent header

---
 conda_lock/lookup_cache.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/conda_lock/lookup_cache.py b/conda_lock/lookup_cache.py
index 72c02f5ac..6668561c6 100644
--- a/conda_lock/lookup_cache.py
+++ b/conda_lock/lookup_cache.py
@@ -23,7 +23,7 @@ def uncached_download_file(url: str) -> bytes:
     """The simple equivalent to cached_download_file."""
-    res = requests.get(url)
+    res = requests.get(url, headers={"User-Agent": "conda-lock"})
     res.raise_for_status()
     return res.content

@@ -81,7 +81,7 @@ def _download_to_or_read_from_cache(
     """
     destination = cache / cached_filename_for_url(url)
     destination_etag = destination.with_suffix(".etag")
-    request_headers = {}
+    request_headers = {"User-Agent": "conda-lock"}
     # Return the contents immediately if the file is fresh
     if destination.is_file():
         mtime = destination.stat().st_mtime

From 9998291144b808dab5a2e345947f818e7fdd6d54 Mon Sep 17 00:00:00 2001
From: Ben Mares
Date: Sun, 27 Oct 2024 20:10:38 +0100
Subject: [PATCH 25/28] Parameterize cache root

---
 conda_lock/lookup_cache.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/conda_lock/lookup_cache.py b/conda_lock/lookup_cache.py
index 6668561c6..3b22dd33e 100644
--- a/conda_lock/lookup_cache.py
+++ b/conda_lock/lookup_cache.py
@@ -4,6 +4,7 @@

 from datetime import datetime
 from pathlib import Path
+from typing import Optional

 import requests

@@ -32,6 +33,7 @@ def cached_download_file(
     url: str,
     *,
     cache_subdir_name: str,
+    cache_root: Optional[Path] = None,
     max_age_seconds: float = CLEAR_CACHE_AFTER_SECONDS,
     dont_check_if_newer_than_seconds: float = DONT_CHECK_IF_NEWER_THAN_SECONDS,
 ) -> bytes:
@@ -43,7 +45,9 @@ def cached_download_file(

     Protect against multiple processes downloading the same file.
     """
-    cache = user_cache_path("conda-lock", appauthor=False) / "cache" / cache_subdir_name
+    if cache_root is None:
+        cache_root = user_cache_path("conda-lock", appauthor=False)
+    cache = cache_root / "cache" / cache_subdir_name
     cache.mkdir(parents=True, exist_ok=True)
     clear_old_files_from_cache(cache, max_age_seconds=max_age_seconds)

@@ -54,7 +58,7 @@ def cached_download_file(
     # spawning multiple concurrent downloads.
     while True:
         try:
-            with FileLock(destination_lock, timeout=5):
+            with FileLock(str(destination_lock), timeout=5):
                 return _download_to_or_read_from_cache(
                     url,
                     cache=cache,
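Patch 25's `cache_root` parameter is what lets the test suite in patch 28 point the cache at a temporary directory. A sketch (the root path here is arbitrary):

    from pathlib import Path
    from conda_lock.lookup_cache import cached_download_file

    content = cached_download_file(
        "https://example.com/mapping.json",  # hypothetical URL
        cache_subdir_name="pypi-mapping",
        cache_root=Path("/tmp/scratch"),  # files land under /tmp/scratch/cache/pypi-mapping
    )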
+""" + def uncached_download_file(url: str) -> bytes: """The simple equivalent to cached_download_file.""" @@ -88,11 +94,11 @@ def _download_to_or_read_from_cache( request_headers = {"User-Agent": "conda-lock"} # Return the contents immediately if the file is fresh if destination.is_file(): - mtime = destination.stat().st_mtime - age = datetime.now().timestamp() - mtime - if 0 <= age <= dont_check_if_newer_than_seconds: + age_seconds = get_age_seconds(destination) + if 0 <= age_seconds < dont_check_if_newer_than_seconds: logger.debug( - f"Using cached mapping {destination} without checking for updates" + f"Using cached mapping {destination} of age {age_seconds}s " + f"without checking for updates" ) return destination.read_bytes() # Add the ETag from the last download, if it exists, to the headers. @@ -154,8 +160,20 @@ def clear_old_files_from_cache(cache: Path, *, max_age_seconds: float) -> None: f"not '{cache.parent.name}'", ) for file in cache.iterdir(): - mtime = file.stat().st_mtime - age = datetime.now().timestamp() - mtime - if age < 0 or age > max_age_seconds: + age_seconds = get_age_seconds(file) + if age_seconds < 0 or age_seconds >= max_age_seconds: logger.debug("Removing old cache file %s", file) file.unlink() + + +def get_age_seconds(path: Path) -> float: + """Return the age of a file in seconds. + + On Windows, the age of a new file is sometimes slightly negative, so we add a small + offset to ensure that the age is positive. + """ + raw_age = datetime.now().timestamp() - path.stat().st_mtime + if platform.system() == "Windows": + return raw_age + WINDOWS_TIME_EPSILON + else: + return raw_age From a334fc6aee8a2e7c13cc9ea638e240a0e6ce065f Mon Sep 17 00:00:00 2001 From: Ben Mares Date: Tue, 29 Oct 2024 08:47:40 +0100 Subject: [PATCH 27/28] Include age information when deleting cached file --- conda_lock/lookup_cache.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda_lock/lookup_cache.py b/conda_lock/lookup_cache.py index fa184747d..a5ee6fda4 100644 --- a/conda_lock/lookup_cache.py +++ b/conda_lock/lookup_cache.py @@ -162,7 +162,7 @@ def clear_old_files_from_cache(cache: Path, *, max_age_seconds: float) -> None: for file in cache.iterdir(): age_seconds = get_age_seconds(file) if age_seconds < 0 or age_seconds >= max_age_seconds: - logger.debug("Removing old cache file %s", file) + logger.debug(f"Removing old cache file {file} of age {age_seconds}s") file.unlink() From 5f3c2671c148531612e9ef53762477a062951138 Mon Sep 17 00:00:00 2001 From: Ben Mares Date: Sun, 27 Oct 2024 17:47:07 +0100 Subject: [PATCH 28/28] Add tests for lookup cache Add diagnostic info about queues --- tests/test_lookup_cache.py | 322 +++++++++++++++++++++++++++++++++++++ 1 file changed, 322 insertions(+) create mode 100644 tests/test_lookup_cache.py diff --git a/tests/test_lookup_cache.py b/tests/test_lookup_cache.py new file mode 100644 index 000000000..004aaf801 --- /dev/null +++ b/tests/test_lookup_cache.py @@ -0,0 +1,322 @@ +import os +import queue +import threading +import time + +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest +import requests + +from conda_lock.lookup import DEFAULT_MAPPING_URL +from conda_lock.lookup_cache import ( + cached_download_file, + clear_old_files_from_cache, + uncached_download_file, +) + + +@pytest.fixture +def mock_cache_dir(tmp_path): + cache_dir = tmp_path / "cache" / "test_cache" + cache_dir.mkdir(parents=True) + return cache_dir + + +@pytest.mark.parametrize("use_caching_function", [True, False]) +def 
From 5f3c2671c148531612e9ef53762477a062951138 Mon Sep 17 00:00:00 2001
From: Ben Mares
Date: Sun, 27 Oct 2024 17:47:07 +0100
Subject: [PATCH 28/28] Add tests for lookup cache

Add diagnostic info about queues
---
 tests/test_lookup_cache.py | 322 +++++++++++++++++++++++++++++++++++++
 1 file changed, 322 insertions(+)
 create mode 100644 tests/test_lookup_cache.py

diff --git a/tests/test_lookup_cache.py b/tests/test_lookup_cache.py
new file mode 100644
index 000000000..004aaf801
--- /dev/null
+++ b/tests/test_lookup_cache.py
@@ -0,0 +1,322 @@
+import os
+import queue
+import threading
+import time
+
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+import requests
+
+from conda_lock.lookup import DEFAULT_MAPPING_URL
+from conda_lock.lookup_cache import (
+    cached_download_file,
+    clear_old_files_from_cache,
+    uncached_download_file,
+)
+
+
+@pytest.fixture
+def mock_cache_dir(tmp_path):
+    cache_dir = tmp_path / "cache" / "test_cache"
+    cache_dir.mkdir(parents=True)
+    return cache_dir
+
+
+@pytest.mark.parametrize("use_caching_function", [True, False])
+def test_download_file_uncached(tmp_path, use_caching_function):
+    with patch("requests.get") as mock_get:
+        mock_response = MagicMock()
+        mock_response.content = b"test content"
+        mock_get.return_value = mock_response
+
+        if use_caching_function:
+            result = cached_download_file(
+                "https://example.com/test",
+                cache_subdir_name="test_cache",
+                cache_root=tmp_path,
+            )
+        else:
+            result = uncached_download_file("https://example.com/test")
+
+        assert result == b"test content"
+        mock_get.assert_called_once_with(
+            "https://example.com/test", headers={"User-Agent": "conda-lock"}
+        )
+        mock_response.raise_for_status.assert_called_once()
+
+
+def test_clear_old_files_from_cache(mock_cache_dir):
+    """Verify that files older than the max age are removed."""
+    old_file = mock_cache_dir / "old_file.txt"
+    recent_file = mock_cache_dir / "recent_file.txt"
+    future_file = mock_cache_dir / "future_file.txt"
+
+    old_file.touch()
+    recent_file.touch()
+    future_file.touch()
+
+    # Set the modification and access times of each file
+    t = time.time()
+    os.utime(old_file, (t - 100, t - 100))
+    os.utime(recent_file, (t - 20, t - 20))
+    os.utime(future_file, (t + 100, t + 100))
+
+    clear_old_files_from_cache(mock_cache_dir, max_age_seconds=22)
+
+    # Only the recent file is in the correct time range
+    assert not old_file.exists()
+    assert recent_file.exists()
+    assert not future_file.exists()
+
+    # Immediately rerunning it again should not change anything
+    clear_old_files_from_cache(mock_cache_dir, max_age_seconds=22)
+    assert recent_file.exists()
+
+    # Lowering the max age should remove the file
+    clear_old_files_from_cache(mock_cache_dir, max_age_seconds=20)
+    assert not recent_file.exists()
+
+
+def test_clear_old_files_from_cache_invalid_directory(tmp_path):
+    """Verify that only paths within a 'cache' directory are accepted.
+
+    This is a safety measure to prevent accidental deletion of files
+    outside of a cache directory.
+    """
+    valid_cache_dir = tmp_path / "cache" / "valid"
+    invalid_cache_dir = tmp_path / "not-cache" / "invalid"
+
+    valid_cache_dir.mkdir(parents=True)
+    invalid_cache_dir.mkdir(parents=True)
+    clear_old_files_from_cache(valid_cache_dir, max_age_seconds=10)
+    with pytest.raises(ValueError):
+        clear_old_files_from_cache(Path(invalid_cache_dir), max_age_seconds=10)
+
+
+def test_cached_download_file(tmp_path):
+    """Simulate an interaction with a remote server to test the cache.
+
+    * Download the file for the first time
+    * Retrieve the file again immediately (should be cached without sending a request)
+    * Retrieve the file again twice more but check whether the remote file has been
+      updated (should get 304 Not Modified and return the cached version)
+    * Retrieve the file again but check whether the remote file has been updated
+      (should get 200 OK and return the updated version)
+    * Retrieve the file again immediately (should be cached without sending a request)
+    """
+    url = "https://example.com/test.json"
+    with patch("requests.get") as mock_get:
+        mock_response = MagicMock()
+        mock_response.content = b"previous content"
+        mock_response.status_code = 200
+        mock_response.headers = {"ETag": "previous-etag"}
+        mock_get.return_value = mock_response
+
+        # Warm the cache
+        result = cached_download_file(
+            url, cache_subdir_name="test_cache", cache_root=tmp_path
+        )
+        assert result == b"previous content"
+        assert mock_get.call_count == 1
+        # No ETag should have been sent because we downloaded for the first time
+        assert mock_get.call_args[1]["headers"].get("If-None-Match") is None
+
+        # Calling again immediately should directly return the cached result
+        # without sending a new request
+        result = cached_download_file(
+            url, cache_subdir_name="test_cache", cache_root=tmp_path
+        )
+        assert result == b"previous content"
+        assert mock_get.call_count == 1
+
+    # Now we test HTTP 304 Not Modified
+    # We trigger a request by setting dont_check_if_newer_than_seconds to 0
+    with patch("requests.get") as mock_get:
+        mock_response = MagicMock()
+        mock_response.content = b"Should be ignored"
+        mock_response.status_code = 304
+        mock_response.headers = {"ETag": "Should be ignored"}
+        mock_get.return_value = mock_response
+
+        for call_count in range(1, 2 + 1):
+            # This time we should send the ETag and get a 304
+            result = cached_download_file(
+                url,
+                cache_subdir_name="test_cache",
+                cache_root=tmp_path,
+                dont_check_if_newer_than_seconds=0,
+            )
+            assert result == b"previous content"
+            assert mock_get.call_count == call_count
+            assert (
+                mock_get.call_args[1]["headers"].get("If-None-Match") == "previous-etag"
+            )
+
+    # Now we test HTTP 200 OK with a new ETag to simulate the remote file being updated
+    with patch("requests.get") as mock_get:
+        mock_response = MagicMock()
+        mock_response.content = b"new content"
+        mock_response.status_code = 200
+        mock_response.headers = {"ETag": "new-etag"}
+        mock_get.return_value = mock_response
+
+        result = cached_download_file(
+            url,
+            cache_subdir_name="test_cache",
+            cache_root=tmp_path,
+            dont_check_if_newer_than_seconds=0,
+        )
+        assert result == b"new content"
+        assert mock_get.call_count == 1
+        assert mock_get.call_args[1]["headers"].get("If-None-Match") == "previous-etag"
+
+    # Verify that we picked up the new content and sent the new ETag
+    with patch("requests.get") as mock_get:
+        mock_response = MagicMock()
+        mock_response.content = b"Should be ignored"
+        mock_response.status_code = 304
+        mock_response.headers = {"ETag": "Should be ignored"}
+        mock_get.return_value = mock_response
+
+        result = cached_download_file(
+            url,
+            cache_subdir_name="test_cache",
+            cache_root=tmp_path,
+            dont_check_if_newer_than_seconds=0,
+        )
+        assert result == b"new content"
+        assert mock_get.call_count == 1
+        assert mock_get.call_args[1]["headers"].get("If-None-Match") == "new-etag"
+
+        # Verify that we return the updated content without sending a new request
+        result = cached_download_file(
+            url,
+            cache_subdir_name="test_cache",
+            cache_root=tmp_path,
+        )
+        assert result == b"new content"
+        assert mock_get.call_count == 1
+
+
+def test_download_mapping_file(tmp_path):
+    """Verify that we can download the actual mapping file and that it is cached."""
+    url = DEFAULT_MAPPING_URL
+    from requests import get as requests_get
+
+    responses: list[requests.Response] = []
+
+    def wrapped_get(*args, **kwargs):
+        """Wrap requests.get to capture the response."""
+        response = requests_get(*args, **kwargs)
+        responses.append(response)
+        return response
+
+    # Initial download and cache
+    with patch("requests.get", wraps=wrapped_get) as mock_get:
+        result = cached_download_file(
+            url, cache_subdir_name="test_cache", cache_root=tmp_path
+        )
+    # Ensure the response is valid and content is as expected
+    assert len(responses) == 1
+    response = responses[0]
+    assert response.status_code == 200
+    assert len(response.content) > 10000
+    assert response.content == result
+
+    # Verify that the file is retrieved from cache
+    with patch("requests.get", wraps=wrapped_get) as mock_get:
+        result2 = cached_download_file(
+            url, cache_subdir_name="test_cache", cache_root=tmp_path
+        )
+    mock_get.assert_not_called()
+    assert result == result2
+
+    # Force cache refresh and verify ETag handling
+    with patch("requests.get", wraps=wrapped_get) as mock_get:
+        result3 = cached_download_file(
+            url,
+            cache_subdir_name="test_cache",
+            cache_root=tmp_path,
+            dont_check_if_newer_than_seconds=0,
+        )
+    # Ensure the request is made and the response is 304 Not Modified
+    assert len(responses) == 2
+    response = responses[1]
+    assert response is not None
+    mock_get.assert_called_once()
+    assert response.status_code == 304
+    assert len(response.content) == 0
+    assert result == result2 == result3
+
+
+def test_concurrent_cached_download_file(tmp_path):
+    """Test concurrent access to cached_download_file with 5 threads."""
+    url = "https://example.com/test.json"
+    results: queue.Queue[bytes] = queue.Queue()
+    thread_names_emitting_lock_warnings: queue.Queue[str] = queue.Queue()
+    thread_names_calling_requests_get: queue.Queue[str] = queue.Queue()
+
+    def mock_get(*args, **kwargs):
+        time.sleep(5.2)
+        response = MagicMock()
+        response.content = b"content"
+        response.status_code = 200
+        thread_name = threading.current_thread().name
+        thread_names_calling_requests_get.put(thread_name)
+        return response
+
+    def download_file(result_queue):
+        """Download the file in a thread and store the result in a queue."""
+        import random
+
+        # Randomize which thread calls cached_download_file first
+        time.sleep(random.uniform(0, 0.1))
+        result = cached_download_file(
+            url, cache_subdir_name="test_cache", cache_root=tmp_path
+        )
+        result_queue.put(result)
+
+    with patch("requests.get", side_effect=mock_get) as mock_get, patch(
+        "conda_lock.lookup_cache.logger"
+    ) as mock_logger:
+        # Set up the logger to record which threads emit warnings
+        def mock_warning(msg, *args, **kwargs):
+            if "Failed to acquire lock" in msg:
+                thread_names_emitting_lock_warnings.put(
+                    threading.current_thread().name
+                )
+
+        mock_logger.warning.side_effect = mock_warning
+
+        # Create and start 5 threads
+        thread_names = [f"CachedDownloadFileThread-{i}" for i in range(5)]
+        threads = [
+            threading.Thread(target=download_file, args=(results,), name=thread_name)
+            for thread_name in thread_names
+        ]
+        for thread in threads:
+            thread.start()
+        for thread in threads:
+            thread.join()
+
+    # Collect results from the queue
+    assert results.qsize() == len(threads)
+    assert all(result == b"content" for result in results.queue)
+
+    # We expect one thread to have made the request and the other four
+    # to have emitted warnings.
+    assert (
+        thread_names_calling_requests_get.qsize()
+        == 1
+        == len(set(thread_names_calling_requests_get.queue))
+        == mock_get.call_count
+    ), f"{thread_names_calling_requests_get.queue=}"
+    assert (
+        thread_names_emitting_lock_warnings.qsize()
+        == 4
+        == len(set(thread_names_emitting_lock_warnings.queue))
+    ), f"{thread_names_emitting_lock_warnings.queue=}"
+    assert set(thread_names) == set(
+        thread_names_calling_requests_get.queue
+        + thread_names_emitting_lock_warnings.queue
+    )
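The mocking pattern used throughout these tests, reduced to its core (a standalone sketch of the technique, not an excerpt from the suite):

    from unittest.mock import MagicMock, patch

    with patch("requests.get") as mock_get:
        response = MagicMock()
        response.content = b"test content"
        response.status_code = 200
        response.headers = {"ETag": "some-etag"}
        mock_get.return_value = response
        # Any code that calls requests.get(...) inside this block receives
        # `response` instead of touching the network.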