Add new source hashing methods: content_sha256, content_sha384, content_sha512 #5277

Open · wants to merge 36 commits into base: main (showing changes from 33 of 36 commits)

Commits:
eac67a9
add content_sha256 hash checks
jaimergp Apr 12, 2024
af571af
fix algo id
jaimergp Apr 12, 2024
08d7691
pre-commit
jaimergp Apr 12, 2024
19235f1
extend tests and include path, type and executable bit in the hash
jaimergp Apr 13, 2024
704ba21
make it cross-platform
jaimergp Apr 15, 2024
0426db2
add news
jaimergp Apr 15, 2024
47fe18d
use dash separator
jaimergp Apr 15, 2024
ab810a4
update hashes
jaimergp Apr 15, 2024
91d3a4d
Merge branch 'main' into content-hash
jaimergp Jun 18, 2024
4e0f6dd
Update source.py
jaimergp Jun 18, 2024
002b309
Merge branch 'main' of github.com:conda/conda-build into content-hash
jaimergp Nov 19, 2024
1439e4e
change algorithm a bit and update tests
jaimergp Nov 19, 2024
4f4178b
move to Path.rglob() and allow skips
jaimergp Nov 20, 2024
190e120
register new keys
jaimergp Nov 20, 2024
4b3d56d
update recipe
jaimergp Nov 20, 2024
f513069
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 20, 2024
c409505
add docs
jaimergp Nov 20, 2024
5327e4a
pre-commit
jaimergp Nov 20, 2024
27b9eaf
normalize line endings
jaimergp Nov 20, 2024
73a23ae
prevent partial hash changes in hybrid text/binary files
jaimergp Nov 20, 2024
16260bf
sort by str, not Path
jaimergp Nov 23, 2024
b7f59ef
use separate git cache for this one
jaimergp Nov 24, 2024
c9b7e7b
override src_cache_root instead
jaimergp Nov 24, 2024
b7a4547
pre-commit
jaimergp Nov 24, 2024
50b219e
revert
jaimergp Nov 25, 2024
f60bcdd
force checkout
jaimergp Nov 25, 2024
ffcda69
try with constructor
jaimergp Nov 25, 2024
d434dce
stop force
jaimergp Nov 25, 2024
a950958
add `?` separator for unknown file types
jaimergp Nov 26, 2024
d97d081
pre-commit
jaimergp Nov 26, 2024
36f23c3
Merge branch 'main' of github.com:conda/conda-build into content-hash
jaimergp Nov 26, 2024
2afa293
drop content_{md5,sha1} and add content_{sha384,sha512}
jaimergp Nov 26, 2024
98d8813
add here too
jaimergp Nov 26, 2024
c192799
use a 10MB SpooledTemporaryFile
jaimergp Nov 27, 2024
5edfb20
pre-commit
jaimergp Nov 27, 2024
bcc7ad5
do error on unreadable files and unknown types
jaimergp Nov 27, 2024
4 changes: 4 additions & 0 deletions conda_build/metadata.py
@@ -570,6 +570,10 @@ def parse(data, config, path=None):
"sha256": None,
"sha384": None,
"sha512": None,
"content_sha256": None,
"content_sha384": None,
"content_sha512": None,
"content_hash_skip": list,
"path": str,
"path_via_symlink": None,
"git_url": str,
32 changes: 26 additions & 6 deletions conda_build/source.py
@@ -27,6 +27,7 @@
LoggingContext,
check_call_env,
check_output_env,
compute_content_hash,
convert_path_for_cygwin_or_msys2,
convert_unix_path_to_win,
copy_into,
@@ -46,9 +47,8 @@

git_submod_re = re.compile(r"(?:.+)\.(.+)\.(?:.+)\s(.+)")
ext_re = re.compile(r"(.*?)(\.(?:tar\.)?[^.]+)$")


ACCEPTED_HASH_TYPES = ("md5", "sha1", "sha224", "sha256", "sha384", "sha512")
CONTENT_HASH_KEYS = ("content_sha256", "content_sha384", "content_sha512")


def append_hash_to_fn(fn, hash_value):
@@ -78,9 +78,10 @@ def download_to_cache(cache_folder, recipe_path, source_dict, verbose=False):
break
else:
log.warning(
f"No hash {ACCEPTED_HASH_TYPES} provided for {unhashed_fn}. Source download forced. "
f"No hash {ACCEPTED_HASH_TYPES} provided for {unhashed_fn}. Source download forced. "
"Add hash to recipe to use source cache."
)

path = join(cache_folder, fn)
if isfile(path):
if verbose:
@@ -119,15 +120,15 @@ def download_to_cache(cache_folder, recipe_path, source_dict, verbose=False):
raise RuntimeError(f"Could not download {url}")

hashed = None

for hash_type in set(source_dict).intersection(ACCEPTED_HASH_TYPES):
if hash_type in source_dict:
expected_hash = source_dict[hash_type]
hashed = compute_sum(path, hash_type)
if expected_hash != hashed:
rm_rf(path)
raise RuntimeError(
f"{hash_type.upper()} mismatch: '{hashed}' != '{expected_hash}'"
f"{hash_type.upper()} mismatch for {unhashed_fn}: "
f"obtained '{hashed}' != expected '{expected_hash}'"
)

# this is really a fallback. If people don't provide the hash, we still need to prevent
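For context, this archive-level check simply streams the downloaded file through hashlib and compares hexdigests. A minimal sketch of the equivalent logic (the helper name is hypothetical, not the actual compute_sum implementation):

    import hashlib

    def stream_digest(path: str, algorithm: str, chunk_size: int = 65536) -> str:
        # Hypothetical stand-in for compute_sum(path, hash_type):
        # hash the file in chunks so large archives never sit fully in memory.
        hasher = hashlib.new(algorithm)
        with open(path, "rb") as fh:
            for chunk in iter(lambda: fh.read(chunk_size), b""):
                hasher.update(chunk)
        return hasher.hexdigest()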
@@ -1033,7 +1034,7 @@ def provide(metadata):
git = None

try:
for source_dict in metadata.get_section("source"):
for idx, source_dict in enumerate(metadata.get_section("source")):
folder = source_dict.get("folder")
src_dir = os.path.join(metadata.config.work_dir, folder if folder else "")
if any(k in source_dict for k in ("fn", "url")):
@@ -1112,6 +1113,25 @@ def provide(metadata):
if not isdir(src_dir):
os.makedirs(src_dir)

for hash_type in CONTENT_HASH_KEYS:
if hash_type in source_dict:
expected_content_hash = source_dict[hash_type]
if expected_content_hash in (None, ""):
raise ValueError(
f"Empty {hash_type} hash provided for source item #{idx}"
)
algorithm = hash_type[len("content_") :]
obtained_content_hash = compute_content_hash(
src_dir,
algorithm,
skip=ensure_list(source_dict.get("content_hash_skip") or ()),
)
if expected_content_hash != obtained_content_hash:
raise RuntimeError(
f"{hash_type} mismatch in source item #{idx}: "
f"obtained '{obtained_content_hash}' != "
f"expected '{expected_content_hash}'"
)
patches = ensure_list(source_dict.get("patches", []))
patch_attributes_output = []
for patch in patches:
96 changes: 95 additions & 1 deletion conda_build/utils.py
@@ -22,7 +22,7 @@
import urllib.request as urllib
from collections import OrderedDict, defaultdict
from collections.abc import Iterable
from functools import cache
from functools import cache, partial
from glob import glob
from io import StringIO
from itertools import filterfalse
@@ -1987,6 +1987,100 @@ def sha256_checksum(filename, buffersize=65536):
return sha256.hexdigest()


def compute_content_hash(
directory: str | Path, algorithm="sha256", skip: Iterable[str] = ()
) -> str:
"""
Given a directory, recursively scan all its contents (without following symlinks) and sort them
by their full path. For each entry in the contents table, compute the hash for the concatenated
bytes of:

- UTF-8 encoded path, relative to the input directory. Backslashes are normalized
to forward slashes before encoding.
- Then, depending on the type:
- For regular files, the UTF-8 bytes of an `F` separator, followed by:
- UTF-8 bytes of the line-ending normalized text (`\r\n` to `\n`), if the file is text.
- The raw bytes of the file contents, if binary.
- Note: If the file can't be opened or read, no contents are hashed;
it's treated as empty.
- For a directory, the UTF-8 bytes of a `D` separator, and nothing else.
- For a symlink, the UTF-8 bytes of an `L` separator, followed by the UTF-8 encoded bytes
for the path it points to. Backslashes MUST be normalized to forward slashes before
encoding.
- For any other types, the UTF-8 bytes of a `?` separator, and nothing else.
- UTF-8 encoded bytes of the string `-`, as separator.

Parameters
----------
directory: The path whose contents will be hashed
algorithm: Name of the algorithm to be used, as expected by `hashlib.new()`
skip: Iterable of paths that should not be checked. If a path ends with a slash, it is
interpreted as a directory whose contents won't be traversed. Matching happens against
the slash-normalized relative paths (i.e. backslashes replaced with forward slashes).

Returns
-------
str
The hexdigest of the computed hash, as described above.
"""
log = get_logger(__name__)
Review comment (Member): Let's move that to the top of the module per best practice.

Reply (Contributor, author): I was following the practice used in the other functions, FWIW.

Reply (Contributor, author): Ah, now I see why. The get_logger utility is defined in that module, so there's no top-level function to use. If anything, it would go at the bottom of the module? Do you prefer that, or shall we leave it in-function?

hasher = hashlib.new(algorithm)
for path in sorted(Path(directory).rglob("*"), key=str):
relpath = path.relative_to(directory)
relpathstr = str(relpath).replace("\\", "/")
if skip and any(
(
# Skip directories like .git/
skip_item.endswith("/")
and relpathstr.startswith(skip_item)
or f"{relpathstr}/" == skip_item
)
# Skip full relpath match
or relpathstr == skip_item
for skip_item in skip
):
continue
# encode the relative path to directory, for files, dirs and others
hasher.update(relpathstr.encode("utf-8"))
if path.is_symlink():
hasher.update(b"L")
hasher.update(str(path.readlink()).replace("\\", "/").encode("utf-8"))
elif path.is_dir():
hasher.update(b"D")
elif path.is_file():
hasher.update(b"F")
# We need to normalize line endings for Windows-Unix compat
# Attempt normalized line-by-line hashing (text mode). If
# Python fails to open in text mode, then it's binary and we hash
# the raw bytes directly.
try:
try:
Comment on lines +2054 to +2055

Review comment (Member): That second try block isn't needed, since we can catch multiple exception types in the same block.

Reply (Contributor, author): Sure, but I want to catch the potential OSError raised in the except UnicodeDecodeError arm. Will that raised exception be caught in the same try/except block? IOW, will this print "Hello!"? I don't think it does:

    try:
        raise ValueError
    except ValueError:
        raise RuntimeError
    except RuntimeError:
        print("Hello!")
lines = []
with open(path) as fh:
for line in fh:
# Accumulate all line-ending normalized lines first
# to make sure the whole file is read. This prevents
# partial updates to the hash with hybrid text/binary
# files (e.g. like the constructor shell installers).
lines.append(line.replace("\r\n", "\n"))
Review comment (Member): Hmm, that might be a memory hog. Depending on how big the files you're normalizing are, it might be best to write this to a temp file.

Reply (Contributor, author): Hm, good point. Didn't the stdlib have a temporary file object that only writes to disk after a certain size? 🤔

Reply (Contributor, author): Yea, SpooledTemporaryFile. Added it in c192799 (#5277).
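For illustration, a minimal sketch of that spooled approach (helper name hypothetical; the 10 MB threshold is assumed from the c192799 commit message). Normalized lines are buffered in memory up to max_size and spill to disk beyond that, so the whole file is still read before any bytes reach the hasher:

    import tempfile
    from functools import partial

    def update_hash_with_normalized_text(hasher, path, max_size=10 * 1024 * 1024):
        # Hypothetical helper: buffer line-ending-normalized text in a
        # SpooledTemporaryFile, then feed it to the hasher only once the
        # whole file has been read successfully.
        with tempfile.SpooledTemporaryFile(
            max_size=max_size, mode="w+", encoding="utf-8"
        ) as tmp:
            with open(path) as fh:  # raises UnicodeDecodeError for binary data
                for line in fh:
                    tmp.write(line.replace("\r\n", "\n"))
            tmp.seek(0)
            for chunk in iter(partial(tmp.read, 8192), ""):
                hasher.update(chunk.encode("utf-8"))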

for line in lines:
hasher.update(line.encode("utf-8"))
except UnicodeDecodeError:
# file must be binary, read the bytes directly
with open(path, "rb") as fh:
for chunk in iter(partial(fh.read, 8192), b""):
hasher.update(chunk)
except OSError as exc:
log.warning(
"Can't open file %s. Hashing path only...", path.name, exc_info=exc
)
else:
log.warning("Can't detect type for path %s. Hashing path only...", path)
hasher.update(b"?")
Review comment (Member): I'm not completely following this error-state handling. Why doesn't this stop the process, since it can't read the file? Wouldn't that indicate that the recipe is faulty?

Reply (Contributor, author): We don't know what kind of files a user will have in that directory. They might point `path` to something containing a device file or who knows what. Not really common practice, but that doesn't mean that their source is invalid or that we can't verify that the other contents are actually the same.

Reply (Contributor, author): I made this error out because it's essentially a file we can't verify, and we don't know what it might be hiding. If it causes errors, users can deliberately skip it via the `skip` parameter.

hasher.update(b"-")
return hasher.hexdigest()


def write_bat_activation_text(file_handle, m):
from .os_utils.external import find_executable

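As a worked example of the byte stream specified in the compute_content_hash docstring, consider a directory containing one subdirectory with one text file (the layout and digest comparison are illustrative only):

    import hashlib
    from pathlib import Path

    root = Path("example-src")
    (root / "pkg").mkdir(parents=True, exist_ok=True)
    (root / "pkg" / "a.txt").write_bytes(b"hello\r\nworld\r\n")

    hasher = hashlib.new("sha256")
    # Entries sorted by path: "pkg" (a directory), then "pkg/a.txt" (a file).
    hasher.update(b"pkg")             # slash-normalized relative path
    hasher.update(b"D")               # directory marker, no contents
    hasher.update(b"-")               # entry separator
    hasher.update(b"pkg/a.txt")       # slash-normalized relative path
    hasher.update(b"F")               # regular-file marker
    hasher.update(b"hello\nworld\n")  # text contents, line endings normalized
    hasher.update(b"-")               # entry separator

    # Should agree with the function above for this layout:
    # assert compute_content_hash(root, "sha256") == hasher.hexdigest()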
14 changes: 14 additions & 0 deletions docs/source/resources/define-metadata.rst
@@ -210,6 +210,20 @@ the repository. Using path allows you to build packages with
unstaged and uncommitted changes in the working directory.
git_url can build only up to the latest commit.

Hashes
------

Conda-build can check the integrity of the provided sources
using different hashing algorithms:

- ``md5``, ``sha1``, ``sha224``, ``sha256``, ``sha384`` and ``sha512``
will check the provided hexdigest against the downloaded archive,
prior to extraction.
- ``content_sha256``, ``content_sha384`` and ``content_sha512`` will
check the provided hexdigest against the contents of the
(extracted) directory. ``content_hash_skip`` can take a list of
relative files and directories to be ignored during the check
(e.g. useful to ignore the ``.git/`` directory when ``git_url``
is used to clone a repository).
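Recipe authors need a way to compute these digests in the first place; a minimal sketch, assuming the ``conda_build.utils.compute_content_hash`` API added in this PR and an illustrative source path:

    from conda_build.utils import compute_content_hash

    digest = compute_content_hash(
        "path/to/extracted/source",
        "sha256",
        skip=[".git/"],  # trailing slash skips the whole directory tree
    )
    print(digest)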

Patches
-------
20 changes: 20 additions & 0 deletions news/5277-content-hash
@@ -0,0 +1,20 @@
### Enhancements

* Add new hashing methods (`content_sha256`, `content_sha384`, `content_sha512`) to calculate the
checksum of the extracted contents of the downloaded source artifacts. (#4821 via #5277)

### Bug fixes

* <news item>

### Deprecations

* <news item>

### Docs

* <news item>

### Other

* <news item>
3 changes: 2 additions & 1 deletion tests/test-recipes/metadata/source_url/bld.bat
@@ -1,3 +1,4 @@
cd constructor-tar-gz
set PYTHONPATH=.
python -c "import conda_build; assert conda_build.__version__ == 'tag: 1.8.1'"
python -c "import constructor; assert constructor.__version__ == '3.0.0'"
if errorlevel 1 exit 1
3 changes: 2 additions & 1 deletion tests/test-recipes/metadata/source_url/build.sh
@@ -1,2 +1,3 @@
cd constructor-tar-gz
# Not sure how versioneer comes up with this version
PYTHONPATH=. python -c "import conda_build; assert conda_build.__version__ == 'tag: 1.8.1'"
PYTHONPATH=. python -c "import constructor; assert constructor.__version__ == '3.0.0'"
48 changes: 40 additions & 8 deletions tests/test-recipes/metadata/source_url/meta.yaml
@@ -3,14 +3,46 @@ package:
version: 1.0

source:
fn: conda-build-1.8.1.tar.gz
url: https://github.com/conda/conda-build/archive/1.8.1.tar.gz
md5: 0bf1f3598a659a0e8fb5ee6bbb3fd9fd
sha1: c464a8995ad6bbf0480abd2883876cc9b4913fa7
sha224: 96d76b37dcc8c28577ae5776e3aa3eb3b057af60983ce4005bbd3d61
sha256: f82b0bd5c809c9a7c7256c26364a0065e57732788b7a74c7ea2169135ed2f598
sha384: 5b407afd0c41028bd2443b13a1e34092053452db9e1872eed427f9a6042626837d16f0268568e8e54a07d173a3c80e6b
sha512: 6bf7d22fc65c111402d22fd27bbcf7ea36eec03efded33ece74a54bf30f0cb87b6f31b789276e990dd6e9e41ac78739eac8c6b2954144096ccf0bb2ec1cc4fd7
# Same code, but the tarball is generated by GH on the fly
- fn: constructor-3.0.0.tar.gz
folder: constructor-tar-gz
url: https://github.com/conda/constructor/archive/3.0.0.tar.gz
md5: 999ba62a678fd65d2be8c8c9160dff35
sha1: 0e6df9cfb04e99a8899cc1105d9c09e22811b146
sha224: 23d92c20ca12068181d9547dcfc5cbcec2bbb6e2b029d7aa5b91d6f2
sha256: a1932d36ac8ea0dcc3a0b7848a090aedc9247d4bcd75fa75e1771c2b2b01f9ff
sha384: d366de5e995a4ff6ad9266774e483efb91d9c291c0487c5cf0af055a7b48fd58af205c9455a5b2f654d92d7f3f39ef68
sha512: 33d2c8f8189f0fe8528bef0c32e62a3acd4362285e447680e7f0af16137df9ab45bf12b6928bdaaf99b5a53e71db4d385a0c1d91bdc0b2ad1d0b1a7bc6d790f1
content_sha256: a884ace5aa3a7e7f5a8b5adeb5cbfa7209f2ae88134d362c8bbca9c82ad2bb06
content_sha384: 3644cb7e55fb8f6d7328b19da3ec46be6af1e67291cc48948687cf9493d9b2caea3b5a637d1dfc1a19dd2893ddc38d27
content_sha512: 79a0c5edc29f979b599f0b694c3f0f07cc91e590c2c3fcb9c3f965767bf5a22fe634f0f142c626ef0859249d0242f3d8cc93922cf14e7ba527eedc3e8c8b354e
content_hash_skip:
- constructor/_version.py
# This is the same tarball but compressed differently. They should have the same content hashes!
- fn: constructor-3.0.0.zip
folder: constructor-zip
url: https://github.com/conda/constructor/archive/3.0.0.zip
md5: e4f6e0dd740e50fdb86bd5dae4d547c2
sha1: 75138e708ebfe818138dae23b6763890c1972d25
sha224: 2e79a3546798732f5f9463698cd99ae39c16eb37083b1ed50bb6cd12
sha256: 77406614899f5c2e21e2133a774b8470ba75a86e76dda799c2b39bcbce860955
sha384: e93d217376c86ab374be93c44fa03b05673e23de78033812a8f0620ce1ca6a4082fedd8b2599341ffd8dcfd201479ff4
sha512: 23e2ef512e43cb3b75637650901d5c86e0edc812a95fe85b19b45feddabe74bd72d6affac30b133c37a69046b3e27635a84107df5f64e403e1b21dc8f56ceedb
content_sha256: a884ace5aa3a7e7f5a8b5adeb5cbfa7209f2ae88134d362c8bbca9c82ad2bb06
content_sha384: 3644cb7e55fb8f6d7328b19da3ec46be6af1e67291cc48948687cf9493d9b2caea3b5a637d1dfc1a19dd2893ddc38d27
content_sha512: 79a0c5edc29f979b599f0b694c3f0f07cc91e590c2c3fcb9c3f965767bf5a22fe634f0f142c626ef0859249d0242f3d8cc93922cf14e7ba527eedc3e8c8b354e
content_hash_skip:
- constructor/_version.py
# This is the same tag as above, but cloned directly. They should have the same content hashes!
- folder: constructor-git
git_url: https://github.com/conda/constructor.git
git_rev: "3.0.0"
content_sha256: a884ace5aa3a7e7f5a8b5adeb5cbfa7209f2ae88134d362c8bbca9c82ad2bb06
content_sha384: 3644cb7e55fb8f6d7328b19da3ec46be6af1e67291cc48948687cf9493d9b2caea3b5a637d1dfc1a19dd2893ddc38d27
content_sha512: 79a0c5edc29f979b599f0b694c3f0f07cc91e590c2c3fcb9c3f965767bf5a22fe634f0f142c626ef0859249d0242f3d8cc93922cf14e7ba527eedc3e8c8b354e
content_hash_skip:
- .git/
- constructor/_version.py

requirements:
build:
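Since the recipe above expects all three deliveries of the same tag to hash identically, here is a hedged sketch of how one might cross-check that locally from the work directory (folder names taken from the recipe; skip list mirrors its ``content_hash_skip`` entries):

    from conda_build.utils import compute_content_hash

    digests = {
        folder: compute_content_hash(
            folder,
            "sha256",
            skip=[".git/", "constructor/_version.py"],
        )
        for folder in ("constructor-tar-gz", "constructor-zip", "constructor-git")
    }
    # All three sources should yield the same content digest.
    assert len(set(digests.values())) == 1, digests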