Add new source hashing methods: content_sha256, content_sha384, content_sha512 #5277

Open · wants to merge 36 commits into base: main (showing changes from 33 of 36 commits)

Commits:
eac67a9
add content_sha256 hash checks
jaimergp Apr 12, 2024
af571af
fix algo id
jaimergp Apr 12, 2024
08d7691
pre-commit
jaimergp Apr 12, 2024
19235f1
extend tests and include path, type and executable bit in the hash
jaimergp Apr 13, 2024
704ba21
make it cross-platform
jaimergp Apr 15, 2024
0426db2
add news
jaimergp Apr 15, 2024
47fe18d
use dash separator
jaimergp Apr 15, 2024
ab810a4
update hashes
jaimergp Apr 15, 2024
91d3a4d
Merge branch 'main' into content-hash
jaimergp Jun 18, 2024
4e0f6dd
Update source.py
jaimergp Jun 18, 2024
002b309
Merge branch 'main' of github.com:conda/conda-build into content-hash
jaimergp Nov 19, 2024
1439e4e
change algorithm a bit and update tests
jaimergp Nov 19, 2024
4f4178b
move to Path.rglob() and allow skips
jaimergp Nov 20, 2024
190e120
register new keys
jaimergp Nov 20, 2024
4b3d56d
update recipe
jaimergp Nov 20, 2024
f513069
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 20, 2024
c409505
add docs
jaimergp Nov 20, 2024
5327e4a
pre-commit
jaimergp Nov 20, 2024
27b9eaf
normalize line endings
jaimergp Nov 20, 2024
73a23ae
prevent partial hash changes in hybrid text/binary files
jaimergp Nov 20, 2024
16260bf
sort by str, not Path
jaimergp Nov 23, 2024
b7f59ef
use separate git cache for this one
jaimergp Nov 24, 2024
c9b7e7b
override src_cache_root instead
jaimergp Nov 24, 2024
b7a4547
pre-commit
jaimergp Nov 24, 2024
50b219e
revert
jaimergp Nov 25, 2024
f60bcdd
force checkout
jaimergp Nov 25, 2024
ffcda69
try with constructor
jaimergp Nov 25, 2024
d434dce
stop force
jaimergp Nov 25, 2024
a950958
add `?` separator for unknown file types
jaimergp Nov 26, 2024
d97d081
pre-commit
jaimergp Nov 26, 2024
36f23c3
Merge branch 'main' of github.com:conda/conda-build into content-hash
jaimergp Nov 26, 2024
2afa293
drop content_{md5,sha1} and add content_{sha384,sha512}
jaimergp Nov 26, 2024
98d8813
add here too
jaimergp Nov 26, 2024
c192799
use a 10MB SpooledTemporaryFile
jaimergp Nov 27, 2024
5edfb20
pre-commit
jaimergp Nov 27, 2024
bcc7ad5
do error on unreadable files and unknown types
jaimergp Nov 27, 2024
4 changes: 4 additions & 0 deletions conda_build/metadata.py
@@ -570,6 +570,10 @@ def parse(data, config, path=None):
"sha256": None,
"sha384": None,
"sha512": None,
"content_sha256": None,
"content_sha384": None,
"content_sha512": None,
"content_hash_skip": list,
"path": str,
"path_via_symlink": None,
"git_url": str,
32 changes: 26 additions & 6 deletions conda_build/source.py
@@ -27,6 +27,7 @@
LoggingContext,
check_call_env,
check_output_env,
compute_content_hash,
convert_path_for_cygwin_or_msys2,
convert_unix_path_to_win,
copy_into,
@@ -46,9 +47,8 @@

git_submod_re = re.compile(r"(?:.+)\.(.+)\.(?:.+)\s(.+)")
ext_re = re.compile(r"(.*?)(\.(?:tar\.)?[^.]+)$")


ACCEPTED_HASH_TYPES = ("md5", "sha1", "sha224", "sha256", "sha384", "sha512")
CONTENT_HASH_KEYS = ("content_sha256", "content_sha384", "content_sha512")


def append_hash_to_fn(fn, hash_value):
@@ -78,9 +78,10 @@ def download_to_cache(cache_folder, recipe_path, source_dict, verbose=False):
break
else:
log.warning(
f"No hash {ACCEPTED_HASH_TYPES} provided for {unhashed_fn}. Source download forced. "
f"No hash {ACCEPTED_HASH_TYPES} provided for {unhashed_fn}. Source download forced. "
"Add hash to recipe to use source cache."
)

path = join(cache_folder, fn)
if isfile(path):
if verbose:
@@ -119,15 +120,15 @@ def download_to_cache(cache_folder, recipe_path, source_dict, verbose=False):
raise RuntimeError(f"Could not download {url}")

hashed = None

for hash_type in set(source_dict).intersection(ACCEPTED_HASH_TYPES):
if hash_type in source_dict:
expected_hash = source_dict[hash_type]
hashed = compute_sum(path, hash_type)
if expected_hash != hashed:
rm_rf(path)
raise RuntimeError(
f"{hash_type.upper()} mismatch: '{hashed}' != '{expected_hash}'"
f"{hash_type.upper()} mismatch for {unhashed_fn}: "
f"obtained '{hashed}' != expected '{expected_hash}'"
)

# this is really a fallback. If people don't provide the hash, we still need to prevent
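For context, this archive-level check simply streams the downloaded file through hashlib and compares hexdigests. A minimal sketch of the equivalent logic (the helper name is hypothetical, not the actual compute_sum implementation):

    import hashlib

    def stream_digest(path: str, algorithm: str, chunk_size: int = 65536) -> str:
        # Hypothetical stand-in for compute_sum(path, hash_type):
        # hash the file in chunks so large archives never sit fully in memory.
        hasher = hashlib.new(algorithm)
        with open(path, "rb") as fh:
            for chunk in iter(lambda: fh.read(chunk_size), b""):
                hasher.update(chunk)
        return hasher.hexdigest()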
@@ -1033,7 +1034,7 @@ def provide(metadata):
git = None

try:
for source_dict in metadata.get_section("source"):
for idx, source_dict in enumerate(metadata.get_section("source")):
folder = source_dict.get("folder")
src_dir = os.path.join(metadata.config.work_dir, folder if folder else "")
if any(k in source_dict for k in ("fn", "url")):
@@ -1112,6 +1113,25 @@ def provide(metadata):
if not isdir(src_dir):
os.makedirs(src_dir)

for hash_type in CONTENT_HASH_KEYS:
if hash_type in source_dict:
expected_content_hash = source_dict[hash_type]
if expected_content_hash in (None, ""):
raise ValueError(
f"Empty {hash_type} hash provided for source item #{idx}"
)
algorithm = hash_type[len("content_") :]
obtained_content_hash = compute_content_hash(
src_dir,
algorithm,
skip=ensure_list(source_dict.get("content_hash_skip") or ()),
)
if expected_content_hash != obtained_content_hash:
raise RuntimeError(
f"{hash_type} mismatch in source item #{idx}: "
f"obtained '{obtained_content_hash}' != "
f"expected '{expected_content_hash}'"
)
patches = ensure_list(source_dict.get("patches", []))
patch_attributes_output = []
for patch in patches:
96 changes: 95 additions & 1 deletion conda_build/utils.py
@@ -22,7 +22,7 @@
import urllib.request as urllib
from collections import OrderedDict, defaultdict
from collections.abc import Iterable
from functools import cache
from functools import cache, partial
from glob import glob
from io import StringIO
from itertools import filterfalse
@@ -1987,6 +1987,100 @@ def sha256_checksum(filename, buffersize=65536):
return sha256.hexdigest()


def compute_content_hash(
directory: str | Path, algorithm="sha256", skip: Iterable[str] = ()
) -> str:
"""
Given a directory, recursively scan all its contents (without following symlinks) and sort them
by their full path. For each entry in the contents table, compute the hash for the concatenated
bytes of:

- UTF-8 encoded path, relative to the input directory. Backslashes are normalized
to forward slashes before encoding.
- Then, depending on the type:
- For regular files, the UTF-8 bytes of an `F` separator, followed by:
- UTF-8 bytes of the line-ending normalized text (`\r\n` to `\n`), if the file is text.
- The raw bytes of the file contents, if binary.
- Note: If the file can't be opened or read, no contents are hashed;
it's treated as empty.
- For a directory, the UTF-8 bytes of a `D` separator, and nothing else.
- For a symlink, the UTF-8 bytes of an `L` separator, followed by the UTF-8 encoded bytes
for the path it points to. Backslashes MUST be normalized to forward slashes before
encoding.
- For any other types, the UTF-8 bytes of a `?` separator, and nothing else.
- UTF-8 encoded bytes of the string `-`, as separator.

Parameters
----------
directory: The path whose contents will be hashed
algorithm: Name of the algorithm to be used, as expected by `hashlib.new()`
skip: Iterable of paths that should not be checked. If a path ends with a slash, it is
interpreted as a directory whose contents won't be traversed. Matching happens against
the slash-normalized relative paths (i.e. backslashes replaced with forward slashes).

Returns
-------
str
The hexdigest of the computed hash, as described above.
"""
log = get_logger(__name__)
Review comment (Member): Let's move that to the top of the module per best practice.

Reply (Contributor, author): I was following the practice used in the other functions, FWIW.

Reply (Contributor, author): Ah, now I see why. The get_logger utility is defined in that module, so there's no top-level function to use. If anything, it would go at the bottom of the module? Do you prefer that, or shall we leave it in-function?

hasher = hashlib.new(algorithm)
for path in sorted(Path(directory).rglob("*"), key=str):
relpath = path.relative_to(directory)
relpathstr = str(relpath).replace("\\", "/")
if skip and any(
(
# Skip directories like .git/
skip_item.endswith("/")
and relpathstr.startswith(skip_item)
or f"{relpathstr}/" == skip_item
)
# Skip full relpath match
or relpathstr == skip_item
for skip_item in skip
):
continue
# encode the relative path to directory, for files, dirs and others
hasher.update(relpathstr.encode("utf-8"))
if path.is_symlink():
hasher.update(b"L")
hasher.update(str(path.readlink()).replace("\\", "/").encode("utf-8"))
elif path.is_dir():
hasher.update(b"D")
elif path.is_file():
hasher.update(b"F")
# We need to normalize line endings for Windows-Unix compat
# Attempt normalized line-by-line hashing (text mode). If
# Python fails to open in text mode, then it's binary and we hash
# the raw bytes directly.
try:
try:
Comment on lines +2054 to +2055

Review comment (Member): That second try block isn't needed, since we can catch multiple exception types in the same block.

Reply (Contributor, author): Sure, but I want to catch the potential OSError raised in the except UnicodeDecodeError arm. Will that raised exception be caught in the same try/except block? IOW, will this print "Hello!"? I don't think it does:

    try:
        raise ValueError
    except ValueError:
        raise RuntimeError
    except RuntimeError:
        print("Hello!")
lines = []
with open(path) as fh:
for line in fh:
# Accumulate all line-ending normalized lines first
# to make sure the whole file is read. This prevents
# partial updates to the hash with hybrid text/binary
# files (e.g. like the constructor shell installers).
lines.append(line.replace("\r\n", "\n"))
Review comment (Member): Hmm, that might be a memory hog. Depending on how big the files you're normalizing are, it might be best to write this to a temp file.

Reply (Contributor, author): Hm, good point. Didn't the stdlib have a temporary file object that only writes to disk after a certain size? 🤔

Reply (Contributor, author): Yea, SpooledTemporaryFile. Added it in c192799 (#5277).
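For illustration, a minimal sketch of that spooled approach (helper name hypothetical; the 10 MB threshold is assumed from the c192799 commit message). Normalized lines are buffered in memory up to max_size and spill to disk beyond that, so the whole file is still read before any bytes reach the hasher:

    import tempfile
    from functools import partial

    def update_hash_with_normalized_text(hasher, path, max_size=10 * 1024 * 1024):
        # Hypothetical helper: buffer line-ending-normalized text in a
        # SpooledTemporaryFile, then feed it to the hasher only once the
        # whole file has been read successfully.
        with tempfile.SpooledTemporaryFile(
            max_size=max_size, mode="w+", encoding="utf-8"
        ) as tmp:
            with open(path) as fh:  # raises UnicodeDecodeError for binary data
                for line in fh:
                    tmp.write(line.replace("\r\n", "\n"))
            tmp.seek(0)
            for chunk in iter(partial(tmp.read, 8192), ""):
                hasher.update(chunk.encode("utf-8"))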

for line in lines:
hasher.update(line.encode("utf-8"))
except UnicodeDecodeError:
# file must be binary, read the bytes directly
with open(path, "rb") as fh:
for chunk in iter(partial(fh.read, 8192), b""):
hasher.update(chunk)
except OSError as exc:
log.warning(
"Can't open file %s. Hashing path only...", path.name, exc_info=exc
)
else:
log.warning("Can't detect type for path %s. Hashing path only...", path)
hasher.update(b"?")
Review comment (Member): I'm not completely following this error-state handling. Why doesn't this stop the process, since it can't read the file? Wouldn't that indicate that the recipe is faulty?

Reply (Contributor, author): We don't know what kind of files a user will have in that directory. They might point `path` to something containing a device file or who knows what. Not really common practice, but that doesn't mean that their source is invalid or that we can't verify that the other contents are actually the same.

Reply (Contributor, author): I made this error out because it's essentially a file we can't verify, and we don't know what it might be hiding. If it causes errors, users can deliberately skip it via the `skip` parameter.

hasher.update(b"-")
return hasher.hexdigest()


def write_bat_activation_text(file_handle, m):
from .os_utils.external import find_executable

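As a worked example of the byte stream specified in the compute_content_hash docstring, consider a directory containing one subdirectory with one text file (the layout and digest comparison are illustrative only):

    import hashlib
    from pathlib import Path

    root = Path("example-src")
    (root / "pkg").mkdir(parents=True, exist_ok=True)
    (root / "pkg" / "a.txt").write_bytes(b"hello\r\nworld\r\n")

    hasher = hashlib.new("sha256")
    # Entries sorted by path: "pkg" (a directory), then "pkg/a.txt" (a file).
    hasher.update(b"pkg")             # slash-normalized relative path
    hasher.update(b"D")               # directory marker, no contents
    hasher.update(b"-")               # entry separator
    hasher.update(b"pkg/a.txt")       # slash-normalized relative path
    hasher.update(b"F")               # regular-file marker
    hasher.update(b"hello\nworld\n")  # text contents, line endings normalized
    hasher.update(b"-")               # entry separator

    # Should agree with the function above for this layout:
    # assert compute_content_hash(root, "sha256") == hasher.hexdigest()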
14 changes: 14 additions & 0 deletions docs/source/resources/define-metadata.rst
@@ -210,6 +210,20 @@ the repository. Using path allows you to build packages with
unstaged and uncommitted changes in the working directory.
git_url can build only up to the latest commit.

Hashes
------

Conda-build can check the integrity of the provided sources
using different hashing algorithms:

- ``md5``, ``sha1``, ``sha224``, ``sha256``, ``sha384`` and ``sha512``
will check the provided hexdigest against the downloaded archive,
prior to extraction.
- ``content_sha256``, ``content_sha384`` and ``content_sha512`` will
check the provided hexdigest against the contents of the
(extracted) directory. ``content_hash_skip`` can take a list of
relative files and directories to be ignored during the check
(e.g. useful to ignore the ``.git/`` directory when ``git_url``
is used to clone a repository).
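Recipe authors need a way to compute these digests in the first place; a minimal sketch, assuming the ``conda_build.utils.compute_content_hash`` API added in this PR and an illustrative source path:

    from conda_build.utils import compute_content_hash

    digest = compute_content_hash(
        "path/to/extracted/source",
        "sha256",
        skip=[".git/"],  # trailing slash skips the whole directory tree
    )
    print(digest)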

Patches
-------
20 changes: 20 additions & 0 deletions news/5277-content-hash
@@ -0,0 +1,20 @@
### Enhancements

* Add new hashing methods (`content_sha256`, `content_sha384`, `content_sha512`) to calculate the
checksum of the extracted contents of the downloaded source artifacts. (#4821 via #5277)

### Bug fixes

* <news item>

### Deprecations

* <news item>

### Docs

* <news item>

### Other

* <news item>
3 changes: 2 additions & 1 deletion tests/test-recipes/metadata/source_url/bld.bat
@@ -1,3 +1,4 @@
cd constructor-tar-gz
set PYTHONPATH=.
python -c "import conda_build; assert conda_build.__version__ == 'tag: 1.8.1'"
python -c "import constructor; assert constructor.__version__ == '3.0.0'"
if errorlevel 1 exit 1
3 changes: 2 additions & 1 deletion tests/test-recipes/metadata/source_url/build.sh
@@ -1,2 +1,3 @@
cd constructor-tar-gz
# Not sure how versioneer comes up with this version
PYTHONPATH=. python -c "import conda_build; assert conda_build.__version__ == 'tag: 1.8.1'"
PYTHONPATH=. python -c "import constructor; assert constructor.__version__ == '3.0.0'"
48 changes: 40 additions & 8 deletions tests/test-recipes/metadata/source_url/meta.yaml
@@ -3,14 +3,46 @@ package:
version: 1.0

source:
fn: conda-build-1.8.1.tar.gz
url: https://github.com/conda/conda-build/archive/1.8.1.tar.gz
md5: 0bf1f3598a659a0e8fb5ee6bbb3fd9fd
sha1: c464a8995ad6bbf0480abd2883876cc9b4913fa7
sha224: 96d76b37dcc8c28577ae5776e3aa3eb3b057af60983ce4005bbd3d61
sha256: f82b0bd5c809c9a7c7256c26364a0065e57732788b7a74c7ea2169135ed2f598
sha384: 5b407afd0c41028bd2443b13a1e34092053452db9e1872eed427f9a6042626837d16f0268568e8e54a07d173a3c80e6b
sha512: 6bf7d22fc65c111402d22fd27bbcf7ea36eec03efded33ece74a54bf30f0cb87b6f31b789276e990dd6e9e41ac78739eac8c6b2954144096ccf0bb2ec1cc4fd7
# Same code, but the tarball is generated by GH on the fly
- fn: constructor-3.0.0.tar.gz
folder: constructor-tar-gz
url: https://github.com/conda/constructor/archive/3.0.0.tar.gz
md5: 999ba62a678fd65d2be8c8c9160dff35
sha1: 0e6df9cfb04e99a8899cc1105d9c09e22811b146
sha224: 23d92c20ca12068181d9547dcfc5cbcec2bbb6e2b029d7aa5b91d6f2
sha256: a1932d36ac8ea0dcc3a0b7848a090aedc9247d4bcd75fa75e1771c2b2b01f9ff
sha384: d366de5e995a4ff6ad9266774e483efb91d9c291c0487c5cf0af055a7b48fd58af205c9455a5b2f654d92d7f3f39ef68
sha512: 33d2c8f8189f0fe8528bef0c32e62a3acd4362285e447680e7f0af16137df9ab45bf12b6928bdaaf99b5a53e71db4d385a0c1d91bdc0b2ad1d0b1a7bc6d790f1
content_sha256: a884ace5aa3a7e7f5a8b5adeb5cbfa7209f2ae88134d362c8bbca9c82ad2bb06
content_sha384: 3644cb7e55fb8f6d7328b19da3ec46be6af1e67291cc48948687cf9493d9b2caea3b5a637d1dfc1a19dd2893ddc38d27
content_sha512: 79a0c5edc29f979b599f0b694c3f0f07cc91e590c2c3fcb9c3f965767bf5a22fe634f0f142c626ef0859249d0242f3d8cc93922cf14e7ba527eedc3e8c8b354e
content_hash_skip:
- constructor/_version.py
# This is the same tarball but compressed differently. They should have the same content hashes!
- fn: constructor-3.0.0.zip
folder: constructor-zip
url: https://github.com/conda/constructor/archive/3.0.0.zip
md5: e4f6e0dd740e50fdb86bd5dae4d547c2
sha1: 75138e708ebfe818138dae23b6763890c1972d25
sha224: 2e79a3546798732f5f9463698cd99ae39c16eb37083b1ed50bb6cd12
sha256: 77406614899f5c2e21e2133a774b8470ba75a86e76dda799c2b39bcbce860955
sha384: e93d217376c86ab374be93c44fa03b05673e23de78033812a8f0620ce1ca6a4082fedd8b2599341ffd8dcfd201479ff4
sha512: 23e2ef512e43cb3b75637650901d5c86e0edc812a95fe85b19b45feddabe74bd72d6affac30b133c37a69046b3e27635a84107df5f64e403e1b21dc8f56ceedb
content_sha256: a884ace5aa3a7e7f5a8b5adeb5cbfa7209f2ae88134d362c8bbca9c82ad2bb06
content_sha384: 3644cb7e55fb8f6d7328b19da3ec46be6af1e67291cc48948687cf9493d9b2caea3b5a637d1dfc1a19dd2893ddc38d27
content_sha512: 79a0c5edc29f979b599f0b694c3f0f07cc91e590c2c3fcb9c3f965767bf5a22fe634f0f142c626ef0859249d0242f3d8cc93922cf14e7ba527eedc3e8c8b354e
content_hash_skip:
- constructor/_version.py
# This is the same tag as above, but cloned directly. They should have the same content hashes!
- folder: constructor-git
git_url: https://github.com/conda/constructor.git
git_rev: "3.0.0"
content_sha256: a884ace5aa3a7e7f5a8b5adeb5cbfa7209f2ae88134d362c8bbca9c82ad2bb06
content_sha384: 3644cb7e55fb8f6d7328b19da3ec46be6af1e67291cc48948687cf9493d9b2caea3b5a637d1dfc1a19dd2893ddc38d27
content_sha512: 79a0c5edc29f979b599f0b694c3f0f07cc91e590c2c3fcb9c3f965767bf5a22fe634f0f142c626ef0859249d0242f3d8cc93922cf14e7ba527eedc3e8c8b354e
content_hash_skip:
- .git/
- constructor/_version.py

requirements:
build:
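Since the recipe above expects all three deliveries of the same tag to hash identically, here is a hedged sketch of how one might cross-check that locally from the work directory (folder names taken from the recipe; skip list mirrors its ``content_hash_skip`` entries):

    from conda_build.utils import compute_content_hash

    digests = {
        folder: compute_content_hash(
            folder,
            "sha256",
            skip=[".git/", "constructor/_version.py"],
        )
        for folder in ("constructor-tar-gz", "constructor-zip", "constructor-git")
    }
    # All three sources should yield the same content digest.
    assert len(set(digests.values())) == 1, digests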