diff --git a/.github/workflows/generate-plots.yml b/.github/workflows/generate-plots.yml index 1604c3eb673fe..e7dffbcb28180 100644 --- a/.github/workflows/generate-plots.yml +++ b/.github/workflows/generate-plots.yml @@ -12,7 +12,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: recursive @@ -34,6 +34,7 @@ jobs: export PYTHONPATH="$( pwd )/src" python -m build_plots.plot_cdf python -m build_plots.plot_versions + python -m build_plots.plot_platforms - name: Add changes, commit run: | diff --git a/.github/workflows/test-plots.yml b/.github/workflows/test-plots.yml index 07a2d436e64c2..b83be9f225726 100644 --- a/.github/workflows/test-plots.yml +++ b/.github/workflows/test-plots.yml @@ -10,7 +10,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: recursive @@ -29,3 +29,4 @@ jobs: export PYTHONPATH="$( pwd )/src" python -m build_plots.plot_cdf python -m build_plots.plot_versions 5 100 + python -m build_plots.plot_platforms 5 100 diff --git a/resources/platforms.vl.json b/resources/platforms.vl.json new file mode 100644 index 0000000000000..f813d238d37f3 --- /dev/null +++ b/resources/platforms.vl.json @@ -0,0 +1,38 @@ +{ + "$schema":"https://vega.github.io/schema/vega-lite/v5.json", + "title": "Downloads by Platform", + "description": "Downloads of platforms of a bioconda package.", + "data":{ + "values":[] + }, + "width":"container", + "mark": "bar", + "encoding":{ + "x":{ + "field":"date", + "type":"ordinal", + "timeUnit":"yearmonthdate", + "title":"date", + "axis":{ + "labelAngle":-15 + } + }, + "y":{ + "field":"delta", + "type":"quantitative", + "title":"downloads" + }, + "color":{ + "field":"subdir", + "title": "platform", + "type":"nominal", + "scale":{ + "scheme": "paired", + "domain": ["linux-64", "linux-aarch64", "osx-64", "osx-arm64", "noarch"] + } + }, + "tooltip":{ + "field":"delta" + } + } +} diff --git 
import fnmatch
import io
import os
import pandas as pd
from logging import INFO, basicConfig, getLogger


basicConfig(level=INFO)
logger = getLogger(__name__)

# Location of the per-package download statistics, relative to the root of the
# bioconda-stats checkout (and of every tagged tree inside it).
_STATS_ROOT = "package-downloads/anaconda.org/bioconda"


def _read_totals(source, field):
    """Parse one stats TSV (a path or a binary buffer) into a DataFrame.

    `field` is read as a string and the "total" column as an integer.
    """
    return pd.read_csv(
        source,
        dtype={field: str, "total": int},
        encoding="utf-8",
        sep="\t",
    )


def _collect_daily_deltas(tags, category, filename, field, days_to_plot):
    """Build one frame of download deltas per tagged day for a single package.

    The working-tree copy of the TSV is used as the newest snapshot and is
    dated with the name of the last tag; each older tag is then diffed against
    the snapshot that follows it.

    Returns a tuple ``(frames, values)`` where ``frames`` is a list of
    DataFrames (columns: `field`, "total", "delta", "date") and ``values`` is
    the set of every `field` value seen in any snapshot.
    """
    current = _read_totals(
        f"bioconda-stats/{_STATS_ROOT}/{category}/{filename}", field
    )
    values = set(current[field])
    frames = []

    # Never look back further than the tags that actually exist: a negative
    # index would silently wrap around to the newest tags and the "deltas"
    # would be computed against the wrong snapshots.
    for days_back in range(1, min(days_to_plot, len(tags))):
        newer_tag = tags[len(tags) - days_back]
        older_tag = tags[len(tags) - 1 - days_back]

        # Get a previous tagged version of the package stats tsv
        try:
            subtree = older_tag.commit.tree / f"{_STATS_ROOT}/{category}"
            blob = subtree / filename
        except KeyError:
            # The package has no stats at this tag; stop going further back.
            break

        logger.debug(
            "Found data for %s from date %s.", filename[:-4], older_tag.name
        )
        previous = _read_totals(io.BytesIO(blob.data_stream.read()), field)
        values |= set(previous[field])

        # Delta between the totals of two consecutive snapshots; a value that
        # is missing on one side counts as 0 there.
        delta = current.set_index(field).subtract(
            previous.set_index(field), fill_value=0
        )
        delta = delta.rename(columns={"total": "delta"})
        day = current.merge(delta, on=field)
        day["date"] = newer_tag.name
        frames.append(day)
        current = previous

    return frames, values


def _ordered_categories(values, category):
    """Return the `field` values to keep, in their plotting order."""
    if category == "versions":
        # Imported here because it is only needed for version sorting.
        from ._vendor.conda.models.version import VersionOrder

        # Get 7 most recent versions, sorting by VersionOrder
        return sorted(values, key=VersionOrder)[-7:]
    # Sort so the output does not depend on set iteration order.
    return sorted(values)


def _write_plot_data(package_df, categories, package, category, field):
    """Filter, sort and save the plot records to plots/<package>/<category>.json."""
    package_df[field] = pd.Categorical(
        package_df[field], ordered=True, categories=categories
    )
    # Rows whose value is not among `categories` became NaN above; drop them.
    kept = package_df[package_df[field].notna()]
    kept = kept.sort_values(by=[field, "date"])[["date", "total", "delta", field]]

    os.makedirs(f"plots/{package}", exist_ok=True)
    with open(f"plots/{package}/{category}.json", "w", encoding="utf-8") as out:
        out.write(kept.to_json(orient="records"))
    logger.debug("Saved data for %s to %s.json.", package, category)


def buildDailyPlot(category, field, max_packages, days_to_plot):
    """Write daily download-delta plot data for every package of a category.

    For each ``*.tsv`` file under
    ``bioconda-stats/package-downloads/anaconda.org/bioconda/<category>`` the
    totals of consecutive tagged snapshots are subtracted and the result is
    saved as JSON records to ``plots/<package>/<category>.json``.

    Args:
        category: Sub-directory of the stats tree, e.g. "versions" or
            "platforms"; also the base name of the JSON file written.
        field: Name of the TSV column that identifies a row, e.g. "version"
            or "subdir".
        max_packages: Stop after this many packages; a falsy value means
            no limit.
        days_to_plot: Upper bound (exclusive) on how many tags back to look,
            so at most ``days_to_plot - 1`` deltas are produced per package.

    Raises:
        RuntimeError: If the stats repository has no tags, or if processing
            failed for at least one package (the individual failures are
            logged and do not stop the remaining packages).
    """
    # Imported here so that importing this module does not require GitPython.
    from git import Repo

    os.makedirs("plots", exist_ok=True)

    # NOTE(review): the last tag is treated as the most recent one, which
    # assumes repo.tags is ordered oldest to newest - confirm for this repo.
    tags = list(Repo("bioconda-stats").tags)
    if not tags:
        raise RuntimeError("No tags found in the bioconda-stats repository.")

    stats_dir = f"bioconda-stats/{_STATS_ROOT}/{category}"
    package_count = 0
    error_count = 0
    # Sorted so that max_packages always selects the same packages.
    for filename in sorted(fnmatch.filter(os.listdir(stats_dir), "*.tsv")):
        package = filename[:-4]
        try:
            logger.debug("Loading data for package: %s", package)
            frames, values = _collect_daily_deltas(
                tags, category, filename, field, days_to_plot
            )
            if frames:
                package_df = pd.concat(frames, ignore_index=True)
                if len(package_df.index) > 0:
                    _write_plot_data(
                        package_df,
                        _ordered_categories(values, category),
                        package,
                        category,
                        field,
                    )
        except Exception:
            # Log package name (with traceback) and continue with the rest
            error_count += 1
            logger.exception("Error creating plot for %s.", package)

        package_count += 1
        if max_packages and package_count == max_packages:
            break

    if error_count > 0:
        raise RuntimeError(
            f"Errors occurred for {error_count} out of {package_count} packages."
        )
    logger.info(f"Completed {package_count} packages.")
prev_tagname = tags[len(tags) - 1].name - - # Get tags going back 15 days (or as specified in arg) - for days_back in range(1, days_to_plot): - if tagref is not None: - prev_tagname = tagref.name - tagref = tags[len(tags) - 1 - days_back] - subtree = ( - tagref.commit.tree - / "package-downloads/anaconda.org/bioconda/versions" - ) - - # Get a previous tagged version of the package stats tsv - try: - blob = subtree / filename - except KeyError: - # does not exist - break - - logger.debug(f"Found data for {package} from date {tagref.name}.") - new_df = pd.read_csv( - io.BytesIO(blob.data_stream.read()), - dtype={ "version": str, "total": int }, - encoding="utf-8", - sep="\t" - ) - # do a delta between totals of different dates - versions = versions | set(new_df["version"]) - df_sub = df.set_index("version").subtract( - new_df.set_index("version"), fill_value=0 - ) - df_sub.rename(columns={"total": "delta"}, inplace=True) - df = df.merge(df_sub, on="version") - df["date"] = prev_tagname - package_df = pd.concat([package_df, df], ignore_index=True) - df = new_df - - if len(package_df.index) > 0: - # Get 7 most recent versions, sorting by VersionOrder - version_list = sorted(list(versions), key=VersionOrder)[-7:] - package_df["version"] = pd.Categorical( - package_df["version"], ordered=True, categories=version_list - ) - package_df = package_df[package_df["version"].notna()].sort_values( - by=["version", "date"] - )[["date", "total", "delta", "version"]] - - # Save plot data - if not os.path.exists(f"plots/{package}"): - os.makedirs(f"plots/{package}") - with open(f"plots/{package}/versions.json", "w") as v: - v.writelines(package_df.to_json(orient="records")) - logger.debug(f"Saved data for {package} to versions.json.") - - except Exception as e: - # Log package name and continue with the rest - error_count += 1 - e.args = (f"Error creating plot for {package}.",) + e.args - logger.exception(e) - - finally: - package_count += 1 - if max_packages and package_count == 
max_packages: - break - -if error_count > 0: - raise RuntimeError( - f"Errors occurred for {error_count} out of {package_count} packages." - ) -else: - logger.info(f"Completed {package_count} packages.") +buildDailyPlot("versions", "version", max_packages, days_to_plot)