Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Downloads by Platform plots #8

Merged
merged 10 commits into from
Jul 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/generate-plots.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:

steps:
- name: Checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
submodules: recursive

Expand All @@ -34,6 +34,7 @@ jobs:
export PYTHONPATH="$( pwd )/src"
python -m build_plots.plot_cdf
python -m build_plots.plot_versions
python -m build_plots.plot_platforms

- name: Add changes, commit
run: |
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/test-plots.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ jobs:

steps:
- name: Checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
submodules: recursive

Expand All @@ -29,3 +29,4 @@ jobs:
export PYTHONPATH="$( pwd )/src"
python -m build_plots.plot_cdf
python -m build_plots.plot_versions 5 100
python -m build_plots.plot_platforms 5 100
38 changes: 38 additions & 0 deletions resources/platforms.vl.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{
"$schema":"https://vega.github.io/schema/vega-lite/v5.json",
"title": "Downloads by Platform",
"description": "Downloads of platforms of a bioconda package.",
"data":{
"values":[]
},
"width":"container",
"mark": "bar",
"encoding":{
"x":{
"field":"date",
"type":"ordinal",
"timeUnit":"yearmonthdate",
"title":"date",
"axis":{
"labelAngle":-15
}
},
"y":{
"field":"delta",
"type":"quantitative",
"title":"downloads"
},
"color":{
"field":"subdir",
"title": "platform",
"type":"nominal",
"scale":{
"scheme": "paired",
"domain": ["linux-64", "linux-aarch64", "osx-64", "osx-arm64", "noarch"]
}
},
"tooltip":{
"field":"delta"
}
}
}
1 change: 1 addition & 0 deletions resources/versions.vl.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
{
"$schema":"https://vega.github.io/schema/vega-lite/v5.json",
"title": "Downloads by Version",
"description": "Downloads of various versions of a bioconda package.",
"data":{
"values":[]
Expand Down
112 changes: 112 additions & 0 deletions src/build_plots/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import fnmatch
import io
import os
import pandas as pd
from git import Repo
from logging import INFO, basicConfig, getLogger
from ._vendor.conda.models.version import VersionOrder


# Configure logging at import time with the root level set to INFO. With this
# configuration the logger.debug(...) calls in this module are not emitted
# unless the level is lowered elsewhere.
basicConfig(level=INFO)
# Module-level logger named after this module.
logger = getLogger(__name__)


def buildDailyPlot(category, field, max_packages, days_to_plot):
    """Write daily download deltas per package to ``plots/<package>/<category>.json``.

    For every ``*.tsv`` file found in the ``category`` directory of the
    ``bioconda-stats`` checkout, the totals in the working tree are compared
    with the totals stored under successively older git tags. Each comparison
    yields one row per ``field`` value holding the ``total`` and the ``delta``
    against the next older tag, labelled with the name of the newer tag as
    ``date``. (Assumes ``repo.tags`` is ordered oldest to newest and that the
    working tree matches the newest tag -- TODO confirm against the stats repo.)

    Args:
        category: Sub-directory of ``package-downloads/anaconda.org/bioconda``
            to read; also the stem of the output file. When it equals
            ``"versions"`` only the 7 highest versions (by ``VersionOrder``)
            are kept.
        field: Name of the TSV column that identifies a row (the callers pass
            ``"version"`` or ``"subdir"``).
        max_packages: Stop after this many packages; any falsy value means
            "process all packages".
        days_to_plot: Exclusive upper bound on how many tags back to look, so
            at most ``days_to_plot - 1`` deltas are produced per package.

    Raises:
        RuntimeError: If processing failed for at least one package. Failures
            are logged per package and do not stop the remaining packages.
    """
    os.makedirs("plots", exist_ok=True)

    # The same relative path is used in the working tree and in tagged commits.
    tree_path = f"package-downloads/anaconda.org/bioconda/{category}"
    stats_dir = f"bioconda-stats/{tree_path}"
    read_opts = {
        "dtype": {field: str, "total": int},
        "encoding": "utf-8",
        "sep": "\t",
    }

    repo = Repo("bioconda-stats")
    tags = repo.tags
    # Never look back past the oldest tag: an unbounded look-back would turn
    # the index negative and silently wrap around to the newest tags again.
    lookback_end = min(days_to_plot, len(tags))

    # for each package, get the most recent stats tsv
    package_count = 0
    error_count = 0
    # Sorted so that max_packages always selects the same subset of packages.
    for filename in sorted(fnmatch.filter(os.listdir(stats_dir), "*.tsv")):
        package = filename[:-4]  # strip the ".tsv" suffix
        try:
            logger.debug("Loading data for package: %s", package)
            daily_frames = []
            tagref = None

            df = pd.read_csv(f"{stats_dir}/{filename}", **read_opts)
            versions = set(df[field])
            prev_tagname = tags[len(tags) - 1].name

            # Get tags going back 15 days (or as specified in arg)
            for days_back in range(1, lookback_end):
                if tagref is not None:
                    prev_tagname = tagref.name
                tagref = tags[len(tags) - 1 - days_back]

                # Get a previous tagged version of the package stats tsv
                try:
                    subtree = tagref.commit.tree / tree_path
                    blob = subtree / filename
                except KeyError:
                    # The file does not exist at this tag; stop looking back.
                    break

                logger.debug(
                    "Found data for %s from date %s.", package, tagref.name
                )
                new_df = pd.read_csv(
                    io.BytesIO(blob.data_stream.read()), **read_opts
                )
                # do a delta between totals of different dates
                versions |= set(new_df[field])
                df_sub = df.set_index(field).subtract(
                    new_df.set_index(field), fill_value=0
                )
                df_sub = df_sub.rename(columns={"total": "delta"})
                df = df.merge(df_sub, on=field)
                df["date"] = prev_tagname
                daily_frames.append(df)
                # The older snapshot becomes the "newer" side of the next delta.
                df = new_df

            if daily_frames:
                # Concatenate once instead of growing a frame inside the loop.
                package_df = pd.concat(daily_frames, ignore_index=True)
            else:
                package_df = pd.DataFrame()

            if len(package_df.index) > 0:
                if category == "versions":
                    # Get 7 most recent versions, sorting by VersionOrder
                    version_list = sorted(versions, key=VersionOrder)[-7:]
                else:
                    # Sort so the output order does not depend on set
                    # iteration order, which varies between interpreter runs.
                    version_list = sorted(versions)
                package_df[field] = pd.Categorical(
                    package_df[field], ordered=True, categories=version_list
                )
                # Rows whose value is not among the categories become NaN
                # and are dropped here.
                package_df = package_df[package_df[field].notna()].sort_values(
                    by=[field, "date"]
                )[["date", "total", "delta", field]]

                # Save plot data
                os.makedirs(f"plots/{package}", exist_ok=True)
                with open(
                    f"plots/{package}/{category}.json", "w", encoding="utf-8"
                ) as v:
                    v.write(package_df.to_json(orient="records"))
                logger.debug("Saved data for %s to %s.json.", package, category)

        except Exception:
            # Log package name (with traceback) and continue with the rest
            error_count += 1
            logger.exception("Error creating plot for %s.", package)

        package_count += 1
        if max_packages and package_count == max_packages:
            break

    if error_count > 0:
        raise RuntimeError(
            f"Errors occurred for {error_count} out of {package_count} packages."
        )
    else:
        logger.info(f"Completed {package_count} packages.")
22 changes: 22 additions & 0 deletions src/build_plots/plot_platforms.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import io
import os
import pandas as pd
import sys
from git import Repo
from logging import INFO, basicConfig, getLogger
from .common import buildDailyPlot


basicConfig(level=INFO)
logger = getLogger(__name__)

# Optional first CLI argument: number of days to plot (default 15).
# When given, 1 is added to get the delta for the last date.
days_to_plot = int(sys.argv[1]) + 1 if len(sys.argv) > 1 and sys.argv[1] else 15

# Optional second CLI argument: maximum number of packages to process.
# Left as None when absent or empty.
max_packages = int(sys.argv[2]) if len(sys.argv) > 2 and sys.argv[2] else None

# Build the per-platform data: category "platforms", keyed on the "subdir" column.
buildDailyPlot("platforms", "subdir", max_packages, days_to_plot)
100 changes: 2 additions & 98 deletions src/build_plots/plot_versions.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import sys
from git import Repo
from logging import INFO, basicConfig, getLogger
from ._vendor.conda.models.version import VersionOrder
from .common import buildDailyPlot


basicConfig(level=INFO)
Expand All @@ -19,100 +19,4 @@
if len(sys.argv) > 2 and sys.argv[2]:
max_packages = int(sys.argv[2])

if not os.path.exists("plots"):
os.makedirs("plots")

repo = Repo("bioconda-stats")
tags = repo.tags

# for each package, get the most recent versions.tsv
package_count = 0
error_count = 0
for filename in os.listdir(
"bioconda-stats/package-downloads/anaconda.org/bioconda/versions"
):
if filename.endswith(".tsv"):
package = filename[:-4]
try:
logger.debug(f"Loading data for package: {package}")
package_df = pd.DataFrame()
tagref = None

df = pd.read_csv(
f"bioconda-stats/package-downloads/anaconda.org/bioconda/versions/{filename}",
dtype={ "version": str, "total": int },
encoding="utf-8",
sep="\t",
)
versions = set(df["version"])
prev_tagname = tags[len(tags) - 1].name

# Get tags going back 15 days (or as specified in arg)
for days_back in range(1, days_to_plot):
if tagref is not None:
prev_tagname = tagref.name
tagref = tags[len(tags) - 1 - days_back]
subtree = (
tagref.commit.tree
/ "package-downloads/anaconda.org/bioconda/versions"
)

# Get a previous tagged version of the package stats tsv
try:
blob = subtree / filename
except KeyError:
# does not exist
break

logger.debug(f"Found data for {package} from date {tagref.name}.")
new_df = pd.read_csv(
io.BytesIO(blob.data_stream.read()),
dtype={ "version": str, "total": int },
encoding="utf-8",
sep="\t"
)
# do a delta between totals of different dates
versions = versions | set(new_df["version"])
df_sub = df.set_index("version").subtract(
new_df.set_index("version"), fill_value=0
)
df_sub.rename(columns={"total": "delta"}, inplace=True)
df = df.merge(df_sub, on="version")
df["date"] = prev_tagname
package_df = pd.concat([package_df, df], ignore_index=True)
df = new_df

if len(package_df.index) > 0:
# Get 7 most recent versions, sorting by VersionOrder
version_list = sorted(list(versions), key=VersionOrder)[-7:]
package_df["version"] = pd.Categorical(
package_df["version"], ordered=True, categories=version_list
)
package_df = package_df[package_df["version"].notna()].sort_values(
by=["version", "date"]
)[["date", "total", "delta", "version"]]

# Save plot data
if not os.path.exists(f"plots/{package}"):
os.makedirs(f"plots/{package}")
with open(f"plots/{package}/versions.json", "w") as v:
v.writelines(package_df.to_json(orient="records"))
logger.debug(f"Saved data for {package} to versions.json.")

except Exception as e:
# Log package name and continue with the rest
error_count += 1
e.args = (f"Error creating plot for {package}.",) + e.args
logger.exception(e)

finally:
package_count += 1
if max_packages and package_count == max_packages:
break

if error_count > 0:
raise RuntimeError(
f"Errors occurred for {error_count} out of {package_count} packages."
)
else:
logger.info(f"Completed {package_count} packages.")
buildDailyPlot("versions", "version", max_packages, days_to_plot)
Loading