Skip to content

Commit

Permalink
Merge pull request #8 from bioconda/platform-plots
Browse files Browse the repository at this point in the history
feat: Downloads by Platform plots
  • Loading branch information
aliciaaevans authored Jul 31, 2024
2 parents ee46ee4 + 80321a2 commit d8709eb
Show file tree
Hide file tree
Showing 7 changed files with 179 additions and 100 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/generate-plots.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:

steps:
- name: Checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
submodules: recursive

Expand All @@ -34,6 +34,7 @@ jobs:
export PYTHONPATH="$( pwd )/src"
python -m build_plots.plot_cdf
python -m build_plots.plot_versions
python -m build_plots.plot_platforms
- name: Add changes, commit
run: |
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/test-plots.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ jobs:

steps:
- name: Checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
submodules: recursive

Expand All @@ -29,3 +29,4 @@ jobs:
export PYTHONPATH="$( pwd )/src"
python -m build_plots.plot_cdf
python -m build_plots.plot_versions 5 100
python -m build_plots.plot_platforms 5 100
38 changes: 38 additions & 0 deletions resources/platforms.vl.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{
"$schema":"https://vega.github.io/schema/vega-lite/v5.json",
"title": "Downloads by Platform",
"description": "Downloads of platforms of a bioconda package.",
"data":{
"values":[]
},
"width":"container",
"mark": "bar",
"encoding":{
"x":{
"field":"date",
"type":"ordinal",
"timeUnit":"yearmonthdate",
"title":"date",
"axis":{
"labelAngle":-15
}
},
"y":{
"field":"delta",
"type":"quantitative",
"title":"downloads"
},
"color":{
"field":"subdir",
"title": "platform",
"type":"nominal",
"scale":{
"scheme": "paired",
"domain": ["linux-64", "linux-aarch64", "osx-64", "osx-arm64", "noarch"]
}
},
"tooltip":{
"field":"delta"
}
}
}
1 change: 1 addition & 0 deletions resources/versions.vl.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
{
"$schema":"https://vega.github.io/schema/vega-lite/v5.json",
"title": "Downloads by Version",
"description": "Downloads of various versions of a bioconda package.",
"data":{
"values":[]
Expand Down
112 changes: 112 additions & 0 deletions src/build_plots/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import fnmatch
import io
import os
import pandas as pd
from git import Repo
from logging import INFO, basicConfig, getLogger
from ._vendor.conda.models.version import VersionOrder


# Configure the root logger when this module is imported, with INFO as the
# threshold level.
# NOTE(review): with the threshold at INFO, the logger.debug(...) calls in this
# module are presumably not emitted unless logging is configured differently
# elsewhere - confirm if the per-package debug output is expected to show up.
basicConfig(level=INFO)
# Module-level logger, named after this module.
logger = getLogger(__name__)


def _read_totals(source, field):
    """Load one download-stats TSV, reading ``field`` as str and ``total`` as int."""
    return pd.read_csv(
        source,
        dtype={field: str, "total": int},
        encoding="utf-8",
        sep="\t",
    )


def buildDailyPlot(category, field, max_packages, days_to_plot):
    """Write per-package download deltas as JSON plot data.

    For every ``*.tsv`` file in the ``category`` directory of the
    ``bioconda-stats`` checkout, the current totals are compared with the
    totals stored in progressively older git tags of that repository. Each
    comparison yields one ``delta`` per ``field`` value, labelled with the
    name of the newer tag as ``date``. The result is written to
    ``plots/<package>/<category>.json``.

    Args:
        category: Sub-directory of the stats tree to read (the callers in
            this project pass ``"versions"`` or ``"platforms"``); also used
            as the output file name.
        field: Name of the TSV column that identifies a row (for example
            ``"version"`` or ``"subdir"``).
        max_packages: Stop after this many packages; a falsy value means no
            limit.
        days_to_plot: Number of most recent tags to read. Because deltas are
            differences between consecutive tags, this produces at most
            ``days_to_plot - 1`` dates per package.

    Raises:
        RuntimeError: After all packages were attempted, if at least one of
            them failed. Individual failures are logged and do not stop the
            run.
    """
    os.makedirs("plots", exist_ok=True)

    repo = Repo("bioconda-stats")
    tags = repo.tags
    # NOTE(review): assumes repo.tags is ordered oldest -> newest with one tag
    # per day (the tag name is used as the date) - confirm against the repo.
    tree_path = f"package-downloads/anaconda.org/bioconda/{category}"
    category_dir = f"bioconda-stats/{tree_path}"

    # Never step back past the oldest tag. Without this clamp the computed
    # index goes negative and silently wraps around to the newest tags again,
    # producing bogus deltas when the repo has fewer tags than days_to_plot.
    max_days_back = min(days_to_plot, len(tags))

    # For each package, start from the most recent stats file in the checkout.
    package_count = 0
    error_count = 0
    for filename in fnmatch.filter(os.listdir(category_dir), "*.tsv"):
        package = filename[:-4]
        try:
            logger.debug(f"Loading data for package: {package}")
            package_df = pd.DataFrame()
            tagref = None

            df = _read_totals(f"{category_dir}/{filename}", field)
            versions = set(df[field])
            prev_tagname = tags[len(tags) - 1].name

            # Walk back through older tags, one per iteration.
            for days_back in range(1, max_days_back):
                if tagref is not None:
                    prev_tagname = tagref.name
                tagref = tags[len(tags) - 1 - days_back]

                # Get a previous tagged version of the package stats tsv
                try:
                    subtree = tagref.commit.tree / tree_path
                    blob = subtree / filename
                except KeyError:
                    # The file (or directory) does not exist in this tag, so
                    # there is no older history for this package.
                    break

                logger.debug(f"Found data for {package} from date {tagref.name}.")
                new_df = _read_totals(io.BytesIO(blob.data_stream.read()), field)

                # Delta between the totals of the newer and the older snapshot;
                # rows missing on one side count as 0.
                versions |= set(new_df[field])
                df_sub = df.set_index(field).subtract(
                    new_df.set_index(field), fill_value=0
                )
                df_sub.rename(columns={"total": "delta"}, inplace=True)
                df = df.merge(df_sub, on=field)
                df["date"] = prev_tagname
                package_df = pd.concat([package_df, df], ignore_index=True)
                # The older snapshot becomes the "newer" side of the next delta.
                df = new_df

            if len(package_df.index) > 0:
                if category == "versions":
                    # Keep the 7 most recent versions, sorting by VersionOrder
                    version_list = sorted(versions, key=VersionOrder)[-7:]
                else:
                    # Sort so the category order (and therefore the row order
                    # of the saved JSON) does not depend on set iteration
                    # order, which varies between runs for strings.
                    version_list = sorted(versions)
                package_df[field] = pd.Categorical(
                    package_df[field], ordered=True, categories=version_list
                )
                # Rows whose value is not in version_list became NaN above
                # and are dropped here.
                package_df = package_df[package_df[field].notna()].sort_values(
                    by=[field, "date"]
                )[["date", "total", "delta", field]]

                # Save plot data
                os.makedirs(f"plots/{package}", exist_ok=True)
                with open(
                    f"plots/{package}/{category}.json", "w", encoding="utf-8"
                ) as v:
                    v.write(package_df.to_json(orient="records"))
                logger.debug(f"Saved data for {package} to {category}.json.")

        except Exception as e:
            # Log package name and continue with the rest
            error_count += 1
            e.args = (f"Error creating plot for {package}.",) + e.args
            logger.exception(e)

        package_count += 1
        if max_packages and package_count == max_packages:
            break

    if error_count > 0:
        raise RuntimeError(
            f"Errors occurred for {error_count} out of {package_count} packages."
        )
    else:
        logger.info(f"Completed {package_count} packages.")
22 changes: 22 additions & 0 deletions src/build_plots/plot_platforms.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import io
import os
import pandas as pd
import sys
from git import Repo
from logging import INFO, basicConfig, getLogger
from .common import buildDailyPlot


basicConfig(level=INFO)
logger = getLogger(__name__)

# Optional positional arguments: <days> <max packages>.
# A missing or empty argument falls back to the default.
_days_arg = sys.argv[1] if len(sys.argv) > 1 else ""
# One extra day is requested so that the last plotted date still gets a delta.
days_to_plot = int(_days_arg) + 1 if _days_arg else 15

_limit_arg = sys.argv[2] if len(sys.argv) > 2 else ""
max_packages = int(_limit_arg) if _limit_arg else None

buildDailyPlot("platforms", "subdir", max_packages, days_to_plot)
100 changes: 2 additions & 98 deletions src/build_plots/plot_versions.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import sys
from git import Repo
from logging import INFO, basicConfig, getLogger
from ._vendor.conda.models.version import VersionOrder
from .common import buildDailyPlot


basicConfig(level=INFO)
Expand All @@ -19,100 +19,4 @@
if len(sys.argv) > 2 and sys.argv[2]:
max_packages = int(sys.argv[2])

if not os.path.exists("plots"):
os.makedirs("plots")

repo = Repo("bioconda-stats")
tags = repo.tags

# for each package, get the most recent versions.tsv
package_count = 0
error_count = 0
for filename in os.listdir(
"bioconda-stats/package-downloads/anaconda.org/bioconda/versions"
):
if filename.endswith(".tsv"):
package = filename[:-4]
try:
logger.debug(f"Loading data for package: {package}")
package_df = pd.DataFrame()
tagref = None

df = pd.read_csv(
f"bioconda-stats/package-downloads/anaconda.org/bioconda/versions/{filename}",
dtype={ "version": str, "total": int },
encoding="utf-8",
sep="\t",
)
versions = set(df["version"])
prev_tagname = tags[len(tags) - 1].name

# Get tags going back 15 days (or as specified in arg)
for days_back in range(1, days_to_plot):
if tagref is not None:
prev_tagname = tagref.name
tagref = tags[len(tags) - 1 - days_back]
subtree = (
tagref.commit.tree
/ "package-downloads/anaconda.org/bioconda/versions"
)

# Get a previous tagged version of the package stats tsv
try:
blob = subtree / filename
except KeyError:
# does not exist
break

logger.debug(f"Found data for {package} from date {tagref.name}.")
new_df = pd.read_csv(
io.BytesIO(blob.data_stream.read()),
dtype={ "version": str, "total": int },
encoding="utf-8",
sep="\t"
)
# do a delta between totals of different dates
versions = versions | set(new_df["version"])
df_sub = df.set_index("version").subtract(
new_df.set_index("version"), fill_value=0
)
df_sub.rename(columns={"total": "delta"}, inplace=True)
df = df.merge(df_sub, on="version")
df["date"] = prev_tagname
package_df = pd.concat([package_df, df], ignore_index=True)
df = new_df

if len(package_df.index) > 0:
# Get 7 most recent versions, sorting by VersionOrder
version_list = sorted(list(versions), key=VersionOrder)[-7:]
package_df["version"] = pd.Categorical(
package_df["version"], ordered=True, categories=version_list
)
package_df = package_df[package_df["version"].notna()].sort_values(
by=["version", "date"]
)[["date", "total", "delta", "version"]]

# Save plot data
if not os.path.exists(f"plots/{package}"):
os.makedirs(f"plots/{package}")
with open(f"plots/{package}/versions.json", "w") as v:
v.writelines(package_df.to_json(orient="records"))
logger.debug(f"Saved data for {package} to versions.json.")

except Exception as e:
# Log package name and continue with the rest
error_count += 1
e.args = (f"Error creating plot for {package}.",) + e.args
logger.exception(e)

finally:
package_count += 1
if max_packages and package_count == max_packages:
break

if error_count > 0:
raise RuntimeError(
f"Errors occurred for {error_count} out of {package_count} packages."
)
else:
logger.info(f"Completed {package_count} packages.")
buildDailyPlot("versions", "version", max_packages, days_to_plot)

0 comments on commit d8709eb

Please sign in to comment.