From 2d3b70ba55f935c81d5f962bf405c96fb4b72134 Mon Sep 17 00:00:00 2001 From: simonvh Date: Wed, 2 Dec 2020 15:55:38 +0100 Subject: [PATCH 1/8] style --- gimmemotifs/background.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/gimmemotifs/background.py b/gimmemotifs/background.py index ce9b15ff..132b931f 100644 --- a/gimmemotifs/background.py +++ b/gimmemotifs/background.py @@ -511,9 +511,15 @@ def matched_gc_bedfile(bedfile, matchfile, genome, number, size=None, min_bin_si int(np.sum((gc > round(b_start, 2)) & (gc <= round(b_end, 2))) * fraction) ) + # To make the requested number, divide remaining over + # all bins that have counts rest = number - sum(bin_count) - for i in range(rest): - bin_count[i] += 1 + i = 0 + for _ in range(rest): + while bin_count[i % len(bins)] == 0: + i += 1 + bin_count[i % len(bins)] += 1 + i += 1 nseqs = max(bin_count) * len(bins) @@ -533,6 +539,8 @@ def matched_gc_bedfile(bedfile, matchfile, genome, number, size=None, min_bin_si pass with open(bedfile, "a") as f: for (b_start, b_end), n in zip(bins, bin_count): + if n == 0: + continue # print(b_start, b_end, n) b = "{:.2f}-{:.2f}".format(b_start, b_end) df.loc[df["bin"] == b, ["chrom", "start", "end"]].sample(n).to_csv( From 3c99155ce5ae9a1047586c34126a076207150499 Mon Sep 17 00:00:00 2001 From: simonvh Date: Thu, 14 Jan 2021 09:22:02 +0100 Subject: [PATCH 2/8] black --- gimmemotifs/report.py | 44 +++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py index 9515cec9..966eec44 100644 --- a/gimmemotifs/report.py +++ b/gimmemotifs/report.py @@ -16,7 +16,12 @@ import numpy as np import pandas as pd from statsmodels.stats.multitest import multipletests -from pandas.core.indexing import _non_reducing_slice + +try: + from pandas.core.indexing import non_reducing_slice +except ImportError: + from pandas.core.indexing import _non_reducing_slice as non_reducing_slice 
+ from pandas.io.formats.style import Styler import seaborn as sns @@ -121,7 +126,7 @@ def set_font(self, font_name): def _current_index(self, subset): subset = pd.IndexSlice[:, :] if subset is None else subset - subset = _non_reducing_slice(subset) + subset = non_reducing_slice(subset) selected = self.data.loc[subset] idx_slice = pd.IndexSlice[ self.data.index.get_indexer(selected.index), @@ -154,7 +159,7 @@ def _compute_data(self): def _tooltip(self, tip, subset=None, part=None): subset = pd.IndexSlice[:, :] if subset is None else subset - subset = _non_reducing_slice(subset) + subset = non_reducing_slice(subset) if part is None: part = "data" @@ -202,7 +207,7 @@ def _wrap_iterable(self, it): def _wrap(self, subset=None, axis=0): subset = pd.IndexSlice[:, :] if subset is None else subset - subset = _non_reducing_slice(subset) + subset = non_reducing_slice(subset) if axis in [0, "columns"]: idx = self._current_index(subset)[1] @@ -228,7 +233,7 @@ def _wrap(self, subset=None, axis=0): def _convert_to_image(self, subset=None, height=30): subset = pd.IndexSlice[:, :] if subset is None else subset - subset = _non_reducing_slice(subset) + subset = non_reducing_slice(subset) self.display_data.loc[subset] = ( f'
0) and x <= 10 ** -precision: @@ -349,7 +354,7 @@ def _circle( morph=False, ): subset = pd.IndexSlice[:, :] if subset is None else subset - subslice = _non_reducing_slice(subset) + subslice = non_reducing_slice(subset) if color: palette = sns.color_palette([color]) @@ -502,7 +507,7 @@ def _emoji_scale(self, series, emojis=None, bins=None): def emoji_scale(self, subset=None, emojis=None, bins=None, axis=0): subset = pd.IndexSlice[:, :] if subset is None else subset - subset = _non_reducing_slice(subset) + subset = non_reducing_slice(subset) idx = self._current_index(subset=subset) @@ -515,7 +520,7 @@ def emoji_scale(self, subset=None, emojis=None, bins=None, axis=0): def emoji_score(self, subset=None, emoji_str=None, bins=None, axis=0): subset = pd.IndexSlice[:, :] if subset is None else subset - subset = _non_reducing_slice(subset) + subset = non_reducing_slice(subset) idx = self._current_index(subset=subset) result = self.display_data.iloc[idx].apply( @@ -527,7 +532,7 @@ def emoji_score(self, subset=None, emoji_str=None, bins=None, axis=0): def emojify(self, subset=None): subset = pd.IndexSlice[:, :] if subset is None else subset - subset = _non_reducing_slice(subset) + subset = non_reducing_slice(subset) idx = self._current_index(subset=subset) result = self.display_data.iloc[idx].applymap(emoji.emojize) @@ -547,7 +552,7 @@ def scaled_background_gradient( ): if center_zero: sub = pd.IndexSlice[:, :] if subset is None else subset - sub = _non_reducing_slice(sub) + sub = non_reducing_slice(sub) vmax = ( self.data.loc[sub] @@ -569,12 +574,7 @@ def scaled_background_gradient( vmin = -vmax r = self.background_gradient( - subset=subset, - cmap=cmap, - vmin=vmin, - vmax=vmax, - low=low, - high=high, + subset=subset, cmap=cmap, vmin=vmin, vmax=vmax, low=low, high=high, ) return r @@ -870,10 +870,7 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=3): df_styled = ( ExtraStyler(df) .set_precision(2) - .convert_to_image( - subset=["logo"], - height=30, - 
) + .convert_to_image(subset=["logo"], height=30,) .scaled_background_gradient( subset=value_cols, center_zero=True, low=1 / 1.75, high=1 / 1.75 ) @@ -992,10 +989,7 @@ def roc_html_report( if df.shape[0] > 0: f.write( ExtraStyler(df) - .convert_to_image( - subset=["logo"], - height=30, - ) + .convert_to_image(subset=["logo"], height=30,) .add_circle( subset=["% matches input", "%matches background"], vmax=100, From 60641d31da9a524d13a564a1513048b798d64d7f Mon Sep 17 00:00:00 2001 From: simonvh Date: Thu, 14 Jan 2021 11:18:34 +0100 Subject: [PATCH 3/8] Warning if input regions are too small and -s 0 is used (see #167) --- gimmemotifs/background.py | 13 +++++++++---- gimmemotifs/utils.py | 12 +++++++++++- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/gimmemotifs/background.py b/gimmemotifs/background.py index 132b931f..96223661 100644 --- a/gimmemotifs/background.py +++ b/gimmemotifs/background.py @@ -26,6 +26,7 @@ import pandas as pd import pybedtools from genomepy import Genome +from pyarrow.lib import ArrowInvalid # GimmeMotifs imports from gimmemotifs import mytmpdir @@ -397,12 +398,13 @@ def gc_bin_bedfile( fname = os.path.join( CACHE_DIR, "{}.gcfreq.{}.feather".format(os.path.basename(genome), min_bin_size) ) - if not os.path.exists(fname): + try: + df = pd.read_feather(fname) + except (ArrowInvalid, FileNotFoundError): if not os.path.exists(CACHE_DIR): os.makedirs(CACHE_DIR) create_gc_bin_index(genome, fname, min_bin_size=min_bin_size) - - df = pd.read_feather(fname) + df = pd.read_feather(fname) if length >= min_bin_size: col = "w{}".format( @@ -477,7 +479,10 @@ def matched_gc_bedfile(bedfile, matchfile, genome, number, size=None, min_bin_si try: # pylint: disable=unexpected-keyword-arg fields = pd.read_csv(matchfile, comment="#", nrows=10, sep="\t").shape[1] - bed = pybedtools.BedTool(matchfile) + tmp = ( + pybedtools.BedTool(matchfile).filter(lambda x: len(x) >= 10).saveas().fn + ) + bed = pybedtools.BedTool(tmp) gc = np.array( 
[float(x[fields + 1]) for x in bed.nucleotide_content(fi=genome_fa)] ) diff --git a/gimmemotifs/utils.py b/gimmemotifs/utils.py index 1ea0439f..3df6f8b6 100644 --- a/gimmemotifs/utils.py +++ b/gimmemotifs/utils.py @@ -207,7 +207,17 @@ def write_equalsize_bedfile(bedfile, size, outfile): write the result to . Input file needs to be in BED or WIG format.""" if size is None or size <= 0: - copyfile(bedfile, outfile) + bed = pybedtools.BedTool(bedfile) + filtered_bed = pybedtools.BedTool( + bed.filter(lambda x: len(x) >= 10).saveas().fn + ) + + if len(bed) != len(filtered_bed): + logger.warn( + "Using original size of input file regions, however, some regions are smaller than 10nt!" + ) + logger.warn("Removing all these smaller regions.") + filtered_bed.saveas(outfile) return BUFSIZE = 10000 From 7cade5c4f0950536d70e0333323c97f46f5ea572 Mon Sep 17 00:00:00 2001 From: simonvh Date: Thu, 14 Jan 2021 11:45:38 +0100 Subject: [PATCH 4/8] style --- gimmemotifs/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gimmemotifs/utils.py b/gimmemotifs/utils.py index 3df6f8b6..302a008a 100644 --- a/gimmemotifs/utils.py +++ b/gimmemotifs/utils.py @@ -19,7 +19,6 @@ from functools import singledispatch from subprocess import Popen from tempfile import NamedTemporaryFile -from shutil import copyfile # External imports import pyfaidx From 3855adde09843d2acece0eb8dd08af4876b55086 Mon Sep 17 00:00:00 2001 From: simonvh Date: Thu, 14 Jan 2021 13:28:12 +0100 Subject: [PATCH 5/8] update black version --- .pre-commit-config.yaml | 2 +- gimmemotifs/report.py | 17 ++++++++++++++--- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3d5c980d..59ef0126 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,5 +1,5 @@ repos: - repo: https://github.com/ambv/black - rev: stable + rev: 20.8b1 hooks: - id: black diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py index 966eec44..089a3d29 100644 --- 
a/gimmemotifs/report.py +++ b/gimmemotifs/report.py @@ -574,7 +574,12 @@ def scaled_background_gradient( vmin = -vmax r = self.background_gradient( - subset=subset, cmap=cmap, vmin=vmin, vmax=vmax, low=low, high=high, + subset=subset, + cmap=cmap, + vmin=vmin, + vmax=vmax, + low=low, + high=high, ) return r @@ -870,7 +875,10 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=3): df_styled = ( ExtraStyler(df) .set_precision(2) - .convert_to_image(subset=["logo"], height=30,) + .convert_to_image( + subset=["logo"], + height=30, + ) .scaled_background_gradient( subset=value_cols, center_zero=True, low=1 / 1.75, high=1 / 1.75 ) @@ -989,7 +997,10 @@ def roc_html_report( if df.shape[0] > 0: f.write( ExtraStyler(df) - .convert_to_image(subset=["logo"], height=30,) + .convert_to_image( + subset=["logo"], + height=30, + ) .add_circle( subset=["% matches input", "%matches background"], vmax=100, From 7fb968d5b2e3b69d7d30673d4708f084f77d97c6 Mon Sep 17 00:00:00 2001 From: simonvh Date: Mon, 1 Feb 2021 19:32:42 +0100 Subject: [PATCH 6/8] Fixes #170 --- gimmemotifs/motif.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gimmemotifs/motif.py b/gimmemotifs/motif.py index 3cedc533..c8869c6c 100644 --- a/gimmemotifs/motif.py +++ b/gimmemotifs/motif.py @@ -1329,7 +1329,7 @@ def format_factors( fmt_d = fmt_i = "{}" if hasattr(self, "factor_info"): - fcount = Counter([x.upper() for x in self.factor_info["Factor"]]) + fcount = Counter([x.upper() for x in self.factor_info.get("Factor", "")]) else: fcount = Counter(self.factors[DIRECT_NAME] + self.factors[INDIRECT_NAME]) From 1512af7a54cfd7ac22eca3980b292b1e5eec0531 Mon Sep 17 00:00:00 2001 From: simonvh Date: Mon, 1 Feb 2021 19:48:42 +0100 Subject: [PATCH 7/8] Updated CHANGELOG --- CHANGELOG.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a35c519d..79e8aeb1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,15 @@ The format is based on [Keep a 
Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed +* `_non_reducing_slice` vs `non_reducing_slice` for pandas>=1.2 (#168) +* When using original region size, skip regions smaller than 10bp and warn if any + regions were removed. +* Fixed crash with `KeyError: 'Factor'` when creating the statistics report (#170) +* Fixed bug with creating GC bins for a genome with unusual GC% (like Plasmodium). +* Fixed bug that occurs when upgrading pyarrow with an existing GimmeMotifs + cache. + + ## [0.15.2] - 2020-11-26 ### Changed From d37b3527c00a2a9698ecbd492e2c32ca6371ebda Mon Sep 17 00:00:00 2001 From: simonvh Date: Mon, 1 Feb 2021 19:52:35 +0100 Subject: [PATCH 8/8] fix CHANGELOGmd --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 79e8aeb1..17e42e1c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed +## [0.15.3] - 2021-02-01 + +### Fixed + * `_non_reducing_slice` vs `non_reducing_slice` for pandas>=1.2 (#168) * When using original region size, skip regions smaller than 10bp and warn if any regions were removed.