diff --git a/setup.py b/setup.py index 98287bd4fa..c1f508187e 100644 --- a/setup.py +++ b/setup.py @@ -55,7 +55,7 @@ "statsmodels>=0.12.2", "scikit-learn>=1.0.1", "pandas[parquet]>=1.3.5", - "numpy>=1.22.0", + "numpy>=1.22.0,<2", "nltk>=3.6.7", "scipy>=1.10.0", "requests>=2.32.0", diff --git a/src/evidently/metrics/data_quality/column_category_metric.py b/src/evidently/metrics/data_quality/column_category_metric.py index 55882763d6..39e70ffc6b 100644 --- a/src/evidently/metrics/data_quality/column_category_metric.py +++ b/src/evidently/metrics/data_quality/column_category_metric.py @@ -10,6 +10,7 @@ from evidently.base_metric import Metric from evidently.base_metric import MetricResult from evidently.core import IncludeTags +from evidently.metric_results import HistogramData from evidently.model.widget import BaseWidgetInfo from evidently.options.base import AnyOptions from evidently.renderers.base_renderer import MetricRenderer @@ -28,21 +29,48 @@ class Config: category_ratio: float +class CountOfValues(MetricResult): + current: HistogramData + reference: Optional[HistogramData] = None + + class ColumnCategoryMetricResult(MetricResult): class Config: - pd_exclude_fields = {"counts_of_values"} + pd_exclude_fields = {"counts"} field_tags = { "current": {IncludeTags.Current}, "reference": {IncludeTags.Reference}, "column_name": {IncludeTags.Parameter}, - "counts_of_values": {IncludeTags.Extra}, + "counts": {IncludeTags.Extra}, } + def __init__(self, **data): + """for backward compatibility""" + if "counts_of_values" in data: + counts_of_values: Dict[str, pd.DataFrame] = data.pop("counts_of_values") + counts = CountOfValues( + current=HistogramData(x=counts_of_values["current"]["x"], count=counts_of_values["current"]["count"]) + ) + if "reference" in counts_of_values: + counts.reference = HistogramData( + x=counts_of_values["reference"]["x"], count=counts_of_values["reference"]["count"] + ) + data["counts"] = counts + super().__init__(**data) + column_name: str category: Union[int, float, str] current: CategoryStat reference: Optional[CategoryStat] = None - counts_of_values: Dict[str, pd.DataFrame] + counts: CountOfValues + + @property + def counts_of_values(self) -> Dict[str, pd.DataFrame]: + """for backward compatibility""" + result = {"current": pd.DataFrame({"x": self.counts.current.x, "count": self.counts.current.count})} + if self.counts.reference is not None: + result["reference"] = pd.DataFrame({"x": self.counts.reference.x, "count": self.counts.reference.count}) + return result class ColumnCategoryMetric(Metric[ColumnCategoryMetricResult]): diff --git a/src/evidently/metrics/data_quality/column_value_list_metric.py b/src/evidently/metrics/data_quality/column_value_list_metric.py index 6d1f8a6867..0c99d3877b 100644 --- a/src/evidently/metrics/data_quality/column_value_list_metric.py +++ b/src/evidently/metrics/data_quality/column_value_list_metric.py @@ -10,6 +10,7 @@ from evidently.base_metric import MetricResult from evidently.calculations.data_quality import get_rows_count from evidently.core import IncludeTags +from evidently.metric_results import DistributionIncluded from evidently.model.widget import BaseWidgetInfo from evidently.options.base import AnyOptions from evidently.renderers.base_renderer import MetricRenderer @@ -25,18 +26,40 @@ class ValueListStat(MetricResult): class Config: field_tags = { - "values_in_list": {IncludeTags.Extra}, - "values_not_in_list": {IncludeTags.Extra}, + "values_in_list_dist": {IncludeTags.Extra}, + "values_not_in_list_dist": {IncludeTags.Extra}, "rows_count": {IncludeTags.Extra}, } + def __init__(self, **data: Any): + if "values_in_list" in data: + values_in_list: List[Tuple[Any, int]] = data.pop("values_in_list") + data["values_in_list_dist"] = DistributionIncluded( + x=[v[0] for v in values_in_list], y=[v[1] for v in values_in_list] + ) + if "values_not_in_list" in data: + values_not_in_list: List[Tuple[Any, int]] = data.pop("values_not_in_list") + data["values_not_in_list_dist"] = DistributionIncluded( + x=[v[0] for v in values_not_in_list], y=[v[1] for v in values_not_in_list] + ) + + super().__init__(**data) + number_in_list: int number_not_in_list: int share_in_list: float share_not_in_list: float - values_in_list: List[Tuple[Any, int]] - values_not_in_list: List[Tuple[Any, int]] rows_count: int + values_in_list_dist: DistributionIncluded + values_not_in_list_dist: DistributionIncluded + + @property + def values_in_list(self) -> List[Tuple[Any, int]]: + return [(x, y) for x, y in zip(self.values_in_list_dist.x, self.values_in_list_dist.y)] + + @property + def values_not_in_list(self) -> List[Tuple[Any, int]]: + return [(x, y) for x, y in zip(self.values_not_in_list_dist.x, self.values_not_in_list_dist.y)] class ColumnValueListMetricResult(MetricResult): diff --git a/tests/metrics/data_quality/test_column_value_list_metric.py b/tests/metrics/data_quality/test_column_value_list_metric.py index e037b742ee..93f780999d 100644 --- a/tests/metrics/data_quality/test_column_value_list_metric.py +++ b/tests/metrics/data_quality/test_column_value_list_metric.py @@ -5,6 +5,7 @@ import pandas as pd import pytest +from evidently._pydantic_compat import parse_obj_as from evidently.metrics import ColumnValueListMetric from evidently.metrics.data_quality.column_value_list_metric import ColumnValueListMetricResult from evidently.metrics.data_quality.column_value_list_metric import ValueListStat @@ -202,7 +203,7 @@ def test_data_quality_value_list_metric_value_errors( @pytest.mark.parametrize( - "current_data, reference_data, metric, expected_json", + "current_data, reference_data, metric, old_json", ( ( pd.DataFrame({"col": [1, 2, 3]}), @@ -257,6 +258,72 @@ def test_data_quality_value_list_metric_value_errors( ), ), ) +def test_data_quality_value_list_metric_with_report_compat( + current_data: pd.DataFrame, reference_data: pd.DataFrame, metric: ColumnValueListMetric, old_json: dict +): + report = Report(metrics=[metric]) + report.run(current_data=current_data, reference_data=reference_data, column_mapping=ColumnMapping()) + + result = parse_obj_as(ColumnValueListMetricResult, old_json) + assert metric.get_result() == result + + +@pytest.mark.parametrize( + "current_data, reference_data, metric, expected_json", + ( + ( + pd.DataFrame({"col": [1, 2, 3]}), + None, + ColumnValueListMetric(column_name="col", values=[1]), + { + "column_name": "col", + "current": { + "number_in_list": 1, + "number_not_in_list": 2, + "rows_count": 3, + "share_in_list": 0.3333333333333333, + "share_not_in_list": 0.6666666666666666, + "values_in_list_dist": {"x": [1], "y": [1]}, + "values_not_in_list_dist": {"x": [2, 3], "y": [1, 1]}, + }, + "reference": None, + "values": [1], + }, + ), + ( + pd.DataFrame({"col1": [1, 2, 3], "col2": [10, 20, 3.5]}), + pd.DataFrame( + { + "col1": [10, 20, 3.5], + "col2": [1, 2, 3], + } + ), + ColumnValueListMetric(column_name="col1"), + { + "column_name": "col1", + "current": { + "number_in_list": 0, + "number_not_in_list": 3, + "rows_count": 3, + "share_in_list": 0.0, + "share_not_in_list": 1.0, + "values_in_list_dist": {"x": [10.0, 20.0, 3.5], "y": [0, 0, 0]}, + "values_not_in_list_dist": {"x": [1, 2, 3], "y": [1, 1, 1]}, + }, + "reference": { + "number_in_list": 3, + "number_not_in_list": 0, + "rows_count": 3, + "share_in_list": 1.0, + "share_not_in_list": 0.0, + "values_in_list_dist": {"x": [10.0, 20.0, 3.5], "y": [1, 1, 1]}, + "values_not_in_list_dist": {"x": [], "y": []}, + }, + "values": [10.0, 20.0, 3.5], + }, + ), + ), +) def test_data_quality_value_list_metric_with_report( current_data: pd.DataFrame, reference_data: pd.DataFrame, metric: ColumnValueListMetric, expected_json: dict ) -> None: