fix(steps): add unit test and fix null cols for impute (#157)

ibis-project · Sep 25, 2024 · b8aebcb
1 parent 1b0728d
commit b8aebcb
Show file tree

Hide file tree

Showing 2 changed files with 75 additions and 0 deletions.
diff --git a/ibis_ml/steps/_impute.py b/ibis_ml/steps/_impute.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import math
 from typing import TYPE_CHECKING, Any
 
 import ibis.expr.types as ir
@@ -9,11 +10,19 @@
 
 if TYPE_CHECKING:
     from collections.abc import Iterable
+import warnings
 
 _DOCS_PAGE_NAME = "imputation"
 
 
 def _fillna(col, val):
+    if val is None or (col.type().is_numeric() and math.isnan(val)):
+        warnings.warn(
+            "Imputation requires at least one non-missing value in "
+            f"column {col.get_name()!r}",
+            UserWarning,
+            stacklevel=2,
+        )
     if col.type().is_floating():
         return (col.isnull() | col.isnan()).ifelse(val, col)  # noqa: PD003
     else:

diff --git a/tests/test_impute.py b/tests/test_impute.py
@@ -0,0 +1,66 @@
+import math
+
+import ibis
+import numpy as np
+import pandas as pd
+import pandas.testing as tm
+import pytest
+
+import ibis_ml as ml
+
+
+@pytest.fixture()
+def train_table():
+    return ibis.memtable(
+        {
+            "floating_col": [0.0, 0.0, 3.0, None, np.nan],
+            "int_col": [0, 0, 3, None, None],
+            "string_col": ["a", "a", "c", None, None],
+            "null_col": [None] * 5,
+        }
+    )
+
+
+@pytest.mark.parametrize(
+    ("mode", "col_name", "expected"),
+    [
+        ("mean", "floating_col", 1.0),
+        ("median", "floating_col", 0.0),
+        ("mode", "floating_col", 0.0),
+        ("mean", "int_col", 1),
+        ("median", "int_col", 0),
+        ("mode", "int_col", 0),
+        ("mode", "string_col", "a"),
+    ],
+)
+def test_impute(train_table, mode, col_name, expected):
+    mode_class = getattr(ml, f"Impute{mode.capitalize()}")
+    step = mode_class(col_name)
+    test_table = ibis.memtable({col_name: [None]})
+    step.fit_table(train_table, ml.core.Metadata())
+    result = step.transform_table(test_table)
+    expected = pd.DataFrame({col_name: [expected]})
+    tm.assert_frame_equal(result.execute(), expected, check_dtype=False)
+
+
+def test_fillna(train_table):
+    step = ml.FillNA("floating_col", 0)
+    step.fit_table(train_table, ml.core.Metadata())
+    assert step.is_fitted()
+    test_table = ibis.memtable({"floating_col": [None]})
+    result = step.transform_table(test_table)
+    expected = pd.DataFrame({"floating_col": [0]})
+    tm.assert_frame_equal(result.execute(), expected, check_dtype=False)
+
+
+@pytest.mark.parametrize("val", [None, math.nan])
+def test_fillna_with_none(train_table, val):
+    step = ml.FillNA("floating_col", val)
+    step.fit_table(train_table, ml.core.Metadata())
+    test_table = ibis.memtable({"floating_col": [1.0, None]})
+    with pytest.warns(
+        UserWarning,
+        match="Imputation requires at least one non-missing value in "
+        "column 'floating_col'",
+    ):
+        step.transform_table(test_table)