Skip to content

Commit

Permalink
fix(steps): add unit test and fix null cols for impute (#157)
Browse files Browse the repository at this point in the history
  • Loading branch information
jitingxu1 authored Sep 25, 2024
1 parent 1b0728d commit b8aebcb
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 0 deletions.
9 changes: 9 additions & 0 deletions ibis_ml/steps/_impute.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import math
from typing import TYPE_CHECKING, Any

import ibis.expr.types as ir
Expand All @@ -9,11 +10,19 @@

if TYPE_CHECKING:
from collections.abc import Iterable
import warnings

_DOCS_PAGE_NAME = "imputation"


def _fillna(col, val):
if val is None or (col.type().is_numeric() and math.isnan(val)):
warnings.warn(
"Imputation requires at least one non-missing value in "
f"column {col.get_name()!r}",
UserWarning,
stacklevel=2,
)
if col.type().is_floating():
return (col.isnull() | col.isnan()).ifelse(val, col) # noqa: PD003
else:
Expand Down
66 changes: 66 additions & 0 deletions tests/test_impute.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import math

import ibis
import numpy as np
import pandas as pd
import pandas.testing as tm
import pytest

import ibis_ml as ml


@pytest.fixture()
def train_table():
    """Build the in-memory training table shared by the imputation tests.

    The table has one floating-point, one integer and one string column,
    each containing missing entries (the floating-point column holds both
    ``None`` and ``NaN``), plus a column made up entirely of ``None``.
    """
    columns = {
        "floating_col": [0.0, 0.0, 3.0, None, np.nan],
        "int_col": [0, 0, 3, None, None],
        "string_col": ["a", "a", "c", None, None],
        "null_col": [None] * 5,
    }
    return ibis.memtable(columns)


@pytest.mark.parametrize(
    ("mode", "col_name", "expected"),
    [
        ("mean", "floating_col", 1.0),
        ("median", "floating_col", 0.0),
        ("mode", "floating_col", 0.0),
        ("mean", "int_col", 1),
        ("median", "int_col", 0),
        ("mode", "int_col", 0),
        ("mode", "string_col", "a"),
    ],
)
def test_impute(train_table, mode, col_name, expected):
    """Check that each Impute* step fills a null with the fitted statistic.

    The step class is looked up on ``ibis_ml`` by name
    (``ImputeMean`` / ``ImputeMedian`` / ``ImputeMode``), fitted on
    ``train_table``, and then applied to a one-row table whose only value
    is ``None``; the transformed value must equal ``expected``.
    """
    step_cls = getattr(ml, "Impute" + mode.capitalize())
    imputer = step_cls(col_name)

    # A single missing value: whatever comes back is the imputed value.
    unseen = ibis.memtable({col_name: [None]})

    imputer.fit_table(train_table, ml.core.Metadata())
    transformed = imputer.transform_table(unseen)

    expected_frame = pd.DataFrame({col_name: [expected]})
    # Dtypes are deliberately not compared; only the values matter here.
    tm.assert_frame_equal(
        transformed.execute(), expected_frame, check_dtype=False
    )


def test_fillna(train_table):
    """Check that FillNA replaces a null with the configured constant.

    The step is fitted on ``train_table``, must report itself as fitted,
    and must turn a single ``None`` in ``floating_col`` into ``0``.
    """
    filler = ml.FillNA("floating_col", 0)
    filler.fit_table(train_table, ml.core.Metadata())
    assert filler.is_fitted()

    unseen = ibis.memtable({"floating_col": [None]})
    transformed = filler.transform_table(unseen)

    expected_frame = pd.DataFrame({"floating_col": [0]})
    # Dtypes are deliberately not compared; only the values matter here.
    tm.assert_frame_equal(
        transformed.execute(), expected_frame, check_dtype=False
    )


@pytest.mark.parametrize("val", [None, math.nan])
def test_fillna_with_none(train_table, val):
    """Check that FillNA warns when the fill value is itself missing.

    With a fill value of ``None`` or ``NaN``, transforming a table must
    emit a ``UserWarning`` that names the affected column.
    """
    filler = ml.FillNA("floating_col", val)
    filler.fit_table(train_table, ml.core.Metadata())
    unseen = ibis.memtable({"floating_col": [1.0, None]})

    warning_pattern = (
        "Imputation requires at least one non-missing value in "
        "column 'floating_col'"
    )
    with pytest.warns(UserWarning, match=warning_pattern):
        filler.transform_table(unseen)

0 comments on commit b8aebcb

Please sign in to comment.