Merge pull request #558 from graphistry/fix/hetero_feat

homogenize object cols to str
graphistry · Apr 6, 2024 · d140bb3
2 parents 2506b79 + c1a20aa
commit d140bb3
Show file tree

Hide file tree

Showing 3 changed files with 30 additions and 1 deletion.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,12 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 
 ## [Development]
 
+## [0.33.6 - 2024-04-05]
+
+### Added
+
+* `featurize()`: on a `TypeError` during encoding, casts `object`-dtype columns to `str` (via `.astype(str)`) and retries
+
 ## [0.33.5 - 2024-03-11]
 
 ### Fixed

diff --git a/graphistry/feature_utils.py b/graphistry/feature_utils.py
@@ -904,7 +904,14 @@ def process_dirty_dataframes(
 
         logger.info(":: Encoding DataFrame might take a few minutes ------")
 
-        X_enc = data_encoder.fit_transform(ndf, y)
+        try:
+            X_enc = data_encoder.fit_transform(ndf, y)
+        except TypeError:
+            nndf = ndf.copy()
+            object_columns = nndf.select_dtypes(include=['object']).columns
+            nndf[object_columns] = nndf[object_columns].astype(str)
+            X_enc = data_encoder.fit_transform(nndf, y)
+            logger.info("obj columns: %s are being converted to str", object_columns)
         X_enc = make_array(X_enc)
 
         import warnings

diff --git a/graphistry/tests/test_umap_utils.py b/graphistry/tests/test_umap_utils.py
@@ -384,6 +384,22 @@ def test_umap_simplest(self):
         })
         graphistry.nodes(df).umap()
         assert True
+
+    @pytest.mark.skipif(not has_umap, reason="requires umap feature dependencies")
+    def test_umap_edgecase(self):
+        df = pd.DataFrame({
+            'x': ['aa a' * 10, 'bb b' * 2, 'ccc ' * 20, 'dd abc', 'ee x1z'] * 10,
+            'y': [1.0, 2.0, 3.0, 4.0, 5.0] * 10,
+            'yy': [1.1, 20, 31, 12, 5.0] * 10,
+        })
+        df['z'] = df['x'].apply(lambda x: x[0])
+        df.loc[[1,20,35,42,30], 'z'] = 1
+        df.loc[[10,5,16,28,35], 'z'] = 1.0
+        df.loc[[12,7], 'z'] = 'NaN'
+        df.loc[[13,8], 'z'] = np.NaN
+
+        graphistry.nodes(df).umap()
+        assert True
 
     @pytest.mark.skipif(not has_umap, reason="requires umap feature dependencies")
     def test_node_umap(self):