Skip to content

Commit

Permalink
Merge pull request #558 from graphistry/fix/hetero_feat
Browse files Browse the repository at this point in the history
homogenize object cols to str
  • Loading branch information
aucahuasi authored Apr 6, 2024
2 parents 2506b79 + c1a20aa commit d140bb3
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 1 deletion.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm

## [Development]

## [0.33.6 - 2024-04-05]

### Added

* `featurize()`: when encoding raises a `TypeError`, coerces `object` dtype columns to `str` via `.astype(str)` and retries

## [0.33.5 - 2024-03-11]

### Fixed
Expand Down
9 changes: 8 additions & 1 deletion graphistry/feature_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -904,7 +904,14 @@ def process_dirty_dataframes(

logger.info(":: Encoding DataFrame might take a few minutes ------")

X_enc = data_encoder.fit_transform(ndf, y)
try:
X_enc = data_encoder.fit_transform(ndf, y)
except TypeError:
nndf = ndf.copy()
object_columns = nndf.select_dtypes(include=['object']).columns
nndf[object_columns] = nndf[object_columns].astype(str)
X_enc = data_encoder.fit_transform(nndf, y)
logger.info("obj columns: %s are being converted to str", object_columns)
X_enc = make_array(X_enc)

import warnings
Expand Down
16 changes: 16 additions & 0 deletions graphistry/tests/test_umap_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,22 @@ def test_umap_simplest(self):
})
graphistry.nodes(df).umap()
assert True

@pytest.mark.skipif(not has_umap, reason="requires umap feature dependencies")
def test_umap_edgecase(self):
    """Smoke-test ``umap()`` on a frame whose ``z`` column mixes value types.

    The ``z`` column starts as single-character strings and is then
    overwritten in places with an int, a float, the literal string
    ``'NaN'`` and a real missing value, so it ends up as a heterogeneous
    column. The test passes as long as ``umap()`` does not raise.
    """
    df = pd.DataFrame({
        'x': ['aa a' * 10, 'bb b' * 2, 'ccc ' * 20, 'dd abc', 'ee x1z'] * 10,
        'y': [1.0, 2.0, 3.0, 4.0, 5.0] * 10,
        'yy': [1.1, 20, 31, 12, 5.0] * 10,
    })
    # Seed z with the first character of each x string.
    df['z'] = df['x'].apply(lambda x: x[0])
    # Overwrite selected rows with other value types; row 35 is written
    # twice, so it ends up holding the float.
    df.loc[[1, 20, 35, 42, 30], 'z'] = 1
    df.loc[[10, 5, 16, 28, 35], 'z'] = 1.0
    df.loc[[12, 7], 'z'] = 'NaN'  # the string 'NaN', not a missing value
    # np.nan rather than np.NaN: the capitalised alias was removed in
    # NumPy 2.0, while np.nan exists in every NumPy version.
    df.loc[[13, 8], 'z'] = np.nan

    graphistry.nodes(df).umap()
    assert True

@pytest.mark.skipif(not has_umap, reason="requires umap feature dependencies")
def test_node_umap(self):
Expand Down

0 comments on commit d140bb3

Please sign in to comment.