Skip to content

Commit

Permalink
dev: update
Browse files Browse the repository at this point in the history
  • Loading branch information
mysiar committed Oct 14, 2024
1 parent 9e0b0c5 commit 6ba14a5
Show file tree
Hide file tree
Showing 10 changed files with 19 additions and 77 deletions.
1 change: 0 additions & 1 deletion .github/workflows/tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ jobs:
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
# You can test your matrix by printing the current Python version
- name: Display Python version
run: python -c "import sys; print(sys.version)"
- name: Install modules
Expand Down
29 changes: 4 additions & 25 deletions data_flow/data_flow.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
)
from data_flow.lib.fireducks import from_fireducks_2_file, to_fireducks_from_file
from data_flow.lib.pandas import from_pandas_2_file
from data_flow.lib.polars import from_polars_2_file, to_polars_from_file
from data_flow.lib.tools import generate_temporary_filename, delete_file


Expand Down Expand Up @@ -77,14 +76,16 @@ def from_polars(self, df: pl.DataFrame):
if self.__in_memory:
self.__data = fd.from_pandas(df.to_pandas())
else:
from_polars_2_file(df=df, tmp_filename=self.__filename, file_type=self.__file_type)
from_pandas_2_file(df=df.to_pandas(), tmp_filename=self.__filename, file_type=self.__file_type)
return self

def to_polars(self) -> pl.DataFrame:
if self.__in_memory:
return pl.from_pandas(self.__data.to_pandas())
else:
return to_polars_from_file(tmp_filename=self.__filename, file_type=self.__file_type)
return pl.from_pandas(
to_fireducks_from_file(tmp_filename=self.__filename, file_type=self.__file_type).to_pandas()
)

def from_csv(self, filename: str):
if self.__in_memory:
Expand Down Expand Up @@ -156,28 +157,6 @@ def to_hdf(self, filename: str, key: str = "key"):
to_hdf_from_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type, key=key)
return self

def head(self):
    """Print the ``head()`` of the stored data and return ``self`` for chaining.

    In in-memory mode the held frame is used directly; otherwise the frame
    is first loaded from the temporary file via ``to_fireducks_from_file``.
    """
    if self.__in_memory:
        preview = self.__data.head()
    else:
        loaded = to_fireducks_from_file(tmp_filename=self.__filename, file_type=self.__file_type)
        preview = loaded.head()
    print(preview)
    return self

def stats(self):
    """Print a short summary of the stored data and return ``self`` for chaining.

    The summary lists the column names, the column count, the row count and
    the dtypes of the frame. The frame is taken from memory when in-memory
    mode is on, otherwise it is loaded from the temporary file.
    """
    data = (
        self.__data
        if self.__in_memory
        else to_fireducks_from_file(tmp_filename=self.__filename, file_type=self.__file_type)
    )

    print("***** Data stats *****")
    print(f"Columns names : {data.columns.to_list()}")
    print(f"Columns count : {len(data.columns)}")
    print(f"Rows count : {len(data)}")
    print("Data types :")
    print(data.dtypes)
    print("**********************")
    return self

def del_columns(self, columns: list):
if self.__in_memory:
self.__data.drop(columns=columns, inplace=True)
Expand Down
10 changes: 0 additions & 10 deletions data_flow/lib/pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,3 @@ def from_pandas_2_file(df: pd.DataFrame, tmp_filename: str, file_type: FileType)
fd.from_pandas(df).to_feather(tmp_filename)
case _:
raise ValueError(f"File type not implemented: {file_type} !")


def to_pandas_from_file(tmp_filename: str, file_type: FileType) -> fd.DataFrame:
match file_type:
case FileType.parquet:
return pd.read_parquet(tmp_filename)
case FileType.feather:
return pd.read_feather(tmp_filename)
case _:
raise ValueError(f"File type not implemented: {file_type} !")
12 changes: 0 additions & 12 deletions data_flow/lib/polars.py

This file was deleted.

2 changes: 2 additions & 0 deletions tests/SequenceTestCase.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

class SequenceTestCase(BaseTestCase):
def _sequence(self, data: DataFlow.DataFrame) -> None:
self.assertPandasEqual(data.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())

polars = data.to_polars()

self.assertEqual(10, len(data.columns()))
Expand Down
27 changes: 10 additions & 17 deletions tests/test_data_flow_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,36 +2,29 @@

from data_flow import DataFlow
from data_flow.lib import FileType
from data_flow.lib.tools import delete_file
from tests.SequenceTestCase import SequenceTestCase


class DataFlowCSVTestCase(SequenceTestCase):
def setUp(self):
super().setUp()
delete_file(self.TEST_CSV_FILE)
DataFlow().DataFrame().from_csv(self.CSV_FILE).to_csv(self.TEST_CSV_FILE)

def test_memory(self):
df = (
DataFlow().DataFrame().from_csv("./tests/data/annual-enterprise-survey-2023-financial-year-provisional.csv")
)
df.to_csv(self.TEST_CSV_FILE)
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
df = DataFlow().DataFrame().from_csv(self.TEST_CSV_FILE)

self._sequence(data=df)

def test_parquet(self):
df = (
DataFlow()
.DataFrame(in_memory=False)
.from_csv("./tests/data/annual-enterprise-survey-2023-financial-year-provisional.csv")
)
df = DataFlow().DataFrame(in_memory=False).from_csv(self.TEST_CSV_FILE)

self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
self._sequence(data=df)

def test_feather(self):
df = (
DataFlow()
.DataFrame(in_memory=False, file_type=FileType.feather)
.from_csv("./tests/data/annual-enterprise-survey-2023-financial-year-provisional.csv")
)
df = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_csv(self.TEST_CSV_FILE)

self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
self._sequence(data=df)


Expand Down
3 changes: 0 additions & 3 deletions tests/test_data_flow_feather.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,16 @@ def setUp(self):
def test_memory(self):
df = DataFlow().DataFrame().from_feather(self.TEST_FEATHER_FILE)

self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
self._sequence(data=df)

def test_parquet(self):
df = DataFlow().DataFrame(in_memory=False).from_feather(self.TEST_FEATHER_FILE)

self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
self._sequence(data=df)

def test_feather(self):
df = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_feather(self.TEST_FEATHER_FILE)

self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
self._sequence(data=df)


Expand Down
3 changes: 0 additions & 3 deletions tests/test_data_flow_hdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,16 @@ def setUp(self):
def test_memory(self):
df = DataFlow().DataFrame().from_hdf(self.TEST_HDF_FILE)

self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
self._sequence(data=df)

def test_parquet(self):
df = DataFlow().DataFrame(in_memory=False).from_hdf(self.TEST_HDF_FILE)

self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
self._sequence(data=df)

def test_feather(self):
df = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_hdf(self.TEST_HDF_FILE)

self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
self._sequence(data=df)


Expand Down
6 changes: 3 additions & 3 deletions tests/test_data_flow_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,17 @@ def setUp(self):

def test_memory(self):
df = DataFlow().DataFrame().from_json(self.TEST_JSON_FILE)
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())

self._sequence(data=df)

def test_parquet(self):
df = DataFlow().DataFrame(in_memory=False).from_json(self.TEST_JSON_FILE)
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())

self._sequence(data=df)

def test_feather(self):
df = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_json(self.TEST_JSON_FILE)
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())

self._sequence(data=df)


Expand Down
3 changes: 0 additions & 3 deletions tests/test_data_flow_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,16 @@ def setUp(self):
def test_memory(self):
df = DataFlow().DataFrame().from_parquet(self.TEST_PARQUET_FILE)

self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
self._sequence(data=df)

def test_parquet(self):
df = DataFlow().DataFrame(in_memory=False).from_parquet(self.TEST_PARQUET_FILE)

self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
self._sequence(data=df)

def test_feather(self):
df = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_parquet(self.TEST_PARQUET_FILE)

self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
self._sequence(data=df)


Expand Down

0 comments on commit 6ba14a5

Please sign in to comment.