From 6ba14a58cd81ab43e2bf2aff7fe7618e96541526 Mon Sep 17 00:00:00 2001 From: Piotr Synowiec Date: Mon, 14 Oct 2024 04:21:34 +0200 Subject: [PATCH] dev: update --- .github/workflows/tests.yaml | 1 - data_flow/data_flow.py | 29 ++++------------------------- data_flow/lib/pandas.py | 10 ---------- data_flow/lib/polars.py | 12 ------------ tests/SequenceTestCase.py | 2 ++ tests/test_data_flow_csv.py | 27 ++++++++++----------------- tests/test_data_flow_feather.py | 3 --- tests/test_data_flow_hdf.py | 3 --- tests/test_data_flow_json.py | 6 +++--- tests/test_data_flow_parquet.py | 3 --- 10 files changed, 19 insertions(+), 77 deletions(-) delete mode 100644 data_flow/lib/polars.py diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 113ca55..0b00a5a 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -16,7 +16,6 @@ jobs: uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - # You can test your matrix by printing the current Python version - name: Display Python version run: python -c "import sys; print(sys.version)" - name: Install modules diff --git a/data_flow/data_flow.py b/data_flow/data_flow.py index f3061bc..243fb31 100644 --- a/data_flow/data_flow.py +++ b/data_flow/data_flow.py @@ -24,7 +24,6 @@ ) from data_flow.lib.fireducks import from_fireducks_2_file, to_fireducks_from_file from data_flow.lib.pandas import from_pandas_2_file -from data_flow.lib.polars import from_polars_2_file, to_polars_from_file from data_flow.lib.tools import generate_temporary_filename, delete_file @@ -77,14 +76,16 @@ def from_polars(self, df: pl.DataFrame): if self.__in_memory: self.__data = fd.from_pandas(df.to_pandas()) else: - from_polars_2_file(df=df, tmp_filename=self.__filename, file_type=self.__file_type) + from_pandas_2_file(df=df.to_pandas(), tmp_filename=self.__filename, file_type=self.__file_type) return self def to_polars(self) -> pl.DataFrame: if self.__in_memory: return pl.from_pandas(self.__data.to_pandas()) else: - return to_polars_from_file(tmp_filename=self.__filename, file_type=self.__file_type) + return pl.from_pandas( + to_fireducks_from_file(tmp_filename=self.__filename, file_type=self.__file_type).to_pandas() + ) def from_csv(self, filename: str): if self.__in_memory: @@ -156,28 +157,6 @@ def to_hdf(self, filename: str, key: str = "key"): to_hdf_from_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type, key=key) return self - def head(self): - if self.__in_memory: - print(self.__data.head()) - else: - print(to_fireducks_from_file(tmp_filename=self.__filename, file_type=self.__file_type).head()) - return self - - def stats(self): - if self.__in_memory: - data = self.__data - else: - data = to_fireducks_from_file(tmp_filename=self.__filename, file_type=self.__file_type) - - print("***** Data stats *****") - print(f"Columns names : {data.columns.to_list()}") - print(f"Columns count : {len(data.columns)}") - print(f"Rows count : {len(data)}") - print("Data types :") - print(data.dtypes) - print("**********************") - return self - def del_columns(self, columns: list): if self.__in_memory: self.__data.drop(columns=columns, inplace=True) diff --git a/data_flow/lib/pandas.py b/data_flow/lib/pandas.py index ddec9d5..c000a28 100644 --- a/data_flow/lib/pandas.py +++ b/data_flow/lib/pandas.py @@ -12,13 +12,3 @@ def from_pandas_2_file(df: pd.DataFrame, tmp_filename: str, file_type: FileType) fd.from_pandas(df).to_feather(tmp_filename) case _: raise ValueError(f"File type not implemented: {file_type} !") - - -def to_pandas_from_file(tmp_filename: str, file_type: FileType) -> fd.DataFrame: - match file_type: - case FileType.parquet: - return pd.read_parquet(tmp_filename) - case FileType.feather: - return pd.read_feather(tmp_filename) - case _: - raise ValueError(f"File type not implemented: {file_type} !") diff --git a/data_flow/lib/polars.py b/data_flow/lib/polars.py deleted file mode 100644 index 384b9e7..0000000 --- a/data_flow/lib/polars.py +++ /dev/null @@ -1,12 +0,0 @@ -import polars as pl - -from data_flow.lib import FileType -from data_flow.lib.pandas import to_pandas_from_file, from_pandas_2_file - - -def from_polars_2_file(df: pl.DataFrame, tmp_filename: str, file_type: FileType) -> None: - from_pandas_2_file(df=df.to_pandas(), tmp_filename=tmp_filename, file_type=file_type) - - -def to_polars_from_file(tmp_filename: str, file_type: FileType) -> pl.DataFrame: - return pl.from_pandas(to_pandas_from_file(tmp_filename=tmp_filename, file_type=file_type)) diff --git a/tests/SequenceTestCase.py b/tests/SequenceTestCase.py index 0fb182a..75ea929 100644 --- a/tests/SequenceTestCase.py +++ b/tests/SequenceTestCase.py @@ -4,6 +4,8 @@ class SequenceTestCase(BaseTestCase): def _sequence(self, data: DataFlow.DataFrame) -> None: + self.assertPandasEqual(data.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas()) + polars = data.to_polars() self.assertEqual(10, len(data.columns())) diff --git a/tests/test_data_flow_csv.py b/tests/test_data_flow_csv.py index 5e0cf73..9eb8631 100644 --- a/tests/test_data_flow_csv.py +++ b/tests/test_data_flow_csv.py @@ -2,36 +2,29 @@ from data_flow import DataFlow from data_flow.lib import FileType +from data_flow.lib.tools import delete_file from tests.SequenceTestCase import SequenceTestCase class DataFlowCSVTestCase(SequenceTestCase): + def setUp(self): + super().setUp() + delete_file(self.TEST_CSV_FILE) + DataFlow().DataFrame().from_csv(self.CSV_FILE).to_csv(self.TEST_CSV_FILE) + def test_memory(self): - df = ( - DataFlow().DataFrame().from_csv("./tests/data/annual-enterprise-survey-2023-financial-year-provisional.csv") - ) - df.to_csv(self.TEST_CSV_FILE) - self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas()) + df = DataFlow().DataFrame().from_csv(self.TEST_CSV_FILE) + self._sequence(data=df) def test_parquet(self): - df = ( - DataFlow() - .DataFrame(in_memory=False) - .from_csv("./tests/data/annual-enterprise-survey-2023-financial-year-provisional.csv") - ) + df = DataFlow().DataFrame(in_memory=False).from_csv(self.TEST_CSV_FILE) - self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas()) self._sequence(data=df) def test_feather(self): - df = ( - DataFlow() - .DataFrame(in_memory=False, file_type=FileType.feather) - .from_csv("./tests/data/annual-enterprise-survey-2023-financial-year-provisional.csv") - ) + df = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_csv(self.TEST_CSV_FILE) - self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas()) self._sequence(data=df) diff --git a/tests/test_data_flow_feather.py b/tests/test_data_flow_feather.py index dd7d93a..74951e6 100644 --- a/tests/test_data_flow_feather.py +++ b/tests/test_data_flow_feather.py @@ -15,19 +15,16 @@ def setUp(self): def test_memory(self): df = DataFlow().DataFrame().from_feather(self.TEST_FEATHER_FILE) - self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas()) self._sequence(data=df) def test_parquet(self): df = DataFlow().DataFrame(in_memory=False).from_feather(self.TEST_FEATHER_FILE) - self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas()) self._sequence(data=df) def test_feather(self): df = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_feather(self.TEST_FEATHER_FILE) - self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas()) self._sequence(data=df) diff --git a/tests/test_data_flow_hdf.py b/tests/test_data_flow_hdf.py index 968fe1b..dc1680e 100644 --- a/tests/test_data_flow_hdf.py +++ b/tests/test_data_flow_hdf.py @@ -15,19 +15,16 @@ def setUp(self): def test_memory(self): df = DataFlow().DataFrame().from_hdf(self.TEST_HDF_FILE) - self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas()) self._sequence(data=df) def test_parquet(self): df = DataFlow().DataFrame(in_memory=False).from_hdf(self.TEST_HDF_FILE) - self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas()) self._sequence(data=df) def test_feather(self): df = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_hdf(self.TEST_HDF_FILE) - self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas()) self._sequence(data=df) diff --git a/tests/test_data_flow_json.py b/tests/test_data_flow_json.py index 22b1aa4..fa04d2f 100644 --- a/tests/test_data_flow_json.py +++ b/tests/test_data_flow_json.py @@ -14,17 +14,17 @@ def setUp(self): def test_memory(self): df = DataFlow().DataFrame().from_json(self.TEST_JSON_FILE) - self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas()) + self._sequence(data=df) def test_parquet(self): df = DataFlow().DataFrame(in_memory=False).from_json(self.TEST_JSON_FILE) - self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas()) + self._sequence(data=df) def test_feather(self): df = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_json(self.TEST_JSON_FILE) - self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas()) + self._sequence(data=df) diff --git a/tests/test_data_flow_parquet.py b/tests/test_data_flow_parquet.py index aed9d8a..abc3008 100644 --- a/tests/test_data_flow_parquet.py +++ b/tests/test_data_flow_parquet.py @@ -15,19 +15,16 @@ def setUp(self): def test_memory(self): df = DataFlow().DataFrame().from_parquet(self.TEST_PARQUET_FILE) - self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas()) self._sequence(data=df) def test_parquet(self): df = DataFlow().DataFrame(in_memory=False).from_parquet(self.TEST_PARQUET_FILE) - self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas()) self._sequence(data=df) def test_feather(self): df = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_parquet(self.TEST_PARQUET_FILE) - self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas()) self._sequence(data=df)