From 6ba14a58cd81ab43e2bf2aff7fe7618e96541526 Mon Sep 17 00:00:00 2001
From: Piotr Synowiec <psynowiec@gmail.com>
Date: Mon, 14 Oct 2024 04:21:34 +0200
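Subject: [PATCH] dev: remove polars helpers and head/stats, dedupe tests

- Delete data_flow/lib/polars.py; from_polars()/to_polars() now call the
  pandas and fireducks file helpers directly.
- Remove the now-unused to_pandas_from_file() helper.
- Remove the DataFrame.head() and DataFrame.stats() methods (public API
  removal).
- Move the shared "equals the reference CSV" assertion into
  SequenceTestCase._sequence() and drop the per-test copies.
- CSV tests now write TEST_CSV_FILE in setUp() and read from it.
- Drop a leftover comment from the tests workflow.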
---
 .github/workflows/tests.yaml    |  1 -
 data_flow/data_flow.py          | 29 ++++-------------------------
 data_flow/lib/pandas.py         | 10 ----------
 data_flow/lib/polars.py         | 12 ------------
 tests/SequenceTestCase.py       |  2 ++
 tests/test_data_flow_csv.py     | 27 ++++++++++-----------------
 tests/test_data_flow_feather.py |  3 ---
 tests/test_data_flow_hdf.py     |  3 ---
 tests/test_data_flow_json.py    |  6 +++---
 tests/test_data_flow_parquet.py |  3 ---
 10 files changed, 19 insertions(+), 77 deletions(-)
 delete mode 100644 data_flow/lib/polars.py

diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index 113ca55..0b00a5a 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -16,7 +16,6 @@ jobs:
         uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
-      # You can test your matrix by printing the current Python version
       - name: Display Python version
         run: python -c "import sys; print(sys.version)"
       - name: Install modules
diff --git a/data_flow/data_flow.py b/data_flow/data_flow.py
index f3061bc..243fb31 100644
--- a/data_flow/data_flow.py
+++ b/data_flow/data_flow.py
@@ -24,7 +24,6 @@
 )
 from data_flow.lib.fireducks import from_fireducks_2_file, to_fireducks_from_file
 from data_flow.lib.pandas import from_pandas_2_file
-from data_flow.lib.polars import from_polars_2_file, to_polars_from_file
 from data_flow.lib.tools import generate_temporary_filename, delete_file
 
 
@@ -77,14 +76,16 @@ def from_polars(self, df: pl.DataFrame):
             if self.__in_memory:
                 self.__data = fd.from_pandas(df.to_pandas())
             else:
-                from_polars_2_file(df=df, tmp_filename=self.__filename, file_type=self.__file_type)
+                from_pandas_2_file(df=df.to_pandas(), tmp_filename=self.__filename, file_type=self.__file_type)
             return self
 
         def to_polars(self) -> pl.DataFrame:
             if self.__in_memory:
                 return pl.from_pandas(self.__data.to_pandas())
             else:
-                return to_polars_from_file(tmp_filename=self.__filename, file_type=self.__file_type)
+                return pl.from_pandas(
+                    to_fireducks_from_file(tmp_filename=self.__filename, file_type=self.__file_type).to_pandas()
+                )
 
         def from_csv(self, filename: str):
             if self.__in_memory:
@@ -156,28 +157,6 @@ def to_hdf(self, filename: str, key: str = "key"):
                 to_hdf_from_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type, key=key)
             return self
 
-        def head(self):
-            if self.__in_memory:
-                print(self.__data.head())
-            else:
-                print(to_fireducks_from_file(tmp_filename=self.__filename, file_type=self.__file_type).head())
-            return self
-
-        def stats(self):
-            if self.__in_memory:
-                data = self.__data
-            else:
-                data = to_fireducks_from_file(tmp_filename=self.__filename, file_type=self.__file_type)
-
-            print("***** Data stats *****")
-            print(f"Columns names : {data.columns.to_list()}")
-            print(f"Columns count : {len(data.columns)}")
-            print(f"Rows count    :    {len(data)}")
-            print("Data types    :")
-            print(data.dtypes)
-            print("**********************")
-            return self
-
         def del_columns(self, columns: list):
             if self.__in_memory:
                 self.__data.drop(columns=columns, inplace=True)
diff --git a/data_flow/lib/pandas.py b/data_flow/lib/pandas.py
index ddec9d5..c000a28 100644
--- a/data_flow/lib/pandas.py
+++ b/data_flow/lib/pandas.py
@@ -12,13 +12,3 @@ def from_pandas_2_file(df: pd.DataFrame, tmp_filename: str, file_type: FileType)
             fd.from_pandas(df).to_feather(tmp_filename)
         case _:
             raise ValueError(f"File type not implemented: {file_type} !")
-
-
-def to_pandas_from_file(tmp_filename: str, file_type: FileType) -> fd.DataFrame:
-    match file_type:
-        case FileType.parquet:
-            return pd.read_parquet(tmp_filename)
-        case FileType.feather:
-            return pd.read_feather(tmp_filename)
-        case _:
-            raise ValueError(f"File type not implemented: {file_type} !")
diff --git a/data_flow/lib/polars.py b/data_flow/lib/polars.py
deleted file mode 100644
index 384b9e7..0000000
--- a/data_flow/lib/polars.py
+++ /dev/null
@@ -1,12 +0,0 @@
-import polars as pl
-
-from data_flow.lib import FileType
-from data_flow.lib.pandas import to_pandas_from_file, from_pandas_2_file
-
-
-def from_polars_2_file(df: pl.DataFrame, tmp_filename: str, file_type: FileType) -> None:
-    from_pandas_2_file(df=df.to_pandas(), tmp_filename=tmp_filename, file_type=file_type)
-
-
-def to_polars_from_file(tmp_filename: str, file_type: FileType) -> pl.DataFrame:
-    return pl.from_pandas(to_pandas_from_file(tmp_filename=tmp_filename, file_type=file_type))
diff --git a/tests/SequenceTestCase.py b/tests/SequenceTestCase.py
index 0fb182a..75ea929 100644
--- a/tests/SequenceTestCase.py
+++ b/tests/SequenceTestCase.py
@@ -4,6 +4,8 @@
 
 class SequenceTestCase(BaseTestCase):
     def _sequence(self, data: DataFlow.DataFrame) -> None:
+        self.assertPandasEqual(data.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
+
         polars = data.to_polars()
 
         self.assertEqual(10, len(data.columns()))
diff --git a/tests/test_data_flow_csv.py b/tests/test_data_flow_csv.py
index 5e0cf73..9eb8631 100644
--- a/tests/test_data_flow_csv.py
+++ b/tests/test_data_flow_csv.py
@@ -2,36 +2,29 @@
 
 from data_flow import DataFlow
 from data_flow.lib import FileType
+from data_flow.lib.tools import delete_file
 from tests.SequenceTestCase import SequenceTestCase
 
 
 class DataFlowCSVTestCase(SequenceTestCase):
+    def setUp(self):
+        super().setUp()
+        delete_file(self.TEST_CSV_FILE)
+        DataFlow().DataFrame().from_csv(self.CSV_FILE).to_csv(self.TEST_CSV_FILE)
+
     def test_memory(self):
-        df = (
-            DataFlow().DataFrame().from_csv("./tests/data/annual-enterprise-survey-2023-financial-year-provisional.csv")
-        )
-        df.to_csv(self.TEST_CSV_FILE)
-        self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
+        df = DataFlow().DataFrame().from_csv(self.TEST_CSV_FILE)
+
         self._sequence(data=df)
 
     def test_parquet(self):
-        df = (
-            DataFlow()
-            .DataFrame(in_memory=False)
-            .from_csv("./tests/data/annual-enterprise-survey-2023-financial-year-provisional.csv")
-        )
+        df = DataFlow().DataFrame(in_memory=False).from_csv(self.TEST_CSV_FILE)
 
-        self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
         self._sequence(data=df)
 
     def test_feather(self):
-        df = (
-            DataFlow()
-            .DataFrame(in_memory=False, file_type=FileType.feather)
-            .from_csv("./tests/data/annual-enterprise-survey-2023-financial-year-provisional.csv")
-        )
+        df = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_csv(self.TEST_CSV_FILE)
 
-        self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
         self._sequence(data=df)
 
 
diff --git a/tests/test_data_flow_feather.py b/tests/test_data_flow_feather.py
index dd7d93a..74951e6 100644
--- a/tests/test_data_flow_feather.py
+++ b/tests/test_data_flow_feather.py
@@ -15,19 +15,16 @@ def setUp(self):
     def test_memory(self):
         df = DataFlow().DataFrame().from_feather(self.TEST_FEATHER_FILE)
 
-        self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
         self._sequence(data=df)
 
     def test_parquet(self):
         df = DataFlow().DataFrame(in_memory=False).from_feather(self.TEST_FEATHER_FILE)
 
-        self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
         self._sequence(data=df)
 
     def test_feather(self):
         df = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_feather(self.TEST_FEATHER_FILE)
 
-        self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
         self._sequence(data=df)
 
 
diff --git a/tests/test_data_flow_hdf.py b/tests/test_data_flow_hdf.py
index 968fe1b..dc1680e 100644
--- a/tests/test_data_flow_hdf.py
+++ b/tests/test_data_flow_hdf.py
@@ -15,19 +15,16 @@ def setUp(self):
     def test_memory(self):
         df = DataFlow().DataFrame().from_hdf(self.TEST_HDF_FILE)
 
-        self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
         self._sequence(data=df)
 
     def test_parquet(self):
         df = DataFlow().DataFrame(in_memory=False).from_hdf(self.TEST_HDF_FILE)
 
-        self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
         self._sequence(data=df)
 
     def test_feather(self):
         df = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_hdf(self.TEST_HDF_FILE)
 
-        self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
         self._sequence(data=df)
 
 
diff --git a/tests/test_data_flow_json.py b/tests/test_data_flow_json.py
index 22b1aa4..fa04d2f 100644
--- a/tests/test_data_flow_json.py
+++ b/tests/test_data_flow_json.py
@@ -14,17 +14,17 @@ def setUp(self):
 
     def test_memory(self):
         df = DataFlow().DataFrame().from_json(self.TEST_JSON_FILE)
-        self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
+
         self._sequence(data=df)
 
     def test_parquet(self):
         df = DataFlow().DataFrame(in_memory=False).from_json(self.TEST_JSON_FILE)
-        self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
+
         self._sequence(data=df)
 
     def test_feather(self):
         df = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_json(self.TEST_JSON_FILE)
-        self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
+
         self._sequence(data=df)
 
 
diff --git a/tests/test_data_flow_parquet.py b/tests/test_data_flow_parquet.py
index aed9d8a..abc3008 100644
--- a/tests/test_data_flow_parquet.py
+++ b/tests/test_data_flow_parquet.py
@@ -15,19 +15,16 @@ def setUp(self):
     def test_memory(self):
         df = DataFlow().DataFrame().from_parquet(self.TEST_PARQUET_FILE)
 
-        self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
         self._sequence(data=df)
 
     def test_parquet(self):
         df = DataFlow().DataFrame(in_memory=False).from_parquet(self.TEST_PARQUET_FILE)
 
-        self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
         self._sequence(data=df)
 
     def test_feather(self):
         df = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_parquet(self.TEST_PARQUET_FILE)
 
-        self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
         self._sequence(data=df)