# Patch summary (text before the first "diff --git" header is ignored by git-apply/patch):
#   * rename the CI workflow ci.yaml -> tests.yaml, rename job "build" -> "tests", add a flake8 Lint step
#   * rename DataFrame.get_data_fireducks/get_data_pandas -> to_fireducks/to_pandas, add to_polars
#   * add polars to requirements.txt
#   * add BaseTestCase.assertPandasEqual with its own test; delete the skipped tests/test_data_flow.py
#   * make the per-format tests assert frame equality before running the shared sequence
#
# NOTE(review): the line structure of this patch was rebuilt from the +/- markers and the @@ hunk
#   line counts. Indent widths, Makefile tabs and the position of blank context lines are inferred;
#   verify against the repository before applying.
# NOTE(review): tests/test_data_flow_csv.py::test_memory writes TEST_CSV_FILE but the assertion
#   re-reads CSV_FILE, so the file just written is never checked - confirm this is intended.
# NOTE(review): tests/test_base_test_case.py - the final assertEqual is placed after the
#   `with assertRaises` block here (inside it, the line would never execute); confirm against the repo.
diff --git a/.github/workflows/ci.yaml b/.github/workflows/tests.yaml
similarity index 80%
rename from .github/workflows/ci.yaml
rename to .github/workflows/tests.yaml
index 28191fe..2dc46ae 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/tests.yaml
@@ -1,14 +1,14 @@
-name: Python data-flow
+name: Python data-flow Tests
 
-on: [push]
+on: [ push ]
 
 jobs:
-  build:
+  tests:
 
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.10", "3.11", "3.12"]
+        python-version: [ "3.10", "3.11", "3.12" ]
 
     steps:
       - uses: actions/checkout@v4
@@ -22,4 +22,6 @@ jobs:
       - name: Install modules
         run: pip install -r requirements.txt && pip install -r requirements.dev.txt
       - name: Tests
-        run: PYTHONPATH=. pytest --cov=data_flow --cov-report term
\ No newline at end of file
+        run: PYTHONPATH=. pytest --cov=data_flow --cov-report term
+      - name: Lint
+        run: flake8 data_flow/
\ No newline at end of file
diff --git a/Makefile b/Makefile
index b6f02c1..6a65b8b 100644
--- a/Makefile
+++ b/Makefile
@@ -8,7 +8,7 @@ pip::
 	venv/bin/pip install -r requirements.dev.txt
 
 tests::
-	PYTHONPATH=. venv/bin/pytest -rP tests/ -vvv --cov=data_flow --cov-report html --cov-report term
+	PYTHONPATH=. venv/bin/pytest --cov=data_flow --cov-report html --cov-report term -rP tests/ -vvv
 
 lint::
 	venv/bin/flake8 data_flow/
diff --git a/README.md b/README.md
index d03f271..0d28884 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # DataFlow
 
-![tests](https://github.com/mysiar-org/python-data-flow/actions/workflows/ci.yaml/badge.svg)
+![tests](https://github.com/mysiar-org/python-data-flow/actions/workflows/tests.yaml/badge.svg)
 [![Python 3.10](https://img.shields.io/badge/python-3.10-blue.svg)](https://www.python.org/downloads/release/python-3100/)
 [![Python 3.11](https://img.shields.io/badge/python-3.11-blue.svg)](https://www.python.org/downloads/release/python-3110/)
 [![Python 3.12](https://img.shields.io/badge/python-3.12-blue.svg)](https://www.python.org/downloads/release/python-3120/)
diff --git a/data_flow/data_flow.py b/data_flow/data_flow.py
index 84d304d..a699d55 100644
--- a/data_flow/data_flow.py
+++ b/data_flow/data_flow.py
@@ -3,6 +3,7 @@
 
 import fireducks.pandas as fd
 import pandas as pd
+import polars as pl
 from pyarrow import feather
 
 from data_flow.lib import FileType
@@ -44,18 +45,26 @@ def __del__(self):
             if not self.__in_memory:
                 delete_file(self.__filename)
 
-        def get_data_fireducks(self) -> fd.DataFrame:
+        def to_fireducks(self) -> fd.DataFrame:
             if self.__in_memory:
                 return self.__data
             else:
                 return df_from_tmp_filename(tmp_filename=self.__filename, file_type=self.__file_type)
 
-        def get_data_pandas(self) -> pd.DataFrame:
+        def to_pandas(self) -> pd.DataFrame:
             if self.__in_memory:
                 return self.__data.to_pandas()
             else:
                 return df_from_tmp_filename(tmp_filename=self.__filename, file_type=self.__file_type).to_pandas()
 
+        def to_polars(self) -> pl.DataFrame:
+            if self.__in_memory:
+                return pl.from_pandas(self.__data.to_pandas())
+            else:
+                return pl.from_pandas(
+                    df_from_tmp_filename(tmp_filename=self.__filename, file_type=self.__file_type).to_pandas()
+                )
+
         def from_csv(self, filename: str):
             if self.__in_memory:
                 self.__data = fd.read_csv(filename)
diff --git a/requirements.txt b/requirements.txt
index 5002da0..7d3d4f8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 fireducks
 tables
 pyarrow
-pandas
\ No newline at end of file
+pandas
+polars
\ No newline at end of file
diff --git a/tests/BaseTestCase.py b/tests/BaseTestCase.py
index db5501c..be9cf19 100644
--- a/tests/BaseTestCase.py
+++ b/tests/BaseTestCase.py
@@ -1,10 +1,12 @@
 import unittest
 from zipfile import ZipFile
 
+import pandas as pd
+
 
 class BaseTestCase(unittest.TestCase):
     def setUp(self):
-        zip = ZipFile(self.ZIP_FILE).extractall("./tests/data")
+        ZipFile(self.ZIP_FILE).extractall("./tests/data")
 
     ZIP_FILE = "./tests/data/annual-enterprise-survey-2023-financial-year-provisional.zip"
     CSV_FILE = "./tests/data/annual-enterprise-survey-2023-financial-year-provisional.csv"
@@ -13,3 +15,6 @@ def setUp(self):
     TEST_CSV_FILE = "/tmp/data-flow.csv"
     TEST_JSON_FILE = "/tmp/data-flow.json"
     TEST_HDF_FILE = "/tmp/data-flow.h5"
+
+    def assertPandasEqual(self, df1: pd.DataFrame, df2: pd.DataFrame):
+        self.assertTrue(df1.equals(df2), "Pandas DataFrames are not equal !")
diff --git a/tests/SequenceTestCase.py b/tests/SequenceTestCase.py
index 25f11e9..df44173 100644
--- a/tests/SequenceTestCase.py
+++ b/tests/SequenceTestCase.py
@@ -19,6 +19,7 @@ def _sequence(self, data: DataFlow.DataFrame) -> None:
                 "Variable_category",
             ]
         )
+
         self.assertEqual(3, len(data.columns()))
         self.assertListEqual(["Year", "Units", "Value"], data.columns())
 
diff --git a/tests/test_base_test_case.py b/tests/test_base_test_case.py
new file mode 100644
index 0000000..3bde473
--- /dev/null
+++ b/tests/test_base_test_case.py
@@ -0,0 +1,22 @@
+import unittest
+
+import pandas as pd
+
+from tests.BaseTestCase import BaseTestCase
+
+
+class BaseTestCaseTestCase(BaseTestCase):
+    def test_assert_pandas_equal(self):
+        df1 = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]})
+        df2 = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]})
+        df3 = pd.DataFrame({"Name": ["Tom", "nick", "krish", "jack"], "Age": [20, 21, 19, 18]})
+
+        self.assertPandasEqual(df1, df2)
+
+        with self.assertRaises(AssertionError) as context:
+            self.assertPandasEqual(df1, df3)
+        self.assertEqual(str(context.exception), "False is not true : Pandas DataFrames are not equal !")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_data_flow.py b/tests/test_data_flow.py
deleted file mode 100644
index 231b976..0000000
--- a/tests/test_data_flow.py
+++ /dev/null
@@ -1,83 +0,0 @@
-import os
-import unittest
-from typing import List
-
-from data_flow import DataFlow
-
-DATA_CSV = "/tmp/data_flow.csv"
-DATA_FEATHER = "/tmp/data_flow.feather"
-DATA_JSON = "/tmp/data_flow.json"
-DATA_PARQUET = "/tmp/data_parquet.parquet"
-DATA_HDF = "/tmp/data_flow.h5"
-
-
-def delete_file(filename: str) -> None:
-    if os.path.exists(filename):
-        os.remove(filename)
-
-
-def delete_files(files: List[str]) -> None:
-    for file in files:
-        delete_file(file)
-
-
-@unittest.skip
-class DataFlowTestCase(unittest.TestCase):
-    def setUp(self):
-        delete_files([DATA_CSV, DATA_FEATHER, DATA_JSON, DATA_PARQUET, DATA_HDF])
-
-        (
-            DataFlow()
-            .DataFrame()
-            .from_csv("./tests/data/annual-enterprise-survey-2023-financial-year-provisional.csv")
-            .to_csv(DATA_CSV)
-            .to_feather(DATA_FEATHER)
-            .to_json(DATA_JSON)
-            .to_parquet(DATA_PARQUET)
-            .to_hdf(DATA_HDF)
-        )
-
-    def test_csv(self):
-        data = DataFlow().DataFrame()
-        data.from_csv(DATA_CSV).stats()
-        data.del_columns(
-            [
-                "Industry_aggregation_NZSIOC",
-                "Industry_code_NZSIOC",
-                "Industry_name_NZSIOC",
-                "Industry_code_ANZSIC06",
-                "Variable_code",
-                "Variable_name",
-                "Variable_category",
-            ]
-        ).stats()
-
-        self.assertEqual(3, len(data.columns()))
-        self.assertListEqual(["Year", "Units", "Value"], data.columns())
-
-    @unittest.skip
-    def test_from_feather(self):
-        data = DataFlow().DataFrame()
-        data.from_feather(DATA_FEATHER).stats()
-
-    @unittest.skip
-    def test_from_json(self):
-        data = DataFlow().DataFrame()
-        data.from_json(DATA_JSON).stats()
-
-    @unittest.skip
-    def test_from_parquet(self):
-        data = DataFlow().DataFrame()
-        data.from_parquet(DATA_PARQUET).stats()
-
-    @unittest.skip
-    def test_from_hdf(self):
-        data = DataFlow().DataFrame()
-        data.from_hdf(DATA_HDF).stats()
-
-    def test(self):
-        DataFlow().DataFrame(in_memory=False).print()
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/tests/test_data_flow_csv.py b/tests/test_data_flow_csv.py
index c37e375..f8ce669 100644
--- a/tests/test_data_flow_csv.py
+++ b/tests/test_data_flow_csv.py
@@ -6,29 +6,33 @@
 
 class DataFlowCSVTestCase(SequenceTestCase):
     def test_memory(self):
-        data = (
+        df = (
             DataFlow().DataFrame().from_csv("./tests/data/annual-enterprise-survey-2023-financial-year-provisional.csv")
         )
-        self._sequence(data=data)
+        df.to_csv(self.TEST_CSV_FILE)
 
-        data.to_csv(self.TEST_CSV_FILE)
-        data.get_data_pandas().equals(DataFlow().DataFrame().from_csv(self.TEST_CSV_FILE).get_data_pandas())
+        self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
+        self._sequence(data=df)
 
     def test_parquet(self):
-        data = (
+        df = (
             DataFlow()
             .DataFrame(in_memory=False)
             .from_csv("./tests/data/annual-enterprise-survey-2023-financial-year-provisional.csv")
         )
-        self._sequence(data=data)
+
+        self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
+        self._sequence(data=df)
 
     def test_feather(self):
-        data = (
+        df = (
             DataFlow()
             .DataFrame(in_memory=False, file_type=FileType.feather)
             .from_csv("./tests/data/annual-enterprise-survey-2023-financial-year-provisional.csv")
         )
-        self._sequence(data=data)
+
+        self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
+        self._sequence(data=df)
 
 
 if __name__ == "__main__":
diff --git a/tests/test_data_flow_feather.py b/tests/test_data_flow_feather.py
index 77b48b4..4a47ddf 100644
--- a/tests/test_data_flow_feather.py
+++ b/tests/test_data_flow_feather.py
@@ -12,17 +12,22 @@ def setUp(self):
         DataFlow().DataFrame().from_csv(self.CSV_FILE).to_feather(self.TEST_FEATHER_FILE)
 
     def test_memory(self):
-        data = DataFlow().DataFrame().from_feather(self.TEST_FEATHER_FILE)
-        self._sequence(data=data)
-        data.get_data_pandas().equals(DataFlow().DataFrame().from_csv(self.CSV_FILE).get_data_pandas())
+        df = DataFlow().DataFrame().from_feather(self.TEST_FEATHER_FILE)
+
+        self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
+        self._sequence(data=df)
 
     def test_parquet(self):
-        data = DataFlow().DataFrame(in_memory=False).from_feather(self.TEST_FEATHER_FILE)
-        self._sequence(data=data)
+        df = DataFlow().DataFrame(in_memory=False).from_feather(self.TEST_FEATHER_FILE)
+
+        self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
+        self._sequence(data=df)
 
     def test_feather(self):
-        data = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_feather(self.TEST_FEATHER_FILE)
-        self._sequence(data=data)
+        df = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_feather(self.TEST_FEATHER_FILE)
+
+        self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
+        self._sequence(data=df)
 
 
 if __name__ == "__main__":
diff --git a/tests/test_data_flow_hdf.py b/tests/test_data_flow_hdf.py
index cdffefc..8bad9ce 100644
--- a/tests/test_data_flow_hdf.py
+++ b/tests/test_data_flow_hdf.py
@@ -12,17 +12,22 @@ def setUp(self):
         DataFlow().DataFrame().from_csv(self.CSV_FILE).to_hdf(self.TEST_HDF_FILE)
 
     def test_memory(self):
-        data = DataFlow().DataFrame().from_hdf(self.TEST_HDF_FILE)
-        self._sequence(data=data)
-        data.get_data_pandas().equals(DataFlow().DataFrame().from_csv(self.CSV_FILE).get_data_pandas())
+        df = DataFlow().DataFrame().from_hdf(self.TEST_HDF_FILE)
+
+        self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
+        self._sequence(data=df)
 
     def test_parquet(self):
-        data = DataFlow().DataFrame(in_memory=False).from_hdf(self.TEST_HDF_FILE)
-        self._sequence(data=data)
+        df = DataFlow().DataFrame(in_memory=False).from_hdf(self.TEST_HDF_FILE)
+
+        self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
+        self._sequence(data=df)
 
     def test_feather(self):
-        data = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_hdf(self.TEST_HDF_FILE)
-        self._sequence(data=data)
+        df = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_hdf(self.TEST_HDF_FILE)
+
+        self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
+        self._sequence(data=df)
 
 
 if __name__ == "__main__":
diff --git a/tests/test_data_flow_json.py b/tests/test_data_flow_json.py
index b35f556..c5dfc71 100644
--- a/tests/test_data_flow_json.py
+++ b/tests/test_data_flow_json.py
@@ -12,17 +12,19 @@ def setUp(self):
         DataFlow().DataFrame().from_csv(self.CSV_FILE).to_json(self.TEST_JSON_FILE)
 
     def test_memory(self):
-        data = DataFlow().DataFrame().from_json(self.TEST_JSON_FILE)
-        self._sequence(data=data)
-        data.get_data_pandas().equals(DataFlow().DataFrame().from_csv(self.CSV_FILE).get_data_pandas())
+        df = DataFlow().DataFrame().from_json(self.TEST_JSON_FILE)
+        self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
+        self._sequence(data=df)
 
     def test_parquet(self):
-        data = DataFlow().DataFrame(in_memory=False).from_json(self.TEST_JSON_FILE)
-        self._sequence(data=data)
+        df = DataFlow().DataFrame(in_memory=False).from_json(self.TEST_JSON_FILE)
+        self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
+        self._sequence(data=df)
 
     def test_feather(self):
-        data = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_json(self.TEST_JSON_FILE)
-        self._sequence(data=data)
+        df = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_json(self.TEST_JSON_FILE)
+        self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
+        self._sequence(data=df)
 
 
 if __name__ == "__main__":
diff --git a/tests/test_data_flow_parquet.py b/tests/test_data_flow_parquet.py
index 78a26cd..1b1dcee 100644
--- a/tests/test_data_flow_parquet.py
+++ b/tests/test_data_flow_parquet.py
@@ -12,17 +12,22 @@ def setUp(self):
         DataFlow().DataFrame().from_csv(self.CSV_FILE).to_parquet(self.TEST_PARQUET_FILE)
 
     def test_memory(self):
-        data = DataFlow().DataFrame().from_parquet(self.TEST_PARQUET_FILE)
-        self._sequence(data=data)
-        data.get_data_pandas().equals(DataFlow().DataFrame().from_csv(self.CSV_FILE).get_data_pandas())
+        df = DataFlow().DataFrame().from_parquet(self.TEST_PARQUET_FILE)
+
+        self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
+        self._sequence(data=df)
 
     def test_parquet(self):
-        data = DataFlow().DataFrame(in_memory=False).from_parquet(self.TEST_PARQUET_FILE)
-        self._sequence(data=data)
+        df = DataFlow().DataFrame(in_memory=False).from_parquet(self.TEST_PARQUET_FILE)
+
+        self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
+        self._sequence(data=df)
 
     def test_feather(self):
-        data = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_parquet(self.TEST_PARQUET_FILE)
-        self._sequence(data=data)
+        df = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_parquet(self.TEST_PARQUET_FILE)
+
+        self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
+        self._sequence(data=df)
 
 
 if __name__ == "__main__":