Skip to content

Commit

Permalink
dev: small refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
mysiar committed Oct 13, 2024
1 parent c60172c commit b2fbfe9
Show file tree
Hide file tree
Showing 14 changed files with 108 additions and 130 deletions.
12 changes: 7 additions & 5 deletions .github/workflows/ci.yaml → .github/workflows/tests.yaml
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
name: Python data-flow
name: Python data-flow Tests

on: [push]
on: [ push ]

jobs:
build:
tests:

runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.10", "3.11", "3.12"]
python-version: [ "3.10", "3.11", "3.12" ]

steps:
- uses: actions/checkout@v4
Expand All @@ -22,4 +22,6 @@ jobs:
- name: Install modules
run: pip install -r requirements.txt && pip install -r requirements.dev.txt
- name: Tests
run: PYTHONPATH=. pytest --cov=data_flow --cov-report term
run: PYTHONPATH=. pytest --cov=data_flow --cov-report term
- name: Lint
run: flake8 data_flow/
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ pip::
venv/bin/pip install -r requirements.dev.txt

tests::
PYTHONPATH=. venv/bin/pytest -rP tests/ -vvv --cov=data_flow --cov-report html --cov-report term
PYTHONPATH=. venv/bin/pytest --cov=data_flow --cov-report html --cov-report term -rP tests/ -vvv

lint::
venv/bin/flake8 data_flow/
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# DataFlow

![tests](https://github.com/mysiar-org/python-data-flow/actions/workflows/ci.yaml/badge.svg)
![tests](https://github.com/mysiar-org/python-data-flow/actions/workflows/tests.yaml/badge.svg)
[![Python 3.10](https://img.shields.io/badge/python-3.10-blue.svg)](https://www.python.org/downloads/release/python-3100/)
[![Python 3.11](https://img.shields.io/badge/python-3.11-blue.svg)](https://www.python.org/downloads/release/python-3110/)
[![Python 3.12](https://img.shields.io/badge/python-3.12-blue.svg)](https://www.python.org/downloads/release/python-3120/)
Expand Down
13 changes: 11 additions & 2 deletions data_flow/data_flow.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import fireducks.pandas as fd
import pandas as pd
import polars as pl
from pyarrow import feather

from data_flow.lib import FileType
Expand Down Expand Up @@ -44,18 +45,26 @@ def __del__(self):
if not self.__in_memory:
delete_file(self.__filename)

def get_data_fireducks(self) -> fd.DataFrame:
def to_fireducks(self) -> fd.DataFrame:
if self.__in_memory:
return self.__data
else:
return df_from_tmp_filename(tmp_filename=self.__filename, file_type=self.__file_type)

def get_data_pandas(self) -> pd.DataFrame:
def to_pandas(self) -> pd.DataFrame:
if self.__in_memory:
return self.__data.to_pandas()
else:
return df_from_tmp_filename(tmp_filename=self.__filename, file_type=self.__file_type).to_pandas()

def to_polars(self) -> pl.DataFrame:
    """Return the held data as a polars DataFrame.

    The source frame is either the in-memory ``self.__data`` or the frame
    loaded by ``df_from_tmp_filename`` from ``self.__filename``; in both
    cases it is converted to pandas first and then handed to
    ``pl.from_pandas``.
    """
    if self.__in_memory:
        source_frame = self.__data
    else:
        source_frame = df_from_tmp_filename(tmp_filename=self.__filename, file_type=self.__file_type)
    # Both branches go through pandas before reaching polars.
    return pl.from_pandas(source_frame.to_pandas())

def from_csv(self, filename: str):
if self.__in_memory:
self.__data = fd.read_csv(filename)
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
fireducks
tables
pyarrow
pandas
pandas
polars
7 changes: 6 additions & 1 deletion tests/BaseTestCase.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import unittest
from zipfile import ZipFile

import pandas as pd


class BaseTestCase(unittest.TestCase):
def setUp(self):
zip = ZipFile(self.ZIP_FILE).extractall("./tests/data")
ZipFile(self.ZIP_FILE).extractall("./tests/data")

ZIP_FILE = "./tests/data/annual-enterprise-survey-2023-financial-year-provisional.zip"
CSV_FILE = "./tests/data/annual-enterprise-survey-2023-financial-year-provisional.csv"
Expand All @@ -13,3 +15,6 @@ def setUp(self):
TEST_CSV_FILE = "/tmp/data-flow.csv"
TEST_JSON_FILE = "/tmp/data-flow.json"
TEST_HDF_FILE = "/tmp/data-flow.h5"

def assertPandasEqual(self, df1: pd.DataFrame, df2: pd.DataFrame):
    """Assert that ``df1.equals(df2)`` is true.

    On mismatch the test fails through ``assertTrue`` with the message
    "Pandas DataFrames are not equal !".
    """
    frames_match = df1.equals(df2)
    self.assertTrue(frames_match, msg="Pandas DataFrames are not equal !")
1 change: 1 addition & 0 deletions tests/SequenceTestCase.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ def _sequence(self, data: DataFlow.DataFrame) -> None:
"Variable_category",
]
)

self.assertEqual(3, len(data.columns()))
self.assertListEqual(["Year", "Units", "Value"], data.columns())

Expand Down
22 changes: 22 additions & 0 deletions tests/test_base_test_case.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import unittest

import pandas as pd

from tests.BaseTestCase import BaseTestCase


class BaseTestCaseTestCase(BaseTestCase):
    """Exercises the ``assertPandasEqual`` helper provided by ``BaseTestCase``."""

    def test_assert_pandas_equal(self):
        """Equal frames pass; differing frames raise with the helper's message."""
        numeric_columns = {"col1": [1, 2, 3], "col2": [4, 5, 6]}
        df1 = pd.DataFrame(numeric_columns)
        df2 = pd.DataFrame(numeric_columns)
        df3 = pd.DataFrame({"Name": ["Tom", "nick", "krish", "jack"], "Age": [20, 21, 19, 18]})

        # Two frames built from the same data must compare equal.
        self.assertPandasEqual(df1, df2)

        expected_message = "False is not true : Pandas DataFrames are not equal !"
        with self.assertRaises(AssertionError) as context:
            self.assertPandasEqual(df1, df3)
        # Checked after the with-block: statements following the raising call
        # inside the block would never execute.
        self.assertEqual(str(context.exception), expected_message)


if __name__ == "__main__":
unittest.main()
83 changes: 0 additions & 83 deletions tests/test_data_flow.py

This file was deleted.

20 changes: 12 additions & 8 deletions tests/test_data_flow_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,29 +6,33 @@

class DataFlowCSVTestCase(SequenceTestCase):
def test_memory(self):
data = (
df = (
DataFlow().DataFrame().from_csv("./tests/data/annual-enterprise-survey-2023-financial-year-provisional.csv")
)
self._sequence(data=data)
df.to_csv(self.TEST_CSV_FILE)

data.to_csv(self.TEST_CSV_FILE)
data.get_data_pandas().equals(DataFlow().DataFrame().from_csv(self.TEST_CSV_FILE).get_data_pandas())
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
self._sequence(data=df)

def test_parquet(self):
data = (
df = (
DataFlow()
.DataFrame(in_memory=False)
.from_csv("./tests/data/annual-enterprise-survey-2023-financial-year-provisional.csv")
)
self._sequence(data=data)

self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
self._sequence(data=df)

def test_feather(self):
data = (
df = (
DataFlow()
.DataFrame(in_memory=False, file_type=FileType.feather)
.from_csv("./tests/data/annual-enterprise-survey-2023-financial-year-provisional.csv")
)
self._sequence(data=data)

self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
self._sequence(data=df)


if __name__ == "__main__":
Expand Down
19 changes: 12 additions & 7 deletions tests/test_data_flow_feather.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,22 @@ def setUp(self):
DataFlow().DataFrame().from_csv(self.CSV_FILE).to_feather(self.TEST_FEATHER_FILE)

def test_memory(self):
data = DataFlow().DataFrame().from_feather(self.TEST_FEATHER_FILE)
self._sequence(data=data)
data.get_data_pandas().equals(DataFlow().DataFrame().from_csv(self.CSV_FILE).get_data_pandas())
df = DataFlow().DataFrame().from_feather(self.TEST_FEATHER_FILE)

self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
self._sequence(data=df)

def test_parquet(self):
data = DataFlow().DataFrame(in_memory=False).from_feather(self.TEST_FEATHER_FILE)
self._sequence(data=data)
df = DataFlow().DataFrame(in_memory=False).from_feather(self.TEST_FEATHER_FILE)

self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
self._sequence(data=df)

def test_feather(self):
data = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_feather(self.TEST_FEATHER_FILE)
self._sequence(data=data)
df = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_feather(self.TEST_FEATHER_FILE)

self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
self._sequence(data=df)


if __name__ == "__main__":
Expand Down
19 changes: 12 additions & 7 deletions tests/test_data_flow_hdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,22 @@ def setUp(self):
DataFlow().DataFrame().from_csv(self.CSV_FILE).to_hdf(self.TEST_HDF_FILE)

def test_memory(self):
data = DataFlow().DataFrame().from_hdf(self.TEST_HDF_FILE)
self._sequence(data=data)
data.get_data_pandas().equals(DataFlow().DataFrame().from_csv(self.CSV_FILE).get_data_pandas())
df = DataFlow().DataFrame().from_hdf(self.TEST_HDF_FILE)

self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
self._sequence(data=df)

def test_parquet(self):
data = DataFlow().DataFrame(in_memory=False).from_hdf(self.TEST_HDF_FILE)
self._sequence(data=data)
df = DataFlow().DataFrame(in_memory=False).from_hdf(self.TEST_HDF_FILE)

self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
self._sequence(data=df)

def test_feather(self):
data = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_hdf(self.TEST_HDF_FILE)
self._sequence(data=data)
df = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_hdf(self.TEST_HDF_FILE)

self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
self._sequence(data=df)


if __name__ == "__main__":
Expand Down
16 changes: 9 additions & 7 deletions tests/test_data_flow_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,19 @@ def setUp(self):
DataFlow().DataFrame().from_csv(self.CSV_FILE).to_json(self.TEST_JSON_FILE)

def test_memory(self):
data = DataFlow().DataFrame().from_json(self.TEST_JSON_FILE)
self._sequence(data=data)
data.get_data_pandas().equals(DataFlow().DataFrame().from_csv(self.CSV_FILE).get_data_pandas())
df = DataFlow().DataFrame().from_json(self.TEST_JSON_FILE)
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
self._sequence(data=df)

def test_parquet(self):
data = DataFlow().DataFrame(in_memory=False).from_json(self.TEST_JSON_FILE)
self._sequence(data=data)
df = DataFlow().DataFrame(in_memory=False).from_json(self.TEST_JSON_FILE)
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
self._sequence(data=df)

def test_feather(self):
data = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_json(self.TEST_JSON_FILE)
self._sequence(data=data)
df = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_json(self.TEST_JSON_FILE)
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
self._sequence(data=df)


if __name__ == "__main__":
Expand Down
19 changes: 12 additions & 7 deletions tests/test_data_flow_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,22 @@ def setUp(self):
DataFlow().DataFrame().from_csv(self.CSV_FILE).to_parquet(self.TEST_PARQUET_FILE)

def test_memory(self):
data = DataFlow().DataFrame().from_parquet(self.TEST_PARQUET_FILE)
self._sequence(data=data)
data.get_data_pandas().equals(DataFlow().DataFrame().from_csv(self.CSV_FILE).get_data_pandas())
df = DataFlow().DataFrame().from_parquet(self.TEST_PARQUET_FILE)

self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
self._sequence(data=df)

def test_parquet(self):
data = DataFlow().DataFrame(in_memory=False).from_parquet(self.TEST_PARQUET_FILE)
self._sequence(data=data)
df = DataFlow().DataFrame(in_memory=False).from_parquet(self.TEST_PARQUET_FILE)

self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
self._sequence(data=df)

def test_feather(self):
data = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_parquet(self.TEST_PARQUET_FILE)
self._sequence(data=data)
df = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_parquet(self.TEST_PARQUET_FILE)

self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
self._sequence(data=df)


if __name__ == "__main__":
Expand Down

0 comments on commit b2fbfe9

Please sign in to comment.