diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..2ef0dca --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,11 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [0.0.1] - 2024-10-16 + +### Added +- initial version diff --git a/Makefile b/Makefile index b13ad83..83d13b6 100644 --- a/Makefile +++ b/Makefile @@ -25,3 +25,6 @@ upload-test:: upload:: $(MAKE) build . venv/bin/activate && python -m twine upload -u $${PYPI_USER} -p $${PYPI_PASS} --verbose dist/* + +docs:: + venv/bin/pdoc mysiar_data_flow/ -o docs/ diff --git a/README.md b/README.md index be0f7d6..4a86d1d 100644 --- a/README.md +++ b/README.md @@ -21,17 +21,9 @@ library to manipulate data -## Installation instructions +## Installation ```sh pip install mysiar-data-flow ``` -## DataFlow.DataFrame - -### Usage -For now check [mysiar_data_flow/data_flow.py](mysiar_data_flow/data_flow.py) file for interface - - - -![work in progress](.github/5578703.png) diff --git a/Usage.md b/Usage.md new file mode 100644 index 0000000..1da0a52 --- /dev/null +++ b/Usage.md @@ -0,0 +1,10 @@ +# Usage + +## DataFlow.DataFrame + + +For now, see the [mysiar_data_flow/data_flow.py](https://github.com/mysiar-org/python-data-flow/blob/master/mysiar_data_flow/data_flow.py) file for the interface. + + + +![work in progress](.github/5578703.png) diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 0000000..72e8ffc --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1 @@ +* diff --git a/mysiar_data_flow/__init__.py b/mysiar_data_flow/__init__.py index b98e3ed..9f82913 100644 --- a/mysiar_data_flow/__init__.py +++ b/mysiar_data_flow/__init__.py @@ -1 +1,7 @@ +""" + .. include:: ../README.md + .. include:: ../Usage.md + .. 
include:: ../CHANGELOG.md +""" + from .data_flow import DataFlow diff --git a/mysiar_data_flow/data_flow.py b/mysiar_data_flow/data_flow.py index fc1994a..5352368 100644 --- a/mysiar_data_flow/data_flow.py +++ b/mysiar_data_flow/data_flow.py @@ -53,7 +53,7 @@ def __del__(self): if not self.__in_memory: delete_file(self.__filename) - def from_fireducks(self, df: fd.DataFrame): + def from_fireducks(self, df: fd.DataFrame) -> "DataFlow.DataFrame": if self.__in_memory: self.__data = df else: @@ -66,7 +66,7 @@ def to_fireducks(self) -> fd.DataFrame: else: return to_fireducks_from_file(tmp_filename=self.__filename, file_type=self.__file_type) - def from_pandas(self, df: pd.DataFrame): + def from_pandas(self, df: pd.DataFrame) -> "DataFlow.DataFrame": if self.__in_memory: self.__data = fd.from_pandas(df) else: @@ -79,7 +79,7 @@ def to_pandas(self) -> pd.DataFrame: else: return to_fireducks_from_file(tmp_filename=self.__filename, file_type=self.__file_type).to_pandas() - def from_polars(self, df: pl.DataFrame): + def from_polars(self, df: pl.DataFrame) -> "DataFlow.DataFrame": if self.__in_memory: self.__data = fd.from_pandas(df.to_pandas()) else: @@ -94,70 +94,70 @@ def to_polars(self) -> pl.DataFrame: to_fireducks_from_file(tmp_filename=self.__filename, file_type=self.__file_type).to_pandas() ) - def from_csv(self, filename: str): + def from_csv(self, filename: str) -> "DataFlow.DataFrame": if self.__in_memory: self.__data = fd.read_csv(filename) else: from_csv_2_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type) return self - def to_csv(self, filename: str, index=False): + def to_csv(self, filename: str, index=False) -> "DataFlow.DataFrame": if self.__in_memory: self.__data.to_csv(filename, index=index) else: to_csv_from_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type) return self - def from_feather(self, filename: str): + def from_feather(self, filename: str) -> "DataFlow.DataFrame": if self.__in_memory: 
self.__data = fd.from_pandas(feather.read_feather(filename)) else: from_feather_2_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type) return self - def to_feather(self, filename: str): + def to_feather(self, filename: str) -> "DataFlow.DataFrame": if self.__in_memory: self.__data.to_feather(filename) else: to_feather_from_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type) return self - def from_parquet(self, filename: str): + def from_parquet(self, filename: str) -> "DataFlow.DataFrame": if self.__in_memory: self.__data = fd.read_parquet(filename) else: from_parquet_2_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type) return self - def to_parquet(self, filename: str): + def to_parquet(self, filename: str) -> "DataFlow.DataFrame": if self.__in_memory: self.__data.to_parquet(filename) else: to_parquet_from_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type) return self - def from_json(self, filename: str): + def from_json(self, filename: str) -> "DataFlow.DataFrame": if self.__in_memory: self.__data = fd.read_json(filename) else: from_json_2_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type) return self - def to_json(self, filename: str): + def to_json(self, filename: str) -> "DataFlow.DataFrame": if self.__in_memory: self.__data.to_json(filename) else: to_json_from_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type) return self - def from_hdf(self, filename: str): + def from_hdf(self, filename: str) -> "DataFlow.DataFrame": if self.__in_memory: self.__data = fd.read_hdf(filename) else: from_hdf_2_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type) return self - def to_hdf(self, filename: str, key: str = "key"): + def to_hdf(self, filename: str, key: str = "key") -> "DataFlow.DataFrame": if self.__in_memory: self.__data.to_hdf(path_or_buf=filename, key=key) 
else: @@ -165,12 +165,17 @@ def to_hdf(self, filename: str, key: str = "key"): return self def columns(self) -> list: + """ + lists columns in data frame + + :return: list - list of columns in data frame + """ if self.__in_memory: return self.__data.columns.to_list() else: return data_get_columns(tmp_filename=self.__filename, file_type=self.__file_type) - def columns_delete(self, columns: list): + def columns_delete(self, columns: list) -> "DataFlow.DataFrame": if self.__in_memory: self.__data.drop(columns=columns, inplace=True) else: @@ -178,7 +183,13 @@ def columns_delete(self, columns: list): return self - def columns_rename(self, columns_mapping: dict): + def columns_rename(self, columns_mapping: dict) -> "DataFlow.DataFrame": + """ + rename columns + + :param columns_mapping: dict - old_name: new_name pairs ex. {"Year": "year", "Units": "units"} + :return: DataFlow.DataFrame - self + """ if self.__in_memory: self.__data.rename(columns=columns_mapping, inplace=True) else: @@ -189,13 +200,19 @@ def columns_rename(self, columns_mapping: dict): ) return self - def columns_select(self, columns: list): + def columns_select(self, columns: list) -> "DataFlow.DataFrame": + """ + columns select - columns to keep in data frame + :param columns: list - names of columns to keep + :return: DataFlow.DataFrame - self + """ if self.__in_memory: self.__data = self.__data[columns] else: data_select_columns(tmp_filename=self.__filename, file_type=self.__file_type, columns=columns) + return self - def filter_on_column(self, column: str, value: Any, operator: Operator): + def filter_on_column(self, column: str, value: Any, operator: Operator) -> "DataFlow.DataFrame": if self.__in_memory: match operator: case Operator.Eq: @@ -218,3 +235,4 @@ def filter_on_column(self, column: str, value: Any, operator: Operator): value=value, operator=operator, ) + return self diff --git a/requirements.dev.txt b/requirements.dev.txt index 9440ad0..dc5b93e 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -4,4 +4,5 @@ pyproject-flake8 pytest pytest-cov poetry -twine \ 
No newline at end of file +twine +pdoc \ No newline at end of file