distribute_ps_grid.py - Updated - Main portion of the script converted to reusable function. Pytest for the function added.
eciraci · Feb 5, 2024 · c72bf4c · c72bf4c
1 parent f98d5d4
commit c72bf4c
Show file tree

Hide file tree

Showing 2 changed files with 161 additions and 16 deletions.
diff --git a/distribute_ps_grid.py b/distribute_ps_grid.py
@@ -3,39 +3,133 @@
 Written by Enrico Ciraci'
 Use a Spatial Join to distribute the PS points available within
 the boundaries of a CSK frame over the relative along-track grid.
+
+usage: distribute_ps_grid.py [-h] [--out_dir OUT_DIR]
+    [--out_format {parquet,shp}] [--plot] input_file grid_file
+
+Distribute PS points over the CSK grid
+
+positional arguments:
+  input_file            Input file.
+  grid_file             CSK Along Track Grid file.
+
+options:
+  -h, --help            show this help message and exit
+  --out_dir OUT_DIR, -O OUT_DIR
+                        Output directory.
+  --out_format {parquet,shp}, -F {parquet,shp}
+                        Output file format.
+  --plot, -P            Plot the results showing the PS partition.
+
+Python Dependencies
+geopandas: Open source project to make working with geospatial data
+    in python easier: https://geopandas.org
+dask-geopandas: Distributed geospatial operations using Dask:
+    https://dask-geopandas.readthedocs.io
+matplotlib: Comprehensive library for creating static, animated, and
+    interactive visualizations in Python: https://matplotlib.org
 """
 import os
+import argparse
 from datetime import datetime
 import geopandas as gpd
+import dask_geopandas as dgpd
 import matplotlib.pyplot as plt
 
 
+def distribute_ps_grid(input_file: str, grid_file: str) -> gpd.GeoDataFrame:
+    """
+    Use a Spatial Join to distribute the PS points over the CSK grid.
+    Args:
+        input_file: Absolute Path to the input file.
+        grid_file: Absolute Path to the grid file.
+    Returns: Dask-GeoDataFrame of PS points matched to their grid cell.
+    """
+    if not os.path.isfile(input_file):
+        raise FileNotFoundError(f"File not found: {input_file}")
+    # - Import PS Sample Data
+    gdf_smp = dgpd.read_file(input_file, npartitions=4)
+
+    # - Import CSK AlongTrack Grid
+    if not os.path.isfile(grid_file):
+        raise FileNotFoundError(f"File not found: {grid_file}")
+    gdf_csk = gpd.read_file(grid_file)
+
+    # - Print input/output file names
+    print(f"# - Input PS Sample: {input_file}")
+    print(f"# - Input CSK Grid: {grid_file}")
+    print("# - Compute Spatial Join between PS Sample and CSK Grid.")
+
+    # - Compute spatial join between set of points and grid
+    gdf_smp = gdf_smp.sjoin(gdf_csk, how="inner", predicate="within")
+
+    return gdf_smp
+
+
 def main() -> None:
     """
     Use a Spatial Join to distribute the PS points available within
     the boundaries of a CSK frame over the relative along-track grid.
     """
-    # - import sample data
-    smp_input \
-        = os.path.join('.', 'data', 'shapefiles',
-                       'csk_ps_sample_Nocera_Terinese_A_epsg4326.shp')
+    # - Parse command line arguments
+    parser = argparse.ArgumentParser(
+        description="Distribute PS points over the CSK grid"
+    )
+    # - Input file
+    parser.add_argument('input_file', type=str,
+                        help='Input file.')
+    # - Input CSK AT Grid file
+    parser.add_argument('grid_file', type=str,
+                        help='CSK Along Track Grid file.')
+    # - Output directory - default is current working directory
+    parser.add_argument('--out_dir', '-O', type=str,
+                        help='Output directory.', default=os.getcwd())
+    # - Output file format
+    parser.add_argument('--out_format', '-F', type=str,
+                        help='Output file format.', default='parquet',
+                        choices=['parquet', 'shp'])
+    # - Plot Intermediate Results
+    parser.add_argument('--plot', '-P', action='store_true',
+                        help='Plot the results showing the PS partition.')
+    args = parser.parse_args()
 
-    # - read sample data
-    gdf_smp = gpd.read_file(smp_input)
+    # - import sample data
+    smp_input = args.input_file
 
     # - Import CSK Along Track Grid
-    csk_at_grid \
-        = os.path.join('.', 'data', 'shapefiles',
-                       'grid_CSG2_151_STR-007_ASC.shp')
-    gdf_csk = gpd.read_file(csk_at_grid)
+    csk_at_grid = args.grid_file
 
-    # - Compute spatial join between set of points and grid
-    gdf_smp = gpd.sjoin(gdf_smp, gdf_csk, how="inner", op="within")
+    # - Distribute PS points over the CSK grid
+    gdf_smp = distribute_ps_grid(smp_input, csk_at_grid
+                                 )
+    # - Drop unnecessary columns
+    print("# - Drop unnecessary columns & Convert Dask-GeoDataFrame "
+          "to GeoDataFrame.")
+    gdf_smp = gdf_smp.drop(columns=['index_right', 'type', 'rand_point',
+                                    'index', 'name',  'csm_path'])
+    gdf_smp = gdf_smp.reset_index(drop=True)
+    gdf_smp = gdf_smp.compute()
+
+    # - Save the results
+    print("# - Save the results.")
+    out_dir = args.out_dir
+    os.makedirs(out_dir, exist_ok=True)
+    out_file \
+        = os.path.join(out_dir, os.path.basename(smp_input)
+                       .replace('.shp', f'_rc.{args.out_format}'))
+
+    if args.out_format == 'shp':
+        gdf_smp.to_file(out_file)
+    else:
+        if os.path.isfile(out_file):
+            os.remove(out_file)
+        gdf_smp.to_parquet(out_file)
 
-    # - Show results of the spatial join
-    fig, ax = plt.subplots()
-    gdf_smp.plot(ax=ax, c=gdf_smp['row'], cmap='viridis', legend=True)
-    plt.show()
+    if args.plot:
+        # - Plot the results
+        fig, ax = plt.subplots()
+        gdf_smp.plot(ax=ax, c=gdf_smp['row'], cmap='viridis', legend=True)
+        plt.show()
 
 
 # - run main program

diff --git a/test/test_distribute_ps_grid.py b/test/test_distribute_ps_grid.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+""" Unit tests for distribute_ps_grid.py. """
+
+import os
+import pytest
+import time
+import dask_geopandas as dgpd
+from distribute_ps_grid import distribute_ps_grid
+
+
+def test_distribute_ps_grid():
+    # - import sample data
+    input_file \
+        = os.path.join('..', 'data', 'shapefiles',
+                       'csk_ps_sample_Nocera_Terinese_A_epsg4326.shp')
+
+    # - Import CSK Along Track Grid
+    grid_file \
+        = os.path.join('..', 'data', 'shapefiles',
+                       'grid_CSG2_151_STR-007_ASC.shp')
+    # -  Call the function
+    result = distribute_ps_grid(input_file, grid_file)
+
+    # - The spatial join must return a Dask-GeoDataFrame
+    assert isinstance(result, dgpd.GeoDataFrame)
+
+
+def test_invalid_files():
+    with pytest.raises(FileNotFoundError):
+        distribute_ps_grid("nonexistent_file.shp", "grid_file.shp")
+
+
+def test_large_dataset_performance():
+    # - Start the timer before loading the inputs and running the join
+    start_time = time.time()
+    # - import sample data
+    input_file \
+        = os.path.join('..', 'data', 'shapefiles',
+                       'csk_ps_sample_Nocera_Terinese_A_epsg4326.shp')
+
+    # - Import CSK Along Track Grid
+    grid_file \
+        = os.path.join('..', 'data', 'shapefiles',
+                       'grid_CSG2_151_STR-007_ASC.shp')
+    # -  Call the function
+    _ = distribute_ps_grid(input_file, grid_file)
+
+    end_time = time.time()
+
+    # - The whole call must complete in under one second
+    assert end_time - start_time < 1    # seconds