fwitmer · Codingbysid · Dec 29, 2025 · Dec 29, 2025
diff --git a/batchprocess.py b/batchprocess.py
@@ -14,21 +14,22 @@
 # TODO: Add method to determine if files have already been processed to prevent reprocessing
 # TODO: Add optional flag to reprocess all images
 files_to_be_processed = []
-for root, subdirs, files in os.walk(rootdir + "/data"):
+for root, subdirs, files in os.walk(os.path.join(rootdir, "data")):
     # skip any files found in the output directory
     if root.find("output") != -1:
         continue
     # have files, check to see if they're the ones we want
     if files:
         for f in files:
             if f.endswith(".tif") and f.find("_SR_") != -1:
-                pathname = root + "/" + f
+                pathname = os.path.join(root, f)
                 files_to_be_processed.append((pathname, f)) # appending (path, filename)
 
 # if we have files, check output directory structure
 if files_to_be_processed:
-    if not os.path.isdir("data/output"):
-        os.mkdir("data/output")
+    output_dir = os.path.join("data", "output")
+    if not os.path.isdir(output_dir):
+        os.mkdir(output_dir)
 
     file_groups = {}
 
@@ -52,10 +53,12 @@
 
 
         # process directory structure if not already made
-        if not os.path.isdir("data/output/{}".format(file_year)):
-            os.mkdir("data/output/{}".format(file_year))
-        if not os.path.isdir("data/output/{}/{}".format(file_year, file_month)):
-            os.mkdir("data/output/{}/{}".format(file_year, file_month))
+        year_dir = os.path.join("data", "output", file_year)
+        if not os.path.isdir(year_dir):
+            os.mkdir(year_dir)
+        month_dir = os.path.join("data", "output", file_year, file_month)
+        if not os.path.isdir(month_dir):
+            os.mkdir(month_dir)
 
         # outfile_base = f[1][:15] + "_AnalyticMS_SR"
 
@@ -78,19 +81,21 @@
                                  "height": mosaic.shape[1],
                                  "width": mosaic.shape[2],
                                  "transform": out_transform})
-                out_fp = "data/output/{year}/{month}/{year}{month}{day}_AnalyticMS_SR_merged.tif".format(year=year, month=month, day=day)
+                out_filename = "{year}{month}{day}_AnalyticMS_SR_merged.tif".format(year=year, month=month, day=day)
+                out_fp = os.path.join("data", "output", year, month, out_filename)
                 with rasterio.open(out_fp, "w", **out_meta) as dst:
                     dst.write(mosaic)
 
 # process the merged files
-for root, subdirs, files in os.walk(rootdir + "/data/output"):
+for root, subdirs, files in os.walk(os.path.join(rootdir, "data", "output")):
     if files:
         for f in files:
             if f.find("NDWI") != -1:
                 continue
-            pathname = root + "/"
+            pathname = root
             outfile_base = f.split(sep=".")[0]
-            ndwi_outfile = pathname + outfile_base + "_NDWI.tif"
-            ndwi_class_outfile = pathname + outfile_base + "_NDWI_classified.tif"
-            ndwi = rt.calculate_ndwi(pathname + f, ndwi_outfile, plot=False)
+            ndwi_outfile = os.path.join(pathname, outfile_base + "_NDWI.tif")
+            ndwi_class_outfile = os.path.join(pathname, outfile_base + "_NDWI_classified.tif")
+            input_file = os.path.join(pathname, f)
+            ndwi = rt.calculate_ndwi(input_file, ndwi_outfile, plot=False)
             ndwi_class = rt.ndwi_classify(ndwi_outfile, ndwi_class_outfile, plot=False)
diff --git a/data_preprocessing.py b/data_preprocessing.py
@@ -6,9 +6,11 @@
 import numpy as np
 import os
 import glob
+import argparse
+import sys
 
 # adapted from https://gis.stackexchange.com/questions/285499/how-to-split-multiband-image-into-image-tiles-using-rasterio
-def make_tiles(image, tile_height=512, tile_width=512, skip_no_data=False):
+def make_tiles(image, tile_height=512, tile_width=512, skip_no_data=False, output_dir="data/tiles"):
     with rio.open(image) as src:
         filepath, filename = os.path.split(image)
         file_base, file_extension = os.path.splitext(filename)
@@ -32,7 +34,9 @@ def make_tiles(image, tile_height=512, tile_width=512, skip_no_data=False):
                 if 0 in window_data[..., :-1]:
                     continue
             out_name = file_base + "_" + str(i + 1).zfill(2) + "-of-" + str(len(tiles)) + file_extension
-            out_path = os.path.join("data/tiles/", out_name)
+            out_path = os.path.join(output_dir, out_name)
+            # Create output directory if it doesn't exist
+            os.makedirs(output_dir, exist_ok=True)
             with rio.open(out_path, 'w', **meta) as dst:
                 dst.write(src.read(window=window))
 
@@ -48,20 +52,26 @@ def _flip_bands(bands):
 # takes the path to all image tiles and creates tiles that are rotated 90°, 180° and 270° as well as their flipped counterparts
 # this results in 8 tiles for every input tile (including the input tile)
 def augment_tiles(tile_path):
-    files = glob.glob(tile_path + "*.tif")
-    files = set(files) - set(glob.glob(tile_path + "*rot*"))
-    files = set(files) - set(glob.glob(tile_path + "*flip*")) 
+    # Normalize tile_path and ensure it's a directory path
+    tile_path = os.path.normpath(tile_path)
+    if not os.path.isdir(tile_path):
+        raise ValueError(f"Tile path is not a valid directory: {tile_path}")
+
+    # Use os.path.join for glob patterns (works cross-platform)
+    files = glob.glob(os.path.join(tile_path, "*.tif"))
+    files = set(files) - set(glob.glob(os.path.join(tile_path, "*rot*")))
+    files = set(files) - set(glob.glob(os.path.join(tile_path, "*flip*"))) 
     for file in files:
         filename = os.path.basename(file)
         file_base, file_extension = os.path.splitext(filename)
         # generating filepaths for new tiles
-        path_90 = tile_path + file_base + "_rot90" + file_extension
-        path_180 = tile_path + file_base + "_rot180" + file_extension
-        path_270 = tile_path + file_base + "_rot270" + file_extension
-        path_flip_name = tile_path + file_base + "_flip" + file_extension
-        path_flip_90 = tile_path + file_base + "_rot90_flip" + file_extension
-        path_flip_180 = tile_path + file_base + "_rot180_flip" + file_extension
-        path_flip_270 = tile_path + file_base + "_rot270_flip" + file_extension
+        path_90 = os.path.join(tile_path, file_base + "_rot90" + file_extension)
+        path_180 = os.path.join(tile_path, file_base + "_rot180" + file_extension)
+        path_270 = os.path.join(tile_path, file_base + "_rot270" + file_extension)
+        path_flip_name = os.path.join(tile_path, file_base + "_flip" + file_extension)
+        path_flip_90 = os.path.join(tile_path, file_base + "_rot90_flip" + file_extension)
+        path_flip_180 = os.path.join(tile_path, file_base + "_rot180_flip" + file_extension)
+        path_flip_270 = os.path.join(tile_path, file_base + "_rot270_flip" + file_extension)
 
         with rio.open(file, driver="GTiff") as src:
             # band_1 = src.read(1)
@@ -85,10 +95,98 @@ def augment_tiles(tile_path):
 
 # example usage
 if __name__ == '__main__':
-    files = glob.glob("data/labeled_inputs/*.tif")
-
+    parser = argparse.ArgumentParser(
+        description='Preprocess images by creating tiles and augmenting them',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog='''
+Examples:
+  python data_preprocessing.py --input_dir data/labeled_inputs --output_dir data/tiles
+
+  python data_preprocessing.py --input_dir ./images --output_dir ./tiles --augment
+        '''
+    )
+    parser.add_argument(
+        '--input_dir',
+        type=str,
+        default='data/labeled_inputs',
+        help='Directory containing input images to tile. Default: data/labeled_inputs'
+    )
+    parser.add_argument(
+        '--output_dir',
+        type=str,
+        default='data/tiles',
+        help='Directory for output tiles. Default: data/tiles'
+    )
+    parser.add_argument(
+        '--augment',
+        action='store_true',
+        help='Run tile augmentation after creating tiles'
+    )
+    parser.add_argument(
+        '--tile_height',
+        type=int,
+        default=512,
+        help='Height of tiles in pixels. Default: 512'
+    )
+    parser.add_argument(
+        '--tile_width',
+        type=int,
+        default=512,
+        help='Width of tiles in pixels. Default: 512'
+    )
+    parser.add_argument(
+        '--skip_no_data',
+        action='store_true',
+        help='Skip tiles with no data values'
+    )
+    parser.add_argument(
+        '--max_workers',
+        type=int,
+        default=6,
+        help='Maximum number of worker threads. Default: 6'
+    )
+
+    args = parser.parse_args()
+
+    # Check if input directory exists
+    if not os.path.isdir(args.input_dir):
+        print(f"ERROR: Input directory does not exist: {args.input_dir}", file=sys.stderr)
+        print("Please specify a valid directory using --input_dir argument.", file=sys.stderr)
+        sys.exit(1)
+
+    # Create output directory if it doesn't exist
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    # Find all .tif files in input directory
+    input_pattern = os.path.join(args.input_dir, "*.tif")
+    files = glob.glob(input_pattern)
+
+    if not files:
+        print(f"No .tif files found in {args.input_dir}", file=sys.stderr)
+        sys.exit(1)
+
+    print(f"Found {len(files)} files to process")
+    print(f"Output directory: {args.output_dir}")
+
     from concurrent.futures import ThreadPoolExecutor
-    with ThreadPoolExecutor(max_workers=6) as p:
-        p.map(make_tiles, files)
-
-# augment_tiles("data/tiles/")
+
+    # Create tiles
+    def process_file(file):
+        make_tiles(
+            file,
+            tile_height=args.tile_height,
+            tile_width=args.tile_width,
+            skip_no_data=args.skip_no_data,
+            output_dir=args.output_dir
+        )
+
+    with ThreadPoolExecutor(max_workers=args.max_workers) as p:
+        p.map(process_file, files)
+
+    print("Tile creation complete!")
+
+    # Augment tiles if requested
+    if args.augment:
+        print("Starting tile augmentation...")
+        augment_tiles(args.output_dir)
+        print("Tile augmentation complete!")
diff --git a/process_snap_data.py b/process_snap_data.py
@@ -28,6 +28,8 @@
 from datetime import datetime, date, timedelta
 import time
 import math
+import argparse
+import sys
 
 # WKT Used to convert Lat/Lon to projection used in snap data
 CRS_WKT = 'PROJCS["unnamed",GEOGCS["unnamed ellipse",DATUM["unknown",SPHEROID["unnamed",6370000,0]],PRIMEM["Greenwich",' \
@@ -39,19 +41,6 @@
 # Center of region in lat/lon
 REGION_CENTER = (66.0756, -162.7172)
 
-# Filepaths for SNAP data
-
-TSK_FILEPATH = '/usr/local/coastal/snap_processing/snap_data/tsk'
-U10_FILEPATH = '/usr/local/coastal/snap_processing/snap_data/u10'
-V10_FILEPATH = '/usr/local/coastal/snap_processing/snap_data/v10'
-SEAICE_FILEPATH = '/usr/local/coastal/snap_processing/snap_data/seaice'
-PSFC_FILEPATH = '/usr/local/coastal/snap_processing/snap_data/psfc'
-T2_FILEPATH = '/usr/local/coastal/snap_processing/snap_data/t2'
-TRANSECT_FILEPATH = '/usr/local/coastal/snap_processing/transect_data/WestChukchi_exposed_STepr_rates.shp'
-
-# Filepath for I/O csv data
-OUTPUT_FILEPATH = '/usr/local/coastal/snap_processing/snap_output/SNAP_daily_by_transect'
-
 # Create global variables
 global transformer_to_snap_proj
 global transformer_to_lat_lon
@@ -112,7 +101,7 @@ def read_data(filepath):
     if os.path.isdir(filepath):
         for filename in os.scandir(filepath):
             try:
-                data.append(nc.Dataset(filepath + '/' + filename.name))
+                data.append(nc.Dataset(os.path.join(filepath, filename.name)))
             except OSError:
                 pass
 
@@ -288,6 +277,91 @@ def calculate_wind_data(dataframe):
 
 if __name__ == '__main__':
 
+    # Parse command-line arguments
+    parser = argparse.ArgumentParser(
+        description='Process SNAP climate data for coastline extraction',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog='''
+Examples:
+  python process_snap_data.py --snap_data_dir ./data/snap_data --transect_file ./data/transects/WestChukchi_exposed_STepr_rates.shp --output_dir ./data/output
+
+  python process_snap_data.py --snap_data_dir /usr/local/coastal/snap_processing/snap_data --transect_file /usr/local/coastal/snap_processing/transect_data/WestChukchi_exposed_STepr_rates.shp --output_dir /usr/local/coastal/snap_processing/snap_output
+        '''
+    )
+    parser.add_argument(
+        '--snap_data_dir',
+        type=str,
+        default='./data/snap_data',
+        help='Base directory containing SNAP data subdirectories (tsk, u10, v10, seaice, psfc, t2). Default: ./data/snap_data'
+    )
+    parser.add_argument(
+        '--transect_file',
+        type=str,
+        default='./data/transect_data/WestChukchi_exposed_STepr_rates.shp',
+        help='Path to transect shapefile. Default: ./data/transect_data/WestChukchi_exposed_STepr_rates.shp'
+    )
+    parser.add_argument(
+        '--output_dir',
+        type=str,
+        default='./data/snap_output',
+        help='Directory for output CSV files. Default: ./data/snap_output'
+    )
+    parser.add_argument(
+        '--output_prefix',
+        type=str,
+        default='SNAP_daily_by_transect',
+        help='Prefix for output CSV filenames. Default: SNAP_daily_by_transect'
+    )
+
+    args = parser.parse_args()
+
+    # Construct filepaths from arguments
+    SNAP_DATA_DIR = args.snap_data_dir
+    TRANSECT_FILEPATH = args.transect_file
+    OUTPUT_DIR = args.output_dir
+    OUTPUT_PREFIX = args.output_prefix
+
+    # Construct individual data filepaths
+    TSK_FILEPATH = os.path.join(SNAP_DATA_DIR, 'tsk')
+    U10_FILEPATH = os.path.join(SNAP_DATA_DIR, 'u10')
+    V10_FILEPATH = os.path.join(SNAP_DATA_DIR, 'v10')
+    SEAICE_FILEPATH = os.path.join(SNAP_DATA_DIR, 'seaice')
+    PSFC_FILEPATH = os.path.join(SNAP_DATA_DIR, 'psfc')
+    T2_FILEPATH = os.path.join(SNAP_DATA_DIR, 't2')
+
+    # Check if directories exist
+    data_dirs = {
+        'tsk': TSK_FILEPATH,
+        'u10': U10_FILEPATH,
+        'v10': V10_FILEPATH,
+        'seaice': SEAICE_FILEPATH,
+        'psfc': PSFC_FILEPATH,
+        't2': T2_FILEPATH
+    }
+
+    missing_dirs = []
+    for name, path in data_dirs.items():
+        if not os.path.exists(path):
+            missing_dirs.append(f"  {name}: {path}")
+
+    if missing_dirs:
+        print("ERROR: The following SNAP data directories do not exist:", file=sys.stderr)
+        for dir_info in missing_dirs:
+            print(dir_info, file=sys.stderr)
+        print("\nPlease specify the correct path using --snap_data_dir argument.", file=sys.stderr)
+        print("Example: --snap_data_dir /path/to/snap_data", file=sys.stderr)
+        sys.exit(1)
+
+    if not os.path.exists(TRANSECT_FILEPATH):
+        print(f"ERROR: Transect file does not exist: {TRANSECT_FILEPATH}", file=sys.stderr)
+        print("Please specify the correct path using --transect_file argument.", file=sys.stderr)
+        sys.exit(1)
+
+    # Create output directory if it doesn't exist
+    if not os.path.exists(OUTPUT_DIR):
+        print(f"Creating output directory: {OUTPUT_DIR}")
+        os.makedirs(OUTPUT_DIR, exist_ok=True)
+
     # Import Transects and extract points
     transect_points, transects = get_transect_points(TRANSECT_FILEPATH)
 
@@ -415,4 +489,6 @@ def calculate_wind_data(dataframe):
             print(f'Unable to perform wind calculations for year {y}. Please check dataframe for missing data.')
         print(f'{y} Wind data processed. Writing final dataframe')
 
-        df.to_csv(OUTPUT_FILEPATH + f'_{y}.csv')
+        output_filename = f'{OUTPUT_PREFIX}_{y}.csv'
+        output_filepath = os.path.join(OUTPUT_DIR, output_filename)
+        df.to_csv(output_filepath)