From b64b81b7e0a5e8bf574cded14d666d3f9efc5f07 Mon Sep 17 00:00:00 2001 From: Siddharth Gupta Date: Mon, 29 Dec 2025 11:52:55 -0500 Subject: [PATCH 1/2] Refactor: Replace hardcoded paths with argparse and os.path.join for cross-platform support - batchprocess.py: Replace all string concatenation with os.path.join() for cross-platform compatibility - process_snap_data.py: Remove hardcoded absolute paths, add argparse for configurable paths - Add --snap_data_dir, --transect_file, --output_dir, --output_prefix arguments - Default to relative paths (./data/snap_data, etc.) - Add directory existence checks with helpful error messages - Fix path concatenation in read_data() function This improves portability and makes the code work on Windows, macOS, and Linux without modification. --- batchprocess.py | 33 ++++++++------ process_snap_data.py | 106 +++++++++++++++++++++++++++++++++++++------ 2 files changed, 110 insertions(+), 29 deletions(-) diff --git a/batchprocess.py b/batchprocess.py index 877ea30..ee2e20d 100644 --- a/batchprocess.py +++ b/batchprocess.py @@ -14,7 +14,7 @@ # TODO: Add method to determine if files have already been processed to prevent reprocessing # TODO: Add optional flag to reprocess all images files_to_be_processed = [] -for root, subdirs, files in os.walk(rootdir + "/data"): +for root, subdirs, files in os.walk(os.path.join(rootdir, "data")): # skip any files found in the output directory if root.find("output") != -1: continue @@ -22,13 +22,14 @@ if files: for f in files: if f.endswith(".tif") and f.find("_SR_") != -1: - pathname = root + "/" + f + pathname = os.path.join(root, f) files_to_be_processed.append((pathname, f)) # appending (path, filename) # if we have files, check output directory structure if files_to_be_processed: - if not os.path.isdir("data/output"): - os.mkdir("data/output") + output_dir = os.path.join("data", "output") + if not os.path.isdir(output_dir): + os.mkdir(output_dir) file_groups = {} @@ -52,10 +53,12 @@ # process directory structure if not already made - if not os.path.isdir("data/output/{}".format(file_year)): - os.mkdir("data/output/{}".format(file_year)) - if not os.path.isdir("data/output/{}/{}".format(file_year, file_month)): - os.mkdir("data/output/{}/{}".format(file_year, file_month)) + year_dir = os.path.join("data", "output", file_year) + if not os.path.isdir(year_dir): + os.mkdir(year_dir) + month_dir = os.path.join("data", "output", file_year, file_month) + if not os.path.isdir(month_dir): + os.mkdir(month_dir) # outfile_base = f[1][:15] + "_AnalyticMS_SR" @@ -78,19 +81,21 @@ "height": mosaic.shape[1], "width": mosaic.shape[2], "transform": out_transform}) - out_fp = "data/output/{year}/{month}/{year}{month}{day}_AnalyticMS_SR_merged.tif".format(year=year, month=month, day=day) + out_filename = "{year}{month}{day}_AnalyticMS_SR_merged.tif".format(year=year, month=month, day=day) + out_fp = os.path.join("data", "output", year, month, out_filename) with rasterio.open(out_fp, "w", **out_meta) as dst: dst.write(mosaic) # process the merged files -for root, subdirs, files in os.walk(rootdir + "/data/output"): +for root, subdirs, files in os.walk(os.path.join(rootdir, "data", "output")): if files: for f in files: if f.find("NDWI") != -1: continue - pathname = root + "/" + pathname = root outfile_base = f.split(sep=".")[0] - ndwi_outfile = pathname + outfile_base + "_NDWI.tif" - ndwi_class_outfile = pathname + outfile_base + "_NDWI_classified.tif" - ndwi = rt.calculate_ndwi(pathname + f, ndwi_outfile, plot=False) + ndwi_outfile = os.path.join(pathname, outfile_base + "_NDWI.tif") + ndwi_class_outfile = os.path.join(pathname, outfile_base + "_NDWI_classified.tif") + input_file = os.path.join(pathname, f) + ndwi = rt.calculate_ndwi(input_file, ndwi_outfile, plot=False) ndwi_class = rt.ndwi_classify(ndwi_outfile, ndwi_class_outfile, plot=False) diff --git a/process_snap_data.py b/process_snap_data.py index 1c2f06b..530e3c2 100644 --- a/process_snap_data.py +++ b/process_snap_data.py @@ -28,6 +28,8 @@ from datetime import datetime, date, timedelta import time import math +import argparse +import sys # WKT Used to convert Lat/Lon to projection used in snap data CRS_WKT = 'PROJCS["unnamed",GEOGCS["unnamed ellipse",DATUM["unknown",SPHEROID["unnamed",6370000,0]],PRIMEM["Greenwich",' \ @@ -39,19 +41,6 @@ # Center of region in lat/lon REGION_CENTER = (66.0756, -162.7172) -# Filepaths for SNAP data - -TSK_FILEPATH = '/usr/local/coastal/snap_processing/snap_data/tsk' -U10_FILEPATH = '/usr/local/coastal/snap_processing/snap_data/u10' -V10_FILEPATH = '/usr/local/coastal/snap_processing/snap_data/v10' -SEAICE_FILEPATH = '/usr/local/coastal/snap_processing/snap_data/seaice' -PSFC_FILEPATH = '/usr/local/coastal/snap_processing/snap_data/psfc' -T2_FILEPATH = '/usr/local/coastal/snap_processing/snap_data/t2' -TRANSECT_FILEPATH = '/usr/local/coastal/snap_processing/transect_data/WestChukchi_exposed_STepr_rates.shp' - -# Filepath for I/O csv data -OUTPUT_FILEPATH = '/usr/local/coastal/snap_processing/snap_output/SNAP_daily_by_transect' - # Create global variables global transformer_to_snap_proj global transformer_to_lat_lon @@ -112,7 +101,7 @@ def read_data(filepath): if os.path.isdir(filepath): for filename in os.scandir(filepath): try: - data.append(nc.Dataset(filepath + '/' + filename.name)) + data.append(nc.Dataset(os.path.join(filepath, filename.name))) except OSError: pass @@ -288,6 +277,91 @@ def calculate_wind_data(dataframe): if __name__ == '__main__': + # Parse command-line arguments + parser = argparse.ArgumentParser( + description='Process SNAP climate data for coastline extraction', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=''' +Examples: + python process_snap_data.py --snap_data_dir ./data/snap_data --transect_file ./data/transects/WestChukchi_exposed_STepr_rates.shp --output_dir ./data/output + + python process_snap_data.py --snap_data_dir /usr/local/coastal/snap_processing/snap_data --transect_file /usr/local/coastal/snap_processing/transect_data/WestChukchi_exposed_STepr_rates.shp --output_dir /usr/local/coastal/snap_processing/snap_output + ''' + ) + parser.add_argument( + '--snap_data_dir', + type=str, + default='./data/snap_data', + help='Base directory containing SNAP data subdirectories (tsk, u10, v10, seaice, psfc, t2). Default: ./data/snap_data' + ) + parser.add_argument( + '--transect_file', + type=str, + default='./data/transect_data/WestChukchi_exposed_STepr_rates.shp', + help='Path to transect shapefile. Default: ./data/transect_data/WestChukchi_exposed_STepr_rates.shp' + ) + parser.add_argument( + '--output_dir', + type=str, + default='./data/snap_output', + help='Directory for output CSV files. Default: ./data/snap_output' + ) + parser.add_argument( + '--output_prefix', + type=str, + default='SNAP_daily_by_transect', + help='Prefix for output CSV filenames. Default: SNAP_daily_by_transect' + ) + + args = parser.parse_args() + + # Construct filepaths from arguments + SNAP_DATA_DIR = args.snap_data_dir + TRANSECT_FILEPATH = args.transect_file + OUTPUT_DIR = args.output_dir + OUTPUT_PREFIX = args.output_prefix + + # Construct individual data filepaths + TSK_FILEPATH = os.path.join(SNAP_DATA_DIR, 'tsk') + U10_FILEPATH = os.path.join(SNAP_DATA_DIR, 'u10') + V10_FILEPATH = os.path.join(SNAP_DATA_DIR, 'v10') + SEAICE_FILEPATH = os.path.join(SNAP_DATA_DIR, 'seaice') + PSFC_FILEPATH = os.path.join(SNAP_DATA_DIR, 'psfc') + T2_FILEPATH = os.path.join(SNAP_DATA_DIR, 't2') + + # Check if directories exist + data_dirs = { + 'tsk': TSK_FILEPATH, + 'u10': U10_FILEPATH, + 'v10': V10_FILEPATH, + 'seaice': SEAICE_FILEPATH, + 'psfc': PSFC_FILEPATH, + 't2': T2_FILEPATH + } + + missing_dirs = [] + for name, path in data_dirs.items(): + if not os.path.exists(path): + missing_dirs.append(f" {name}: {path}") + + if missing_dirs: + print("ERROR: The following SNAP data directories do not exist:", file=sys.stderr) + for dir_info in missing_dirs: + print(dir_info, file=sys.stderr) + print("\nPlease specify the correct path using --snap_data_dir argument.", file=sys.stderr) + print("Example: --snap_data_dir /path/to/snap_data", file=sys.stderr) + sys.exit(1) + + if not os.path.exists(TRANSECT_FILEPATH): + print(f"ERROR: Transect file does not exist: {TRANSECT_FILEPATH}", file=sys.stderr) + print("Please specify the correct path using --transect_file argument.", file=sys.stderr) + sys.exit(1) + + # Create output directory if it doesn't exist + if not os.path.exists(OUTPUT_DIR): + print(f"Creating output directory: {OUTPUT_DIR}") + os.makedirs(OUTPUT_DIR, exist_ok=True) + # Import Transects and extract points transect_points, transects = get_transect_points(TRANSECT_FILEPATH) @@ -415,4 +489,6 @@ def calculate_wind_data(dataframe): print(f'Unable to perform wind calculations for year {y}. Please check dataframe for missing data.') print(f'{y} Wind data processed. Writing final dataframe') - df.to_csv(OUTPUT_FILEPATH + f'_{y}.csv') + output_filename = f'{OUTPUT_PREFIX}_{y}.csv' + output_filepath = os.path.join(OUTPUT_DIR, output_filename) + df.to_csv(output_filepath) From d39dda6588cd6e78ede76263bb7e53616927b8cc Mon Sep 17 00:00:00 2001 From: Siddharth Gupta Date: Mon, 29 Dec 2025 12:35:21 -0500 Subject: [PATCH 2/2] Refactor: Fix path concatenation issues in data_preprocessing.py - Replace string concatenation with os.path.join() in augment_tiles() - Add argparse for configurable input/output directories - Make make_tiles() accept configurable output_dir parameter - Add directory existence checks with helpful error messages - Add --augment flag to optionally run tile augmentation - Add additional command-line options for tile dimensions and processing This improves cross-platform compatibility and makes the script more flexible. --- data_preprocessing.py | 134 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 116 insertions(+), 18 deletions(-) diff --git a/data_preprocessing.py b/data_preprocessing.py index d51d2ff..c07ab55 100644 --- a/data_preprocessing.py +++ b/data_preprocessing.py @@ -6,9 +6,11 @@ import numpy as np import os import glob +import argparse +import sys # adapted from https://gis.stackexchange.com/questions/285499/how-to-split-multiband-image-into-image-tiles-using-rasterio -def make_tiles(image, tile_height=512, tile_width=512, skip_no_data=False): +def make_tiles(image, tile_height=512, tile_width=512, skip_no_data=False, output_dir="data/tiles"): with rio.open(image) as src: filepath, filename = os.path.split(image) file_base, file_extension = os.path.splitext(filename) @@ -32,7 +34,9 @@ def make_tiles(image, tile_height=512, tile_width=512, skip_no_data=False): if 0 in window_data[..., :-1]: continue out_name = file_base + "_" + str(i + 1).zfill(2) + "-of-" + str(len(tiles)) + file_extension - out_path = os.path.join("data/tiles/", out_name) + out_path = os.path.join(output_dir, out_name) + # Create output directory if it doesn't exist + os.makedirs(output_dir, exist_ok=True) with rio.open(out_path, 'w', **meta) as dst: dst.write(src.read(window=window)) @@ -48,20 +52,26 @@ def _flip_bands(bands): # takes the path to all image tiles and creates tiles that are rotated 90°, 180° and 270° as well as their flipped counterparts # this results in 8 tiles for every input tile (including the input tile) def augment_tiles(tile_path): - files = glob.glob(tile_path + "*.tif") - files = set(files) - set(glob.glob(tile_path + "*rot*")) - files = set(files) - set(glob.glob(tile_path + "*flip*")) + # Normalize tile_path and ensure it's a directory path + tile_path = os.path.normpath(tile_path) + if not os.path.isdir(tile_path): + raise ValueError(f"Tile path is not a valid directory: {tile_path}") + + # Use os.path.join for glob patterns (works cross-platform) + files = glob.glob(os.path.join(tile_path, "*.tif")) + files = set(files) - set(glob.glob(os.path.join(tile_path, "*rot*"))) + files = set(files) - set(glob.glob(os.path.join(tile_path, "*flip*"))) for file in files: filename = os.path.basename(file) file_base, file_extension = os.path.splitext(filename) # generating filepaths for new tiles - path_90 = tile_path + file_base + "_rot90" + file_extension - path_180 = tile_path + file_base + "_rot180" + file_extension - path_270 = tile_path + file_base + "_rot270" + file_extension - path_flip_name = tile_path + file_base + "_flip" + file_extension - path_flip_90 = tile_path + file_base + "_rot90_flip" + file_extension - path_flip_180 = tile_path + file_base + "_rot180_flip" + file_extension - path_flip_270 = tile_path + file_base + "_rot270_flip" + file_extension + path_90 = os.path.join(tile_path, file_base + "_rot90" + file_extension) + path_180 = os.path.join(tile_path, file_base + "_rot180" + file_extension) + path_270 = os.path.join(tile_path, file_base + "_rot270" + file_extension) + path_flip_name = os.path.join(tile_path, file_base + "_flip" + file_extension) + path_flip_90 = os.path.join(tile_path, file_base + "_rot90_flip" + file_extension) + path_flip_180 = os.path.join(tile_path, file_base + "_rot180_flip" + file_extension) + path_flip_270 = os.path.join(tile_path, file_base + "_rot270_flip" + file_extension) with rio.open(file, driver="GTiff") as src: # band_1 = src.read(1) @@ -85,10 +95,98 @@ def augment_tiles(tile_path): # example usage if __name__ == '__main__': - files = glob.glob("data/labeled_inputs/*.tif") - + parser = argparse.ArgumentParser( + description='Preprocess images by creating tiles and augmenting them', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=''' +Examples: + python data_preprocessing.py --input_dir data/labeled_inputs --output_dir data/tiles + + python data_preprocessing.py --input_dir ./images --output_dir ./tiles --augment + ''' + ) + parser.add_argument( + '--input_dir', + type=str, + default='data/labeled_inputs', + help='Directory containing input images to tile. Default: data/labeled_inputs' + ) + parser.add_argument( + '--output_dir', + type=str, + default='data/tiles', + help='Directory for output tiles. Default: data/tiles' + ) + parser.add_argument( + '--augment', + action='store_true', + help='Run tile augmentation after creating tiles' + ) + parser.add_argument( + '--tile_height', + type=int, + default=512, + help='Height of tiles in pixels. Default: 512' + ) + parser.add_argument( + '--tile_width', + type=int, + default=512, + help='Width of tiles in pixels. Default: 512' + ) + parser.add_argument( + '--skip_no_data', + action='store_true', + help='Skip tiles with no data values' + ) + parser.add_argument( + '--max_workers', + type=int, + default=6, + help='Maximum number of worker threads. Default: 6' + ) + + args = parser.parse_args() + + # Check if input directory exists + if not os.path.isdir(args.input_dir): + print(f"ERROR: Input directory does not exist: {args.input_dir}", file=sys.stderr) + print("Please specify a valid directory using --input_dir argument.", file=sys.stderr) + sys.exit(1) + + # Create output directory if it doesn't exist + os.makedirs(args.output_dir, exist_ok=True) + + # Find all .tif files in input directory + input_pattern = os.path.join(args.input_dir, "*.tif") + files = glob.glob(input_pattern) + + if not files: + print(f"No .tif files found in {args.input_dir}", file=sys.stderr) + sys.exit(1) + + print(f"Found {len(files)} files to process") + print(f"Output directory: {args.output_dir}") + from concurrent.futures import ThreadPoolExecutor - with ThreadPoolExecutor(max_workers=6) as p: - p.map(make_tiles, files) - -# augment_tiles("data/tiles/") \ No newline at end of file + + # Create tiles + def process_file(file): + make_tiles( + file, + tile_height=args.tile_height, + tile_width=args.tile_width, + skip_no_data=args.skip_no_data, + output_dir=args.output_dir + ) + + with ThreadPoolExecutor(max_workers=args.max_workers) as p: + p.map(process_file, files) + + print("Tile creation complete!") + + # Augment tiles if requested + if args.augment: + print("Starting tile augmentation...") + augment_tiles(args.output_dir) + print("Tile augmentation complete!") \ No newline at end of file