From 365bbf4155576e5846c280abf5a2daa561a5a753 Mon Sep 17 00:00:00 2001 From: James Halgren Date: Mon, 19 Dec 2022 16:50:50 -0600 Subject: [PATCH 01/11] Add test script to process nwm forcing into ngen input --- ngen_forcing/process_nwm_forcing.py | 68 +++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 ngen_forcing/process_nwm_forcing.py diff --git a/ngen_forcing/process_nwm_forcing.py b/ngen_forcing/process_nwm_forcing.py new file mode 100644 index 0000000..a9e1ec8 --- /dev/null +++ b/ngen_forcing/process_nwm_forcing.py @@ -0,0 +1,68 @@ +# import rioxarray as rxr +import xarray as xr +import geopandas as gpd +from rasterstats import zonal_stats +# import rasterio +import pandas as pd + +import warnings +warnings.simplefilter("ignore") + +# Read forcing files +# Generate List of files + +# TODO: Add looping through lists of forcing files +# consider looking at the "listofnwmfilenames.py" in the data_access_examples repository. +# Integer values for runinput, varinput, etc. are listed at the top of the file +# and an example is given in the `main` function. + +# import listofnwmfilenames +# create_file_list( +# runinput, +# varinput, +# geoinput, +# meminput, +# start_date, +# end_date, +# fcst_cycle, +# ) + +#for file in list_of_files: +fc_nc = ("nwm.t00z.medium_range.forcing.f001.conus.nc") +fc_xds = xr.open_dataset(fc_nc) + +reng = "rasterio" +fc_xds = xr.open_dataset(fc_nc, engine=reng) + +# Read basin boundary file +f_03 = "03w/nextgen_03W.gpkg" + +gpkg_03w_divides = gpd.read_file(f_03, layer="divides") + +list_of_vars = [ + "U2D" , + "V2D" , + "LWDOWN" , + "RAINRATE", + "T2D" , + "Q2D" , + "PSFC" , + "SWDOWN", +] + +df_dict = {} +for _v in list_of_vars: + with xr.open_dataset(fc_nc, engine=reng) as _xds: + _src = _xds[_v] + _aff2 = _src.rio.transform() + _arr2 = _src.values[0] + _df_zonal_stats = pd.DataFrame(zonal_stats(gpkg_03w_divides, _arr2, affine=_aff2)) + + df_dict[_v] = pd.DataFrame(index=_df_zonal_stats.index) + # adding statistics back to original GeoDataFrame + # gdf3 = pd.concat([gpkg_03w_divides, _df_zonal_stats], axis=1) + df_dict[_v][fc_xds.time.values[0]]=_df_zonal_stats["mean"] + +# TODO: Convert the output to CSV with something like +# `gdf3.to_csv` + From 58381b4aa21490034438cecff52a8b7b76616435 Mon Sep 17 00:00:00 2001 From: James Halgren Date: Tue, 20 Dec 2022 12:11:54 -0600 Subject: [PATCH 02/11] rewrite to include file-loop --- ngen_forcing/process_nwm_forcing.py | 52 +++++++++++++++++++---------- 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/ngen_forcing/process_nwm_forcing.py b/ngen_forcing/process_nwm_forcing.py index a9e1ec8..d3a3389 100644 --- a/ngen_forcing/process_nwm_forcing.py +++ b/ngen_forcing/process_nwm_forcing.py @@ -5,6 +5,7 @@ # import rasterio import pandas as pd +from pathlib import Path import warnings warnings.simplefilter("ignore") @@ -27,18 +28,23 @@ # fcst_cycle, # ) -#for file in list_of_files: -fc_nc = ("nwm.t00z.medium_range.forcing.f001.conus.nc") -fc_xds = xr.open_dataset(fc_nc) +""" +A set of test files can be generated downloading these files +wget -P data -c https://storage.googleapis.com/national-water-model/nwm.20220824/forcing_medium_range/nwm.t12z.medium_range.forcing.f001.conus.nc +wget -P data -c https://storage.googleapis.com/national-water-model/nwm.20220824/forcing_medium_range/nwm.t12z.medium_range.forcing.f002.conus.nc +wget -P data -c https://storage.googleapis.com/national-water-model/nwm.20220824/forcing_medium_range/nwm.t12z.medium_range.forcing.f003.conus.nc +""" -reng = "rasterio" -fc_xds = xr.open_dataset(fc_nc, engine=reng) +folder_prefix = Path('data') +list_of_files = [ + "nwm.t12z.medium_range.forcing.f001.conus.nc", + "nwm.t12z.medium_range.forcing.f002.conus.nc", + "nwm.t12z.medium_range.forcing.f003.conus.nc", +] # Read basin boundary file f_03 = "03w/nextgen_03W.gpkg" - -gpkg_03w_divides = gpd.read_file(f_03, layer="divides") - +gpkg_divides = gpd.read_file(f_03, layer="divides") list_of_vars = [ "U2D" , "V2D" , @@ -52,16 +58,26 @@ df_dict = {} for _v in list_of_vars: - with xr.open_dataset(fc_nc, engine=reng) as _xds: - _src = _xds[_v] - _aff2 = _src.rio.transform() - _arr2 = _src.values[0] - _df_zonal_stats = pd.DataFrame(zonal_stats(gpkg_03w_divides, _arr2, affine=_aff2)) - - df_dict[_v] = pd.DataFrame(index=_df_zonal_stats.index) - # adding statistics back to original GeoDataFrame - # gdf3 = pd.concat([gpkg_03w_divides, _df_zonal_stats], axis=1) - df_dict[_v][fc_xds.time.values[0]]=_df_zonal_stats["mean"] + df_dict[_v] = pd.DataFrame(index=gpkg_divides.index) + +reng = "rasterio" +sum_stat = "mean" + +for _nc_file in list_of_files: + # _nc_file = ("nwm.t00z.medium_range.forcing.f001.conus.nc") + _full_nc_file = (folder_prefix.joinpath(_nc_file)) + + with xr.open_dataset(_full_nc_file, engine=reng) as _xds: + for _v in list_of_vars: + _src = _xds[_v] + _aff2 = _src.rio.transform() + _arr2 = _src.values[0] + + breakpoint() + _df_zonal_stats = pd.DataFrame(zonal_stats(gpkg_divides, _arr2, affine=_aff2)) + # if adding statistics back to original GeoDataFrame + # gdf3 = pd.concat([gpkg_divides, _df_zonal_stats], axis=1) + df_dict[_v][_xds.time.values[0]]=_df_zonal_stats[sum_stat] # TODO: Convert the output to CSV with something like # `gdf3.to_csv` From b3c024d83bdca8a9bc06f611f91c89506f3beef3 Mon Sep 17 00:00:00 2001 From: James Halgren Date: Tue, 20 Dec 2022 12:16:08 -0600 Subject: [PATCH 03/11] include gpkg download in example --- ngen_forcing/process_nwm_forcing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ngen_forcing/process_nwm_forcing.py b/ngen_forcing/process_nwm_forcing.py index d3a3389..d7d86ae 100644 --- a/ngen_forcing/process_nwm_forcing.py +++ b/ngen_forcing/process_nwm_forcing.py @@ -33,6 +33,7 @@ wget -P data -c https://storage.googleapis.com/national-water-model/nwm.20220824/forcing_medium_range/nwm.t12z.medium_range.forcing.f001.conus.nc wget -P data -c https://storage.googleapis.com/national-water-model/nwm.20220824/forcing_medium_range/nwm.t12z.medium_range.forcing.f002.conus.nc wget -P data -c https://storage.googleapis.com/national-water-model/nwm.20220824/forcing_medium_range/nwm.t12z.medium_range.forcing.f003.conus.nc +wget -P 03w -c https://nextgen-hydrofabric.s3.amazonaws.com/v1.2/nextgen_03W.gpkg """ folder_prefix = Path('data') From 9c35af78e6eb898fe195902aba3927e0deae7709 Mon Sep 17 00:00:00 2001 From: James Halgren Date: Thu, 22 Dec 2022 12:17:22 -0600 Subject: [PATCH 04/11] add optimized loop and port to functions --- .../TestConvert_NWMForcing_to_Ngen.py | 178 ++++++++++++++++++ ngen_forcing/defs.py | 18 ++ ngen_forcing/process_nwm_forcing.py | 167 +++++++++++----- 3 files changed, 319 insertions(+), 44 deletions(-) create mode 100644 ngen_forcing/TestConvert_NWMForcing_to_Ngen.py create mode 100644 ngen_forcing/defs.py diff --git a/ngen_forcing/TestConvert_NWMForcing_to_Ngen.py b/ngen_forcing/TestConvert_NWMForcing_to_Ngen.py new file mode 100644 index 0000000..8f4158f --- /dev/null +++ b/ngen_forcing/TestConvert_NWMForcing_to_Ngen.py @@ -0,0 +1,178 @@ +from defs import xr_read_window, polymask +from rasterio import _io, windows +import concurrent.futures +import xarray as xr + + +class MemoryDataset(_io.MemoryDataset, windows.WindowMethodsMixin): + pass + + +def junk(): + + flist = gpkg_divides.geometry.to_list() + polys = flist + + +def get_forcing_dict_newway( + feature_list, + folder_prefix, + file_list, +): + + reng = "rasterio" + _xds = xr.open_dataset(folder_prefix.joinpath(file_list[0]), engine=reng) + _template_arr = _xds.U2D.values + _u2d = MemoryDataset( + _template_arr, + transform=_xds.U2D.rio.transform(), + gcps=None, + rpcs=None, + crs=None, + copy=False, + ) + + filehandles = [xr.open_dataset("data/" + f) for f in file_list] + stats = [] + for i, m in enumerate(map(polymask(_u2d), feature_list)): + print(f"{i}, {round(i/len(feature_list), 5)*100}".ljust(40), end="\r") + mask, _, window = m + mask = xr.DataArray(mask, dims=["y", "x"]) + winslices = dict(zip(["y", "x"], window.toslices())) + for f in filehandles: + cropped = xr_read_window(f, winslices, mask=mask) + stats.append(cropped.mean()) + [f.close() for f in filehandles] # Returns None for each file + + return stats + + +def get_forcing_dict_newway_parallel( + feature_list, + folder_prefix, + file_list, + ): + + reng = "rasterio" + _xds = xr.open_dataset(folder_prefix.joinpath(file_list[0]), engine=reng) + _template_arr = _xds.U2D.values + + _u2d = MemoryDataset( + _template_arr, + transform=_xds.U2D.rio.transform(), + gcps=None, + rpcs=None, + crs=None, + copy=False, + ) + filehandles = [xr.open_dataset("data/" + f) for f in file_list] + + with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor: + # with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: + stats = [] + future_list = [] + + for i, m in enumerate(map(polymask(_u2d), feature_list)): + print(f"{i}, {round(i/len(feature_list), 5)*100}".ljust(40), end="\r") + mask, _, window = m + mask = xr.DataArray(mask, dims=["y", "x"]) + winslices = dict(zip(["y", "x"], window.toslices())) + for f in filehandles: + future = executor.submit(xr_read_window, f, winslices, mask=mask) + # cropped = xr_read_window(f, winslices, mask=mask) + # stats.append(cropped.mean()) + future_list.append(future) + for _f in concurrent.futures.as_completed(future_list): + stats.append(_f.result().mean()) + + [f.close() for f in filehandles] + return stats + + +def get_forcing_dict_newway_inverted( + feature_list, + folder_prefix, + file_list, +): + + reng = "rasterio" + _xds = xr.open_dataset(folder_prefix.joinpath(file_list[0]), engine=reng) + _template_arr = _xds.U2D.values + _u2d = MemoryDataset( + _template_arr, + transform=_xds.U2D.rio.transform(), + gcps=None, + rpcs=None, + crs=None, + copy=False, + ) + + filehandles = [xr.open_dataset("data/" + f) for f in file_list] + stats = [] + future_list = [] + mask_win_list = [] + + for i, m in enumerate(map(polymask(_u2d), feature_list)): + print(f"{i}, {round(i/len(feature_list), 5)*100}".ljust(40), end="\r") + mask, _, window = m + mask = xr.DataArray(mask, dims=["y", "x"]) + winslices = dict(zip(["y", "x"], window.toslices())) + mask_win_list.append((mask, winslices)) + + for f in filehandles: + print(f"{i}, {round(i/len(file_list), 2)*100}".ljust(40), end="\r") + for _m, _w in mask_win_list: + cropped = xr_read_window(f, _w, mask=_m) + stats.append(cropped.mean()) + + [f.close() for f in filehandles] + return stats + + +def get_forcing_dict_newway_inverted_parallel( + feature_list, + folder_prefix, + file_list, + ): + + import concurrent.futures + + reng = "rasterio" + _xds = xr.open_dataset(folder_prefix.joinpath(file_list[0]), engine=reng) + _template_arr = _xds.U2D.values + _u2d = MemoryDataset( + _template_arr, + transform=_xds.U2D.rio.transform(), + gcps=None, + rpcs=None, + crs=None, + copy=False, + ) + + mask_win_list = [] + for i, m in enumerate(map(polymask(_u2d), feature_list)): + print(f"{i}, {round(i/len(feature_list), 5)*100}".ljust(40), end="\r") + mask, _, window = m + mask = xr.DataArray(mask, dims=["y", "x"]) + winslices = dict(zip(["y", "x"], window.toslices())) + mask_win_list.append((mask, winslices)) + + filehandles = [xr.open_dataset("data/" + f) for f in file_list] + stats = [] + future_list = [] + + with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor: + # with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: + + for f in filehandles: + print(f"{i}, {round(i/len(file_list), 2)*100}".ljust(40), end="\r") + for _m, _w in mask_win_list: + future = executor.submit(xr_read_window, f, _w, mask=_m) + # cropped = xr_read_window(f, _w, mask=_m) + # stats.append(cropped.mean()) + future_list.append(future) + for _f in concurrent.futures.as_completed(future_list): + stats.append(_f.result().mean()) + + [f.close() for f in filehandles] + return stats diff --git a/ngen_forcing/defs.py b/ngen_forcing/defs.py new file mode 100644 index 0000000..58b2c1d --- /dev/null +++ b/ngen_forcing/defs.py @@ -0,0 +1,18 @@ +import rasterio.mask as riomask + + +def polymask(dataset, invert=False, all_touched=False): + def _polymask(poly): + return riomask.raster_geometry_mask( + dataset, [poly], invert=invert, all_touched=all_touched, crop=True + ) + + return _polymask + + +def xr_read_window(ds, window, mask=None): + data = ds.isel(window) + if mask is None: + return data + else: + return data.where(mask) diff --git a/ngen_forcing/process_nwm_forcing.py b/ngen_forcing/process_nwm_forcing.py index d7d86ae..20a2300 100644 --- a/ngen_forcing/process_nwm_forcing.py +++ b/ngen_forcing/process_nwm_forcing.py @@ -2,11 +2,22 @@ import xarray as xr import geopandas as gpd from rasterstats import zonal_stats + # import rasterio import pandas as pd +import time + +from TestConvert_NWMForcing_to_Ngen import ( + get_forcing_dict_newway, + get_forcing_dict_newway_parallel, + get_forcing_dict_newway_inverted, + get_forcing_dict_newway_inverted_parallel, +) + from pathlib import Path import warnings + warnings.simplefilter("ignore") # Read forcing files @@ -15,7 +26,7 @@ # TODO: Add looping through lists of forcing files # consider looking at the "listofnwmfilenames.py" in the data_access_examples repository. # Integer values for runinput, varinput, etc. are listed at the top of the file -# and an example is given in the `main` function. +# and an example is given in the `main` function. # import listofnwmfilenames # create_file_list( @@ -36,50 +47,118 @@ wget -P 03w -c https://nextgen-hydrofabric.s3.amazonaws.com/v1.2/nextgen_03W.gpkg """ -folder_prefix = Path('data') -list_of_files = [ - "nwm.t12z.medium_range.forcing.f001.conus.nc", - "nwm.t12z.medium_range.forcing.f002.conus.nc", - "nwm.t12z.medium_range.forcing.f003.conus.nc", -] - -# Read basin boundary file -f_03 = "03w/nextgen_03W.gpkg" -gpkg_divides = gpd.read_file(f_03, layer="divides") -list_of_vars = [ - "U2D" , - "V2D" , - "LWDOWN" , - "RAINRATE", - "T2D" , - "Q2D" , - "PSFC" , - "SWDOWN", -] - -df_dict = {} -for _v in list_of_vars: - df_dict[_v] = pd.DataFrame(index=gpkg_divides.index) - -reng = "rasterio" -sum_stat = "mean" - -for _nc_file in list_of_files: - # _nc_file = ("nwm.t00z.medium_range.forcing.f001.conus.nc") - _full_nc_file = (folder_prefix.joinpath(_nc_file)) - - with xr.open_dataset(_full_nc_file, engine=reng) as _xds: - for _v in list_of_vars: - _src = _xds[_v] - _aff2 = _src.rio.transform() - _arr2 = _src.values[0] - - breakpoint() - _df_zonal_stats = pd.DataFrame(zonal_stats(gpkg_divides, _arr2, affine=_aff2)) - # if adding statistics back to original GeoDataFrame - # gdf3 = pd.concat([gpkg_divides, _df_zonal_stats], axis=1) - df_dict[_v][_xds.time.values[0]]=_df_zonal_stats[sum_stat] + +def get_forcing_dict( + gpkg_divides, + folder_prefix, + filelist, + var_list, +): + reng = "rasterio" + sum_stat = "mean" + + df_dict = {} + for _v in var_list: + df_dict[_v] = pd.DataFrame(index=gpkg_divides.index) + + for _nc_file in filelist: + # _nc_file = ("nwm.t00z.medium_range.forcing.f001.conus.nc") + _full_nc_file = folder_prefix.joinpath(_nc_file) + + with xr.open_dataset(_full_nc_file, engine=reng) as _xds: + for _v in var_list: + _src = _xds[_v] + _aff2 = _src.rio.transform() + _arr2 = _src.values[0] + + _df_zonal_stats = pd.DataFrame( + zonal_stats(gpkg_divides, _arr2, affine=_aff2) + ) + # if adding statistics back to original GeoDataFrame + # gdf3 = pd.concat([gpkg_divides, _df_zonal_stats], axis=1) + df_dict[_v][_xds.time.values[0]] = _df_zonal_stats[sum_stat] + + return df_dict + # TODO: Convert the output to CSV with something like # `gdf3.to_csv` + +def main(): + + folder_prefix = Path("data") + list_of_files = [ + f"nwm.t12z.medium_range.forcing.f{_r:03}.conus.nc" for _r in range(1, 241) + ] + + # Read basin boundary file + f_03 = "03w/nextgen_03W.gpkg" + gpkg_divides = gpd.read_file(f_03, layer="divides") + var_list = [ + "U2D", + "V2D", + "LWDOWN", + "RAINRATE", + "T2D", + "Q2D", + "PSFC", + "SWDOWN", + ] + + file_list = list_of_files[0:30] + gpkg_subset = gpkg_divides[0:2000] + #file_list = list_of_files[0:3] + #gpkg_subset = gpkg_divides[0:20] + feature_list = gpkg_subset.geometry.to_list() + + # start_time = time.time() + # print(f"Working on the original way") + # forcing_dict = get_forcing_dict( + # gpkg_subset, + # folder_prefix, + # file_list, + # var_list, + # ) + # print(time.time() - start_time) + + start_time = time.time() + print(f"Working on the new way") + fd2 = get_forcing_dict_newway( + feature_list, + folder_prefix, + file_list, + ) + print(time.time() - start_time) + + start_time = time.time() + print(f"Working on the new way with threading. (It's not much better.)") + fd3 = get_forcing_dict_newway_parallel( + feature_list, + folder_prefix, + file_list, + ) + print(time.time() - start_time) + + + start_time = time.time() + print(f"Working on the new way with loops reversed.") + fd4 = get_forcing_dict_newway_inverted( + feature_list, + folder_prefix, + file_list, + ) + print(time.time() - start_time) + + start_time = time.time() + print(f"Working on the new way with loops reversed with threading.") + fd4 = get_forcing_dict_newway_inverted_parallel( + feature_list, + folder_prefix, + file_list, + ) + print(time.time() - start_time) + + +if __name__ == "__main__": + main() From e90c81b54b53811d66ae6691672fea3dc0afd3b3 Mon Sep 17 00:00:00 2001 From: James Halgren Date: Thu, 22 Dec 2022 12:51:18 -0600 Subject: [PATCH 05/11] add parallel options --- .../TestConvert_NWMForcing_to_Ngen.py | 40 +++++++++++++------ ngen_forcing/process_nwm_forcing.py | 39 ++++++++++++++---- 2 files changed, 60 insertions(+), 19 deletions(-) diff --git a/ngen_forcing/TestConvert_NWMForcing_to_Ngen.py b/ngen_forcing/TestConvert_NWMForcing_to_Ngen.py index 8f4158f..2a34f9a 100644 --- a/ngen_forcing/TestConvert_NWMForcing_to_Ngen.py +++ b/ngen_forcing/TestConvert_NWMForcing_to_Ngen.py @@ -48,10 +48,12 @@ def get_forcing_dict_newway( def get_forcing_dict_newway_parallel( - feature_list, - folder_prefix, - file_list, - ): + feature_list, + folder_prefix, + file_list, + para="thread", + para_n=2, +): reng = "rasterio" _xds = xr.open_dataset(folder_prefix.joinpath(file_list[0]), engine=reng) @@ -67,8 +69,14 @@ def get_forcing_dict_newway_parallel( ) filehandles = [xr.open_dataset("data/" + f) for f in file_list] - with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor: - # with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: + if para == "process": + pool = concurrent.futures.ProcessPoolExecutor + elif para == "thread": + pool = concurrent.futures.ThreadPoolExecutor + else: + pool = concurrent.futures.ThreadPoolExecutor + + with pool(max_workers=para_n) as executor: stats = [] future_list = [] @@ -130,10 +138,12 @@ def get_forcing_dict_newway_inverted( def get_forcing_dict_newway_inverted_parallel( - feature_list, - folder_prefix, - file_list, - ): + feature_list, + folder_prefix, + file_list, + para="thread", + para_n=2, +): import concurrent.futures @@ -161,8 +171,14 @@ def get_forcing_dict_newway_inverted_parallel( stats = [] future_list = [] - with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor: - # with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: + if para == "process": + pool = concurrent.futures.ProcessPoolExecutor + elif para == "thread": + pool = concurrent.futures.ThreadPoolExecutor + else: + pool = concurrent.futures.ThreadPoolExecutor + + with pool(max_workers=para_n) as executor: for f in filehandles: print(f"{i}, {round(i/len(file_list), 2)*100}".ljust(40), end="\r") diff --git a/ngen_forcing/process_nwm_forcing.py b/ngen_forcing/process_nwm_forcing.py index 20a2300..5e1430a 100644 --- a/ngen_forcing/process_nwm_forcing.py +++ b/ngen_forcing/process_nwm_forcing.py @@ -106,10 +106,10 @@ def main(): "SWDOWN", ] - file_list = list_of_files[0:30] - gpkg_subset = gpkg_divides[0:2000] - #file_list = list_of_files[0:3] - #gpkg_subset = gpkg_divides[0:20] + # file_list = list_of_files[0:30] + # gpkg_subset = gpkg_divides[0:2000] + file_list = list_of_files[0:3] + gpkg_subset = gpkg_divides[0:200] feature_list = gpkg_subset.geometry.to_list() # start_time = time.time() @@ -132,14 +132,26 @@ def main(): print(time.time() - start_time) start_time = time.time() - print(f"Working on the new way with threading. (It's not much better.)") + print(f"Working on the new way with threading parallel.") fd3 = get_forcing_dict_newway_parallel( feature_list, folder_prefix, file_list, - ) + para="thread", + para_n=16, + ) print(time.time() - start_time) + start_time = time.time() + print(f"Working on the new way with process parallel.") + fd3 = get_forcing_dict_newway_parallel( + feature_list, + folder_prefix, + file_list, + para="process", + para_n=16, + ) + print(time.time() - start_time) start_time = time.time() print(f"Working on the new way with loops reversed.") @@ -151,11 +163,24 @@ def main(): print(time.time() - start_time) start_time = time.time() - print(f"Working on the new way with loops reversed with threading.") + print(f"Working on the new way with loops reversed with threading parallel.") + fd4 = get_forcing_dict_newway_inverted_parallel( + feature_list, + folder_prefix, + file_list, + para="thread", + para_n=16, + ) + print(time.time() - start_time) + + start_time = time.time() + print(f"Working on the new way with loops reversed with process parallel.") fd4 = get_forcing_dict_newway_inverted_parallel( feature_list, folder_prefix, file_list, + para="process", + para_n=16, ) print(time.time() - start_time) From d1cf4282388f877d117ffc150740934eb51eaa35 Mon Sep 17 00:00:00 2001 From: James Halgren Date: Mon, 2 Jan 2023 14:45:09 -0600 Subject: [PATCH 06/11] explain commented test block --- ngen_forcing/process_nwm_forcing.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ngen_forcing/process_nwm_forcing.py b/ngen_forcing/process_nwm_forcing.py index 5e1430a..2232691 100644 --- a/ngen_forcing/process_nwm_forcing.py +++ b/ngen_forcing/process_nwm_forcing.py @@ -112,6 +112,8 @@ def main(): gpkg_subset = gpkg_divides[0:200] feature_list = gpkg_subset.geometry.to_list() + # This way is extremely slow for anything more than a + # few files, so we comment it out of the test # start_time = time.time() # print(f"Working on the original way") # forcing_dict = get_forcing_dict( From 1f2c9906233d45aeefc43101a934915cfd9ba756 Mon Sep 17 00:00:00 2001 From: James Halgren Date: Mon, 2 Jan 2023 14:57:45 -0600 Subject: [PATCH 07/11] better file names for process script --- ...ert_NWMForcing_to_Ngen.py => process_nwm_forcing_to_ngen.py} | 0 ...ocess_nwm_forcing.py => test_process_nwm_forcing_to_ngen.py} | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename ngen_forcing/{TestConvert_NWMForcing_to_Ngen.py => process_nwm_forcing_to_ngen.py} (100%) rename ngen_forcing/{process_nwm_forcing.py => test_process_nwm_forcing_to_ngen.py} (99%) diff --git a/ngen_forcing/TestConvert_NWMForcing_to_Ngen.py b/ngen_forcing/process_nwm_forcing_to_ngen.py similarity index 100% rename from ngen_forcing/TestConvert_NWMForcing_to_Ngen.py rename to ngen_forcing/process_nwm_forcing_to_ngen.py diff --git a/ngen_forcing/process_nwm_forcing.py b/ngen_forcing/test_process_nwm_forcing_to_ngen.py similarity index 99% rename from ngen_forcing/process_nwm_forcing.py rename to ngen_forcing/test_process_nwm_forcing_to_ngen.py index 2232691..5850bfb 100644 --- a/ngen_forcing/process_nwm_forcing.py +++ b/ngen_forcing/test_process_nwm_forcing_to_ngen.py @@ -8,7 +8,7 @@ import time -from TestConvert_NWMForcing_to_Ngen import ( +from process_nwm_forcing_to_ngen import ( get_forcing_dict_newway, get_forcing_dict_newway_parallel, get_forcing_dict_newway_inverted, From 1039630130925577a8b857db1f8c2959b758c5a6 Mon Sep 17 00:00:00 2001 From: James Halgren Date: Fri, 27 Jan 2023 13:48:06 -0600 Subject: [PATCH 08/11] turn on slow test in committed file --- .../test_process_nwm_forcing_to_ngen.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/ngen_forcing/test_process_nwm_forcing_to_ngen.py b/ngen_forcing/test_process_nwm_forcing_to_ngen.py index 5850bfb..a7d1404 100644 --- a/ngen_forcing/test_process_nwm_forcing_to_ngen.py +++ b/ngen_forcing/test_process_nwm_forcing_to_ngen.py @@ -114,15 +114,15 @@ def main(): # This way is extremely slow for anything more than a # few files, so we comment it out of the test - # start_time = time.time() - # print(f"Working on the original way") - # forcing_dict = get_forcing_dict( - # gpkg_subset, - # folder_prefix, - # file_list, - # var_list, - # ) - # print(time.time() - start_time) + start_time = time.time() + print(f"Working on the old (slow) way") + fd1 = get_forcing_dict( + gpkg_subset, + folder_prefix, + file_list, + var_list, + ) + print(time.time() - start_time) start_time = time.time() print(f"Working on the new way") From 4ad394c88382dc919de8ce7401587d7d0ff6eeee Mon Sep 17 00:00:00 2001 From: James Halgren Date: Mon, 30 Jan 2023 11:55:36 -0600 Subject: [PATCH 09/11] hold file handles for slow method --- ngen_forcing/test_process_nwm_forcing_to_ngen.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/ngen_forcing/test_process_nwm_forcing_to_ngen.py b/ngen_forcing/test_process_nwm_forcing_to_ngen.py index a7d1404..5fe24a3 100644 --- a/ngen_forcing/test_process_nwm_forcing_to_ngen.py +++ b/ngen_forcing/test_process_nwm_forcing_to_ngen.py @@ -61,11 +61,16 @@ def get_forcing_dict( for _v in var_list: df_dict[_v] = pd.DataFrame(index=gpkg_divides.index) + ds_list = [] for _nc_file in filelist: # _nc_file = ("nwm.t00z.medium_range.forcing.f001.conus.nc") _full_nc_file = folder_prefix.joinpath(_nc_file) + ds_list.append(xr.open_dataset(_full_nc_file, engine=reng)) - with xr.open_dataset(_full_nc_file, engine=reng) as _xds: + for _i, _nc_file in enumerate(filelist): + _xds = ds_list[_i] + print(f"{_i}, {round(_i/len(filelist), 5)*100}".ljust(40), end="\r") + if 1 == 1: for _v in var_list: _src = _xds[_v] _aff2 = _src.rio.transform() @@ -78,6 +83,8 @@ def get_forcing_dict( # gdf3 = pd.concat([gpkg_divides, _df_zonal_stats], axis=1) df_dict[_v][_xds.time.values[0]] = _df_zonal_stats[sum_stat] + [_xds.close() for _xds in ds_list] + return df_dict From 180a81720daa2b237126436fc2df66ce15bda744 Mon Sep 17 00:00:00 2001 From: James Halgren Date: Wed, 8 Feb 2023 16:21:37 -0600 Subject: [PATCH 10/11] add notebook with 240hour forcing buildup --- .../VERYROUGH_RTI_Forcing_example.ipynb | 1021 +++++++++++++++++ 1 file changed, 1021 insertions(+) create mode 100644 ngen_forcing/VERYROUGH_RTI_Forcing_example.ipynb diff --git a/ngen_forcing/VERYROUGH_RTI_Forcing_example.ipynb b/ngen_forcing/VERYROUGH_RTI_Forcing_example.ipynb new file mode 100644 index 0000000..2ff823e --- /dev/null +++ b/ngen_forcing/VERYROUGH_RTI_Forcing_example.ipynb @@ -0,0 +1,1021 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 124, + "id": "c4b19730-d455-418f-a3a3-15efec3a8387", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: geoparquet in /Users/jshalgren/pyenv310/lib/python3.10/site-packages (0.0.3)\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.0\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" + ] + } + ], + "source": [ + "# !pip install config\n", + "!pip install geoparquet \n", + "import geoparquet as gpq\n", + "import pickle\n", + "# import config\n", + "import os\n", + "import gc\n", + "\n", + "import geopandas as gpd\n", + "import pandas as pd\n", + "import numpy as np\n", + "import xarray as xr\n", + "import rasterio as rio\n", + "\n", + "from rasterio.io import MemoryFile\n", + "\n", + "from rasterio.features import rasterize\n", + "# from utils import parquet_to_gdf\n", + "# from Grid_to_parquet import get_dataset\n", + "\n", + "TEMPLATE_BLOB_NAME = \"nwm.20221001/forcing_medium_range/nwm.t00z.medium_range.forcing.f001.conus.nc\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "id": "e9159453-76c6-4bbe-b08f-a746a54f48f5", + "metadata": {}, + "outputs": [], + "source": [ + "NWM_BUCKET = \"national-water-model\"\n", + "\n", + "# WKT strings extracted from NWM grids\n", + "CONUS_NWM_WKT = 'PROJCS[\"Lambert_Conformal_Conic\",GEOGCS[\"GCS_Sphere\",DATUM[\"D_Sphere\",SPHEROID[\"Sphere\",6370000.0,0.0]], \\\n", + "PRIMEM[\"Greenwich\",0.0],UNIT[\"Degree\",0.0174532925199433]],PROJECTION[\"Lambert_Conformal_Conic_2SP\"],PARAMETER[\"false_easting\",0.0],\\\n", + "PARAMETER[\"false_northing\",0.0],PARAMETER[\"central_meridian\",-97.0],PARAMETER[\"standard_parallel_1\",30.0],\\\n", + "PARAMETER[\"standard_parallel_2\",60.0],PARAMETER[\"latitude_of_origin\",40.0],UNIT[\"Meter\",1.0]]'\n", + "\n", + "HI_NWM_WKT = 'PROJCS[\"Lambert_Conformal_Conic\",GEOGCS[\"GCS_Sphere\",DATUM[\"D_Sphere\",SPHEROID[\"Sphere\",6370000.0,0.0]],\\\n", + "PRIMEM[\"Greenwich\",0.0],UNIT[\"Degree\",0.0174532925199433]],PROJECTION[\"Lambert_Conformal_Conic_2SP\"],PARAMETER[\"false_easting\",0.0],\\\n", + "PARAMETER[\"false_northing\",0.0],PARAMETER[\"central_meridian\",-157.42],PARAMETER[\"standard_parallel_1\",10.0],\\\n", + "PARAMETER[\"standard_parallel_2\",30.0],PARAMETER[\"latitude_of_origin\",20.6],UNIT[\"Meter\",1.0]]'\n", + "\n", + "PR_NWM_WKT = 'PROJCS[\"Sphere_Lambert_Conformal_Conic\",GEOGCS[\"GCS_Sphere\",DATUM[\"D_Sphere\",SPHEROID[\"Sphere\",6370000.0,0.0]],\\\n", + "PRIMEM[\"Greenwich\",0.0],UNIT[\"Degree\",0.0174532925199433]],PROJECTION[\"Lambert_Conformal_Conic_2SP\"],PARAMETER[\"false_easting\",0.0],\\\n", + "PARAMETER[\"false_northing\",0.0],PARAMETER[\"central_meridian\",-65.91],PARAMETER[\"standard_parallel_1\",18.1],\\\n", + "PARAMETER[\"standard_parallel_2\",18.1],PARAMETER[\"latitude_of_origin\",18.1],UNIT[\"Meter\",1.0]]'" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "eaba8b05-2b98-4aea-974f-d26de67f729b", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from pathlib import Path\n", + "\n", + "\n", + "CACHE_DIR = Path(\".\", \"data_3\", \"cache\")\n", + "NWM_CACHE_DIR = os.path.join(CACHE_DIR, \"nwm\")\n", + "USGS_CACHE_DIR = os.path.join(CACHE_DIR, \"usgs\")\n", + "GEO_CACHE_DIR = os.path.join(CACHE_DIR, \"geo\")\n", + "\n", + "NWM_CACHE_H5 = os.path.join(NWM_CACHE_DIR, \"gcp_client.h5\")\n", + "\n", + "PARQUET_CACHE_DIR = os.path.join(CACHE_DIR, \"parquet\")\n", + "MEDIUM_RANGE_FORCING_PARQUET = os.path.join(PARQUET_CACHE_DIR, \"forcing_medium_range\")\n", + "FORCING_ANALYSIS_ASSIM_PARQUET = os.path.join(PARQUET_CACHE_DIR, \"forcing_analysis_assim\")\n", + "MEDIUM_RANGE_PARQUET = os.path.join(PARQUET_CACHE_DIR, \"medium_range\")\n", + "USGS_PARQUET = os.path.join(PARQUET_CACHE_DIR, \"usgs\")\n", + "\n", + "HUC10_SHP_FILEPATH = os.path.join(GEO_CACHE_DIR, \"wbdhu10_conus.shp\")\n", + "HUC10_PARQUET_FILEPATH = os.path.join(GEO_CACHE_DIR, \"wbdhu10_conus.parquet\")\n", + "HUC10_MEDIUM_RANGE_WEIGHTS_FILEPATH = os.path.join(GEO_CACHE_DIR, \"wbdhu10_medium_range_weights.pkl\")\n", + "\n", + "ROUTE_LINK_FILE = os.path.join(NWM_CACHE_DIR, \"RouteLink_CONUS.nc\")\n", + "ROUTE_LINK_PARQUET = os.path.join(NWM_CACHE_DIR, \"route_link_conus.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "82731a4e-70c8-4cc8-a6e1-2de5ff14a947", + "metadata": {}, + "outputs": [], + "source": [ + "def parquet_to_gdf(parquet_filepath: str) -> gpd.GeoDataFrame:\n", + " gdf = gpd.read_parquet(parquet_filepath)\n", + " return gdf\n", + "\n", + "def get_cache_dir(create: bool = True):\n", + " if not os.path.exists(NWM_CACHE_DIR) and create:\n", + " os.mkdir(NWM_CACHE_DIR)\n", + " if not os.path.exists(NWM_CACHE_DIR):\n", + " raise NotADirectoryError\n", + " return NWM_CACHE_DIR\n", + "\n", + "\n", + "def make_parent_dir(filepath):\n", + " Path(filepath).parent.mkdir(parents=True, exist_ok=True)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "74895a20-a42e-48ff-bff8-313a0ae26d14", + "metadata": {}, + "outputs": [], + "source": [ + "def get_dataset(\n", + " blob_name: str,\n", + " use_cache: bool = True\n", + ") -> xr.Dataset:\n", + " \"\"\"Retrieve a blob from the data service as xarray.Dataset.\n", + " Based largely on OWP HydroTools.\n", + " Parameters\n", + " ----------\n", + " blob_name: str, required\n", + " Name of blob to retrieve.\n", + " use_cacahe: bool, default True\n", + " If cache should be used. \n", + " If True, checks to see if file is in cache, and \n", + " if fetched from remote will save to cache.\n", + " Returns\n", + " -------\n", + " ds : xarray.Dataset\n", + " The data stored in the blob.\n", + " \"\"\"\n", + " nc_filepath = os.path.join(get_cache_dir(), blob_name)\n", + " make_parent_dir(nc_filepath)\n", + "\n", + " # If the file exists and use_cache = True\n", + " if os.path.exists(nc_filepath) and use_cache:\n", + " # Get dataset from cache\n", + " ds = xr.load_dataset(\n", + " nc_filepath,\n", + " engine='h5netcdf',\n", + " )\n", + " return ds\n", + " else:\n", + " # Get raw bytes\n", + " raw_bytes = get_blob(blob_name)\n", + " # Create Dataset\n", + " ds = xr.load_dataset(\n", + " MemoryFile(raw_bytes),\n", + " engine='h5netcdf',\n", + " )\n", + " if use_cache:\n", + " # Subset and cache\n", + " ds[\"RAINRATE\"].to_netcdf(\n", + " nc_filepath,\n", + " engine='h5netcdf',\n", + " )\n", + " return ds\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "b199b72a-0220-4583-ac3a-9f27c1586fe1", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "def generate_weights_file(\n", + " gdf: gpd.GeoDataFrame,\n", + " src: xr.DataArray,\n", + " weights_filepath: str,\n", + " crosswalk_dict_key: str, \n", + "):\n", + " \"\"\"Generate a weights file.\"\"\"\n", + " \n", + " gdf_proj = gdf.to_crs(CONUS_NWM_WKT)\n", + " \n", + " crosswalk_dict = {}\n", + " \n", + " # This is a probably a really poor performing way to do this \n", + " for index, row in gdf_proj.iterrows():\n", + " geom_rasterize = rasterize([(row[\"geometry\"], 1)],\n", + " out_shape=src.rio.shape,\n", + " transform=src.rio.transform(),\n", + " all_touched=True,\n", + " fill=0,\n", + " dtype='uint8')\n", + " if crosswalk_dict_key:\n", + " crosswalk_dict[row[crosswalk_dict_key]\n", + " ] = np.where(geom_rasterize == 1)\n", + " else:\n", + " crosswalk_dict[index] = np.where(geom_rasterize == 1)\n", + "\n", + " with open(weights_filepath, 'wb') as f:\n", + " pickle.dump(crosswalk_dict, f)\n", + "\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "dec7dd6a-483b-4401-9690-4fb213d2fc85", + "metadata": {}, + "outputs": [], + "source": [ + "def add_zonalstats_to_gdf_weights(\n", + " gdf: gpd.GeoDataFrame,\n", + " src: xr.DataArray,\n", + " weights_filepath: str,\n", + ") -> gpd.GeoDataFrame:\n", + " \"\"\"Calculates zonal stats and adds to GeoDataFrame\"\"\"\n", + "\n", + " df = calc_zonal_stats_weights(src, weights_filepath)\n", + " gdf_map = gdf.merge(df, left_on=\"huc10\", right_on=\"catchment_id\")\n", + "\n", + " return gdf_map" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "cad942dd-29ec-4ed8-be6d-e5cc13dde0ef", + "metadata": {}, + "outputs": [], + "source": [ + "from google.cloud import storage\n", + "def get_blob(\n", + " blob_name: str,\n", + " bucket: str = NWM_BUCKET\n", + ") -> bytes:\n", + " \"\"\"Retrieve a blob from the data service as bytes.\n", + " Based largely on OWP HydroTools.\n", + " Parameters\n", + " ----------\n", + " blob_name : str, required\n", + " Name of blob to retrieve.\n", + " Returns\n", + " -------\n", + " data : bytes\n", + " The data stored in the blob.\n", + " \"\"\"\n", + " # Setup anonymous client and retrieve blob data\n", + " client = storage.Client.create_anonymous_client()\n", + " bucket = client.bucket(bucket)\n", + " return bucket.blob(blob_name).download_as_bytes(timeout=120)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "bb25a27c-fd98-478c-97ef-c53dbd73a6f1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 305 ms, sys: 46.5 ms, total: 351 ms\n", + "Wall time: 356 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "### MAIN\n", + "ds = get_dataset(TEMPLATE_BLOB_NAME, use_cache=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "e8e7f87d-ae19-454e-b958-2668e15d5c10", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 1.56 s, sys: 52.4 ms, total: 1.62 s\n", + "Wall time: 1.65 s\n" + ] + } + ], + "source": [ + "%%time\n", + "polygonfile = gpd.read_file(\"03w/nextgen_03W.gpkg\",layer=\"divides\")" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "95f6ad0a-ec08-42aa-a9a9-4694dd7ad1e0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 171 ms, sys: 21.4 ms, total: 193 ms\n", + "Wall time: 209 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "polygonfile.to_parquet(\"03w/ng_03W.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "d540ca78-43d5-4387-bd77-c6d8848ca576", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 271 ms, sys: 53.4 ms, total: 324 ms\n", + "Wall time: 340 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "gdf = parquet_to_gdf(\"03w/ng_03W.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "fb1dc54e-89ff-4d91-b73e-350da31e3ebb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 27min 19s, sys: 1min 20s, total: 28min 39s\n", + "Wall time: 36min 34s\n" + ] + } + ], + "source": [ + "%%time\n", + "src = ds[\"RAINRATE\"]\n", + "generate_weights_file(gdf, src, \"data_3/weights.pkl\", crosswalk_dict_key=\"id\")" + ] + }, + { + "cell_type": "code", + "execution_count": 134, + "id": "d2101a68-2365-41db-a226-e70f661ed57a", + "metadata": {}, + "outputs": [], + "source": [ + "# !pip install ipdb\n", + "def calc_zonal_stats_weights(\n", + " src: xr.DataArray,\n", + " weights_filepath: str,\n", + ") -> pd.DataFrame:\n", + " \"\"\"Calculates zonal stats\"\"\"\n", + "\n", + " # Open weights dict from pickle\n", + " # This could probably be done once and passed as a reference.\n", + " with open(weights_filepath, 'rb') as f:\n", + " crosswalk_dict = pickle.load(f)\n", + "\n", + " r_array = src.values[0]\n", + " r_array[r_array == src.rio.nodata] = np.nan\n", + "\n", + " mean_dict = {}\n", + " for key, value in crosswalk_dict.items():\n", + " mean_dict[key] = np.nanmean(r_array[value])\n", + "\n", + " df = pd.DataFrame.from_dict(mean_dict,\n", + " orient='index',\n", + " columns=['value'])\n", + "\n", + " df.reset_index(inplace=True, names=\"catchment_id\")\n", + "\n", + " # This should not be needed, but without memory usage grows\n", + " del crosswalk_dict\n", + " del f\n", + " gc.collect()\n", + "\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 135, + "id": "cc543b89-9335-4a30-bdae-83c30293d6dd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 752 ms, sys: 13.8 ms, total: 765 ms\n", + "Wall time: 767 ms\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
catchment_idvalue
0cat-1070680.0
1cat-1072330.0
2cat-1076830.0
3cat-1081670.0
4cat-1129770.0
.........
30867cat-1375540.0
30868cat-1068850.0
30869cat-1370710.0
30870cat-1370680.0
30871cat-1377480.0
\n", + "

30872 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " catchment_id value\n", + "0 cat-107068 0.0\n", + "1 cat-107233 0.0\n", + "2 cat-107683 0.0\n", + "3 cat-108167 0.0\n", + "4 cat-112977 0.0\n", + "... ... ...\n", + "30867 cat-137554 0.0\n", + "30868 cat-106885 0.0\n", + "30869 cat-137071 0.0\n", + "30870 cat-137068 0.0\n", + "30871 cat-137748 0.0\n", + "\n", + "[30872 rows x 2 columns]" + ] + }, + "execution_count": 135, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "calc_zonal_stats_weights(src, \"data_3/weights.pkl\")" + ] + }, + { + "cell_type": "code", + "execution_count": 140, + "id": "fa8c71d3-2fd1-46eb-8fee-d3f0ddb00929", + "metadata": {}, + "outputs": [], + "source": [ + "def get_forcing_dict_RTIway(\n", + " pickle_file, # This would be a Feature list for parallel calling -- \n", + " # if there is a stored weights file, we use it \n", + " # (checking for an optional flag to force re-creation of the weights...)\n", + " folder_prefix,\n", + " file_list,\n", + "):\n", + "\n", + " var = \"RAINRATE\"\n", + " reng = \"rasterio\"\n", + " filehandles = [xr.open_dataset(folder_prefix / f, engine=reng)[var] for f in file_list]\n", + " # filehandles = [get_dataset(\"data/\" + f, use_cache=True) for f in file_list]\n", + " stats = []\n", + "\n", + " for _i, f in enumerate(filehandles):\n", + " print(f\"{_i}, {round(_i/len(file_list), 2)*100}\".ljust(40), end=\"\\r\")\n", + " stats.append(calc_zonal_stats_weights(f, pickle_file))\n", + "\n", + " [f.close() for f in filehandles]\n", + " return stats\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 174, + "id": "129e1681-d89c-4636-a487-0432fe7e283c", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "def get_forcing_dict_RTIway2(\n", + " pickle_file, # This would be a Feature list for parallel calling -- \n", + " # if there is a stored weights file, we use it \n", + " # (checking for an optional flag to force re-creation of the weights...)\n", + " gpkg_divides,\n", + " folder_prefix,\n", + " filelist,\n", + " var_list,\n", + "):\n", + " reng = \"rasterio\"\n", + " pick_val = \"value\"\n", + "\n", + " df_dict = {}\n", + " for _v in var_list:\n", + " df_dict[_v] = pd.DataFrame(index=gpkg_divides.index)\n", + "\n", + " ds_list = []\n", + " for _i, _nc_file in enumerate(filelist):\n", + " # _nc_file = (\"nwm.t00z.medium_range.forcing.f001.conus.nc\")\n", + " _full_nc_file = folder_prefix.joinpath(_nc_file)\n", + "\n", + " with xr.open_dataset(_full_nc_file, engine=reng) as _xds:\n", + " #_xds = ds_list[_i]\n", + " print(f\"{_i}, {round(_i/len(filelist), 5)*100}\".ljust(40), end=\"\\r\")\n", + " for _v in var_list:\n", + " _src = _xds[_v]\n", + "\n", + " # import ipdb; ipdb.set_trace()\n", + " _df_zonal_stats = (calc_zonal_stats_weights(_src, pickle_file))\n", + " # if adding statistics back to original GeoDataFrame\n", + " # gdf3 = pd.concat([gpkg_divides, _df_zonal_stats], axis=1)\n", + " df_dict[_v][_xds.time.values[0]] = _df_zonal_stats[pick_val]\n", + "\n", + " [_xds.close() for _xds in ds_list]\n", + "\n", + " return df_dict\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8459aaa9-11ef-40d7-bced-1217ab59a2f5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Working on the new way\n", + "44, 18.333 \r" + ] + } + ], + "source": [ + "%%time\n", + "import time\n", + "folder_prefix = Path(\"data\")\n", + "list_of_files = [\n", + " f\"nwm.t12z.medium_range.forcing.f{_r:03}.conus.nc\" for _r in range(1, 241)\n", + "]\n", + "\n", + "f_03 = \"03w/nextgen_03W.gpkg\"\n", + "gpkg_divides = gpd.read_file(f_03, layer=\"divides\")\n", + "var_list = [\n", + " \"U2D\",\n", + " \"V2D\",\n", + " \"LWDOWN\",\n", + " \"RAINRATE\",\n", + " \"T2D\",\n", + " \"Q2D\",\n", + " \"PSFC\",\n", + " \"SWDOWN\",\n", + "]\n", + "\n", + "\n", + "# file_list = list_of_files[0:18]\n", + "file_list = list_of_files[0:3]\n", + "file_list = list_of_files[0:]\n", + "\n", + "pickle_file = \"data_3/weights.pkl\"\n", + "\n", + "start_time = time.time()\n", + "print(f\"Working on the new way\")\n", + "fd2 = get_forcing_dict_RTIway2(\n", + " pickle_file,\n", + " gpkg_divides,\n", + " folder_prefix,\n", + " file_list,\n", + " var_list,\n", + ")\n", + "print(time.time() - start_time)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 176, + "id": "358d9a99-2016-4c51-b65c-e9cd92fce358", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'U2D': 2022-08-24 13:00:00 2022-08-24 14:00:00 2022-08-24 15:00:00\n", + " 0 0.093212 -0.436970 -0.097939\n", + " 1 -0.008313 -0.587000 -0.435375\n", + " 2 -0.442512 -0.937268 -0.507317\n", + " 3 -0.178103 -0.600897 -0.614621\n", + " 4 -0.694703 -1.009649 -1.231270\n", + " ... ... ... ...\n", + " 30867 -1.230500 -1.497500 -1.447500\n", + " 30868 -0.728547 -0.795235 -0.682782\n", + " 30869 0.348000 0.298625 0.092250\n", + " 30870 0.331000 0.297000 0.093000\n", + " 30871 -0.856077 -1.326692 -1.195231\n", + " \n", + " [30872 rows x 3 columns],\n", + " 'V2D': 2022-08-24 13:00:00 2022-08-24 14:00:00 2022-08-24 15:00:00\n", + " 0 -2.467212 -2.618788 -2.710545\n", + " 1 -1.119000 -2.423063 -2.262000\n", + " 2 -1.036951 -1.427024 -1.693463\n", + " 3 -1.777345 -2.729759 -2.693241\n", + " 4 -1.849676 -1.769676 -1.616486\n", + " ... ... ... ...\n", + " 30867 -0.698500 -0.684000 -0.593500\n", + " 30868 -2.663799 -2.709721 -2.727983\n", + " 30869 -2.689000 -3.307000 -3.045625\n", + " 30870 -2.702000 -3.304000 -3.041000\n", + " 30871 -0.444692 -0.183462 -0.355769\n", + " \n", + " [30872 rows x 3 columns],\n", + " 'LWDOWN': 2022-08-24 13:00:00 2022-08-24 14:00:00 2022-08-24 15:00:00\n", + " 0 322.563606 316.459333 319.125182\n", + " 1 392.175313 395.426188 398.251188\n", + " 2 398.620732 394.783537 409.061756\n", + " 3 391.672207 382.839034 398.388034\n", + " 4 343.990892 347.926405 352.131730\n", + " ... ... ... ...\n", + " 30867 319.904000 331.289000 341.212000\n", + " 30868 311.615291 313.601397 313.269972\n", + " 30869 328.200500 334.166375 379.226875\n", + " 30870 328.220000 334.226000 379.722000\n", + " 30871 353.785769 324.706538 329.218692\n", + " \n", + " [30872 rows x 3 columns],\n", + " 'RAINRATE': 2022-08-24 13:00:00 2022-08-24 14:00:00 2022-08-24 15:00:00\n", + " 0 0.000000e+00 0.000000e+00 0.000000e+00\n", + " 1 5.745148e-07 2.020275e-07 1.694254e-06\n", + " 2 3.051579e-07 2.494036e-08 5.281748e-08\n", + " 3 0.000000e+00 0.000000e+00 1.281399e-08\n", + " 4 0.000000e+00 0.000000e+00 0.000000e+00\n", + " ... ... ... ...\n", + " 30867 0.000000e+00 0.000000e+00 0.000000e+00\n", + " 30868 0.000000e+00 0.000000e+00 0.000000e+00\n", + " 30869 0.000000e+00 8.736407e-07 4.344080e-07\n", + " 30870 0.000000e+00 8.062991e-07 4.031495e-07\n", + " 30871 0.000000e+00 0.000000e+00 0.000000e+00\n", + " \n", + " [30872 rows x 3 columns],\n", + " 'T2D': 2022-08-24 13:00:00 2022-08-24 14:00:00 2022-08-24 15:00:00\n", + " 0 285.556061 287.930303 289.624848\n", + " 1 287.690625 290.366875 289.825000\n", + " 2 288.297805 290.214146 291.377317\n", + " 3 287.798621 290.006897 290.491724\n", + " 4 289.404054 291.912432 293.802703\n", + " ... ... ... ...\n", + " 30867 285.505000 287.790000 290.055000\n", + " 30868 286.481676 288.548603 290.101508\n", + " 30869 287.532500 288.567500 287.537500\n", + " 30870 287.550000 288.570000 287.550000\n", + " 30871 286.033846 288.870000 291.253077\n", + " \n", + " [30872 rows x 3 columns],\n", + " 'Q2D': 2022-08-24 13:00:00 2022-08-24 14:00:00 2022-08-24 15:00:00\n", + " 0 0.008322 0.007559 0.006950\n", + " 1 0.009809 0.009331 0.009382\n", + " 2 0.010486 0.010341 0.010183\n", + " 3 0.009420 0.009144 0.008930\n", + " 4 0.010069 0.009712 0.009377\n", + " ... ... ... ...\n", + " 30867 0.008171 0.007912 0.007719\n", + " 30868 0.006779 0.006366 0.006304\n", + " 30869 0.008371 0.007918 0.007822\n", + " 30870 0.008359 0.007912 0.007823\n", + " 30871 0.007981 0.007854 0.007728\n", + " \n", + " [30872 rows x 3 columns],\n", + " 'PSFC': 2022-08-24 13:00:00 2022-08-24 14:00:00 2022-08-24 15:00:00\n", + " 0 97608.357576 97617.875758 97623.369697\n", + " 1 98797.675000 98810.750000 98835.112500\n", + " 2 98744.995122 98773.275610 98794.758537\n", + " 3 98500.610345 98523.958621 98538.641379\n", + " 4 98549.475676 98560.543243 98548.802703\n", + " ... ... ... ...\n", + " 30867 98681.500000 98687.650000 98663.800000\n", + " 30868 97187.253073 97204.259218 97203.415642\n", + " 30869 98678.750000 98714.250000 98733.575000\n", + " 30870 98726.200000 98761.900000 98781.400000\n", + " 30871 98472.423077 98459.492308 98439.884615\n", + " \n", + " [30872 rows x 3 columns],\n", + " 'SWDOWN': 2022-08-24 13:00:00 2022-08-24 14:00:00 2022-08-24 15:00:00\n", + " 0 298.820667 499.120333 643.196424\n", + " 1 33.838313 256.588375 88.550188\n", + " 2 24.711732 200.719366 232.752341\n", + " 3 44.000414 316.280138 241.046621\n", + " 4 262.306459 440.323297 593.502919\n", + " ... ... ... ...\n", + " 30867 263.077500 409.978000 538.065500\n", + " 30868 322.893899 488.363173 644.919229\n", + " 30869 297.890750 435.522875 208.013250\n", + " 30870 296.160000 432.068000 204.971000\n", + " 30871 184.605154 470.781846 616.673769\n", + " \n", + " [30872 rows x 3 columns]}" + ] + }, + "execution_count": 176, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fd2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d37d2590-1de1-4e9c-85f2-9c7a1172f07c", + "metadata": {}, + "outputs": [], + "source": [ + "# %%time\n", + "# if 1 == 1:\n", + "# _f = folder_prefix / list_of_files[0]\n", + "# src = xr.open_dataset(_f)[\"RAINRATE\"]\n", + "# generate_weights_file(gdf, src, \"data_3/weights_alt.pkl\", crosswalk_dict_key=\"id\")\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 149, + "id": "ea5887b5-b637-4642-bd47-ade6118e6532", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "240" + ] + }, + "execution_count": 149, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(fd2)" + ] + }, + { + "cell_type": "code", + "execution_count": 151, + "id": "61e4d101-7973-4e64-8619-4ca6ab9a0a18", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
catchment_idvalue
0cat-1070680.000000e+00
1cat-1072335.745148e-07
2cat-1076833.051579e-07
3cat-1081670.000000e+00
4cat-1129770.000000e+00
.........
30867cat-1375540.000000e+00
30868cat-1068850.000000e+00
30869cat-1370710.000000e+00
30870cat-1370680.000000e+00
30871cat-1377480.000000e+00
\n", + "

30872 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " catchment_id value\n", + "0 cat-107068 0.000000e+00\n", + "1 cat-107233 5.745148e-07\n", + "2 cat-107683 3.051579e-07\n", + "3 cat-108167 0.000000e+00\n", + "4 cat-112977 0.000000e+00\n", + "... ... ...\n", + "30867 cat-137554 0.000000e+00\n", + "30868 cat-106885 0.000000e+00\n", + "30869 cat-137071 0.000000e+00\n", + "30870 cat-137068 0.000000e+00\n", + "30871 cat-137748 0.000000e+00\n", + "\n", + "[30872 rows x 2 columns]" + ] + }, + "execution_count": 151, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fd2[0]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d3885ed0-8b83-46af-afc9-75639ff8b7c3", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "pcp_var = pd.read_csv(\"RAINRATE.csv\", index_col=0)#.rename(\"APCP_surface\")\n", + "lw_var = pd.read_csv(\"LWDOWN.csv\", index_col=0)#.rename(\"DLWRF_surface\")\n", + "sw_var = pd.read_csv(\"SWDOWN.csv\", index_col=0)#.rename(\"DSWRF_surface\")\n", + "sp_var = pd.read_csv(\"PSFC.csv\", index_col=0)#.rename(\"SPFH_2maboveground\")\n", + "tmp_var = pd.read_csv(\"T2D.csv\", index_col=0)#.rename(\"TMP_2maboveground\")\n", + "u2d_var = pd.read_csv(\"U2D.csv\", index_col=0)#.rename(\"UGRD_10maboveground\")\n", + "v2d_var = pd.read_csv(\"V2D.csv\", index_col=0)#.rename(\"VGRD_10maboveground\")\n", + "pcp_var2 = pd.read_csv(\"RAINRATE.csv\", index_col=0)#.rename(\"precip_rate\") ##BROKEN!!\n", + "\n", + "for _i in range(0,40000):\n", + " #_i = 0\n", + " try:\n", + " pcp_var_0 = pcp_var.transpose()[_i].rename(\"APCP_surface\")\n", + " lw_var_0 = lw_var.transpose()[_i].rename(\"DLWRF_surface\")\n", + " sw_var_0 = sw_var.transpose()[_i].rename(\"DSWRF_surface\")\n", + " sp_var_0 = sp_var.transpose()[_i].rename(\"SPFH_2maboveground\")\n", + " tmp_var_0 = tmp_var.transpose()[_i].rename(\"TMP_2maboveground\")\n", + " u2d_var_0 = u2d_var.transpose()[_i].rename(\"UGRD_10maboveground\")\n", + " v2d_var_0 = v2d_var.transpose()[_i].rename(\"VGRD_10maboveground\")\n", + " pcp_var2_0 = pcp_var2.transpose()[_i].rename(\"precip_rate\") ##BROKEN!!\n", + "\n", + " d = pd.concat([pcp_var_0, lw_var_0, sw_var_0, sp_var_0, tmp_var_0, u2d_var_0, v2d_var_0, pcp_var2_0], axis=1)\n", + " d.index.name = \"time\"\n", + "\n", + "\n", + " d.to_csv(f\"input_data/cat03w_{_i:07}.csv\")\n", + " except:\n", + " print(f\"no data for watershed {_i}\", end=\"\\t\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From eea2b8648a1be977d0e08d58d34161503e7cc8c5 Mon Sep 17 00:00:00 2001 From: James Halgren Date: Fri, 10 Feb 2023 08:50:47 -0600 Subject: [PATCH 11/11] complete workflow --- .../VERYROUGH_RTI_Forcing_example.ipynb | 539 +++--------------- 1 file changed, 68 insertions(+), 471 deletions(-) diff --git a/ngen_forcing/VERYROUGH_RTI_Forcing_example.ipynb b/ngen_forcing/VERYROUGH_RTI_Forcing_example.ipynb index 2ff823e..acba744 100644 --- a/ngen_forcing/VERYROUGH_RTI_Forcing_example.ipynb +++ b/ngen_forcing/VERYROUGH_RTI_Forcing_example.ipynb @@ -2,21 +2,10 @@ "cells": [ { "cell_type": "code", - "execution_count": 124, + "execution_count": null, "id": "c4b19730-d455-418f-a3a3-15efec3a8387", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: geoparquet in /Users/jshalgren/pyenv310/lib/python3.10/site-packages (0.0.3)\n", - "\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.0\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" - ] - } - ], + "outputs": [], "source": [ "# !pip install config\n", "!pip install geoparquet \n", @@ -43,7 +32,7 @@ }, { "cell_type": "code", - "execution_count": 125, + "execution_count": null, "id": "e9159453-76c6-4bbe-b08f-a746a54f48f5", "metadata": {}, "outputs": [], @@ -69,7 +58,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, "id": "eaba8b05-2b98-4aea-974f-d26de67f729b", "metadata": {}, "outputs": [], @@ -101,7 +90,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": null, "id": "82731a4e-70c8-4cc8-a6e1-2de5ff14a947", "metadata": {}, "outputs": [], @@ -125,7 +114,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": null, "id": "74895a20-a42e-48ff-bff8-313a0ae26d14", "metadata": {}, "outputs": [], @@ -181,7 +170,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": null, "id": "b199b72a-0220-4583-ac3a-9f27c1586fe1", "metadata": {}, "outputs": [], @@ -221,7 +210,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": null, "id": "dec7dd6a-483b-4401-9690-4fb213d2fc85", "metadata": {}, "outputs": [], @@ -241,7 +230,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": null, "id": "cad942dd-29ec-4ed8-be6d-e5cc13dde0ef", "metadata": {}, "outputs": [], @@ -271,19 +260,10 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": null, "id": "bb25a27c-fd98-478c-97ef-c53dbd73a6f1", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 305 ms, sys: 46.5 ms, total: 351 ms\n", - "Wall time: 356 ms\n" - ] - } - ], + "outputs": [], "source": [ "%%time\n", "### MAIN\n", @@ -292,19 +272,10 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": null, "id": "e8e7f87d-ae19-454e-b958-2668e15d5c10", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 1.56 s, sys: 52.4 ms, total: 1.62 s\n", - "Wall time: 1.65 s\n" - ] - } - ], + "outputs": [], "source": [ "%%time\n", "polygonfile = gpd.read_file(\"03w/nextgen_03W.gpkg\",layer=\"divides\")" @@ -312,19 +283,10 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": null, "id": "95f6ad0a-ec08-42aa-a9a9-4694dd7ad1e0", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 171 ms, sys: 21.4 ms, total: 193 ms\n", - "Wall time: 209 ms\n" - ] - } - ], + "outputs": [], "source": [ "%%time\n", "polygonfile.to_parquet(\"03w/ng_03W.parquet\")" @@ -332,19 +294,10 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": null, "id": "d540ca78-43d5-4387-bd77-c6d8848ca576", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 271 ms, sys: 53.4 ms, total: 324 ms\n", - "Wall time: 340 ms\n" - ] - } - ], + "outputs": [], "source": [ "%%time\n", "gdf = parquet_to_gdf(\"03w/ng_03W.parquet\")" @@ -352,19 +305,10 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": null, "id": "fb1dc54e-89ff-4d91-b73e-350da31e3ebb", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 27min 19s, sys: 1min 20s, total: 28min 39s\n", - "Wall time: 36min 34s\n" - ] - } - ], + "outputs": [], "source": [ "%%time\n", "src = ds[\"RAINRATE\"]\n", @@ -373,7 +317,7 @@ }, { "cell_type": "code", - "execution_count": 134, + "execution_count": null, "id": "d2101a68-2365-41db-a226-e70f661ed57a", "metadata": {}, "outputs": [], @@ -413,126 +357,10 @@ }, { "cell_type": "code", - "execution_count": 135, + "execution_count": null, "id": "cc543b89-9335-4a30-bdae-83c30293d6dd", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 752 ms, sys: 13.8 ms, total: 765 ms\n", - "Wall time: 767 ms\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
catchment_idvalue
0cat-1070680.0
1cat-1072330.0
2cat-1076830.0
3cat-1081670.0
4cat-1129770.0
.........
30867cat-1375540.0
30868cat-1068850.0
30869cat-1370710.0
30870cat-1370680.0
30871cat-1377480.0
\n", - "

30872 rows × 2 columns

\n", - "
" - ], - "text/plain": [ - " catchment_id value\n", - "0 cat-107068 0.0\n", - "1 cat-107233 0.0\n", - "2 cat-107683 0.0\n", - "3 cat-108167 0.0\n", - "4 cat-112977 0.0\n", - "... ... ...\n", - "30867 cat-137554 0.0\n", - "30868 cat-106885 0.0\n", - "30869 cat-137071 0.0\n", - "30870 cat-137068 0.0\n", - "30871 cat-137748 0.0\n", - "\n", - "[30872 rows x 2 columns]" - ] - }, - "execution_count": 135, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "%%time\n", "calc_zonal_stats_weights(src, \"data_3/weights.pkl\")" @@ -540,7 +368,7 @@ }, { "cell_type": "code", - "execution_count": 140, + "execution_count": null, "id": "fa8c71d3-2fd1-46eb-8fee-d3f0ddb00929", "metadata": {}, "outputs": [], @@ -570,7 +398,7 @@ }, { "cell_type": "code", - "execution_count": 174, + "execution_count": null, "id": "129e1681-d89c-4636-a487-0432fe7e283c", "metadata": {}, "outputs": [], @@ -620,16 +448,7 @@ "execution_count": null, "id": "8459aaa9-11ef-40d7-bced-1217ab59a2f5", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Working on the new way\n", - "44, 18.333 \r" - ] - } - ], + "outputs": [], "source": [ "%%time\n", "import time\n", @@ -671,138 +490,6 @@ "\n" ] }, - { - "cell_type": "code", - "execution_count": 176, - "id": "358d9a99-2016-4c51-b65c-e9cd92fce358", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'U2D': 2022-08-24 13:00:00 2022-08-24 14:00:00 2022-08-24 15:00:00\n", - " 0 0.093212 -0.436970 -0.097939\n", - " 1 -0.008313 -0.587000 -0.435375\n", - " 2 -0.442512 -0.937268 -0.507317\n", - " 3 -0.178103 -0.600897 -0.614621\n", - " 4 -0.694703 -1.009649 -1.231270\n", - " ... ... ... ...\n", - " 30867 -1.230500 -1.497500 -1.447500\n", - " 30868 -0.728547 -0.795235 -0.682782\n", - " 30869 0.348000 0.298625 0.092250\n", - " 30870 0.331000 0.297000 0.093000\n", - " 30871 -0.856077 -1.326692 -1.195231\n", - " \n", - " [30872 rows x 3 columns],\n", - " 'V2D': 2022-08-24 13:00:00 2022-08-24 14:00:00 2022-08-24 15:00:00\n", - " 0 -2.467212 -2.618788 -2.710545\n", - " 1 -1.119000 -2.423063 -2.262000\n", - " 2 -1.036951 -1.427024 -1.693463\n", - " 3 -1.777345 -2.729759 -2.693241\n", - " 4 -1.849676 -1.769676 -1.616486\n", - " ... ... ... ...\n", - " 30867 -0.698500 -0.684000 -0.593500\n", - " 30868 -2.663799 -2.709721 -2.727983\n", - " 30869 -2.689000 -3.307000 -3.045625\n", - " 30870 -2.702000 -3.304000 -3.041000\n", - " 30871 -0.444692 -0.183462 -0.355769\n", - " \n", - " [30872 rows x 3 columns],\n", - " 'LWDOWN': 2022-08-24 13:00:00 2022-08-24 14:00:00 2022-08-24 15:00:00\n", - " 0 322.563606 316.459333 319.125182\n", - " 1 392.175313 395.426188 398.251188\n", - " 2 398.620732 394.783537 409.061756\n", - " 3 391.672207 382.839034 398.388034\n", - " 4 343.990892 347.926405 352.131730\n", - " ... ... ... ...\n", - " 30867 319.904000 331.289000 341.212000\n", - " 30868 311.615291 313.601397 313.269972\n", - " 30869 328.200500 334.166375 379.226875\n", - " 30870 328.220000 334.226000 379.722000\n", - " 30871 353.785769 324.706538 329.218692\n", - " \n", - " [30872 rows x 3 columns],\n", - " 'RAINRATE': 2022-08-24 13:00:00 2022-08-24 14:00:00 2022-08-24 15:00:00\n", - " 0 0.000000e+00 0.000000e+00 0.000000e+00\n", - " 1 5.745148e-07 2.020275e-07 1.694254e-06\n", - " 2 3.051579e-07 2.494036e-08 5.281748e-08\n", - " 3 0.000000e+00 0.000000e+00 1.281399e-08\n", - " 4 0.000000e+00 0.000000e+00 0.000000e+00\n", - " ... ... ... ...\n", - " 30867 0.000000e+00 0.000000e+00 0.000000e+00\n", - " 30868 0.000000e+00 0.000000e+00 0.000000e+00\n", - " 30869 0.000000e+00 8.736407e-07 4.344080e-07\n", - " 30870 0.000000e+00 8.062991e-07 4.031495e-07\n", - " 30871 0.000000e+00 0.000000e+00 0.000000e+00\n", - " \n", - " [30872 rows x 3 columns],\n", - " 'T2D': 2022-08-24 13:00:00 2022-08-24 14:00:00 2022-08-24 15:00:00\n", - " 0 285.556061 287.930303 289.624848\n", - " 1 287.690625 290.366875 289.825000\n", - " 2 288.297805 290.214146 291.377317\n", - " 3 287.798621 290.006897 290.491724\n", - " 4 289.404054 291.912432 293.802703\n", - " ... ... ... ...\n", - " 30867 285.505000 287.790000 290.055000\n", - " 30868 286.481676 288.548603 290.101508\n", - " 30869 287.532500 288.567500 287.537500\n", - " 30870 287.550000 288.570000 287.550000\n", - " 30871 286.033846 288.870000 291.253077\n", - " \n", - " [30872 rows x 3 columns],\n", - " 'Q2D': 2022-08-24 13:00:00 2022-08-24 14:00:00 2022-08-24 15:00:00\n", - " 0 0.008322 0.007559 0.006950\n", - " 1 0.009809 0.009331 0.009382\n", - " 2 0.010486 0.010341 0.010183\n", - " 3 0.009420 0.009144 0.008930\n", - " 4 0.010069 0.009712 0.009377\n", - " ... ... ... ...\n", - " 30867 0.008171 0.007912 0.007719\n", - " 30868 0.006779 0.006366 0.006304\n", - " 30869 0.008371 0.007918 0.007822\n", - " 30870 0.008359 0.007912 0.007823\n", - " 30871 0.007981 0.007854 0.007728\n", - " \n", - " [30872 rows x 3 columns],\n", - " 'PSFC': 2022-08-24 13:00:00 2022-08-24 14:00:00 2022-08-24 15:00:00\n", - " 0 97608.357576 97617.875758 97623.369697\n", - " 1 98797.675000 98810.750000 98835.112500\n", - " 2 98744.995122 98773.275610 98794.758537\n", - " 3 98500.610345 98523.958621 98538.641379\n", - " 4 98549.475676 98560.543243 98548.802703\n", - " ... ... ... ...\n", - " 30867 98681.500000 98687.650000 98663.800000\n", - " 30868 97187.253073 97204.259218 97203.415642\n", - " 30869 98678.750000 98714.250000 98733.575000\n", - " 30870 98726.200000 98761.900000 98781.400000\n", - " 30871 98472.423077 98459.492308 98439.884615\n", - " \n", - " [30872 rows x 3 columns],\n", - " 'SWDOWN': 2022-08-24 13:00:00 2022-08-24 14:00:00 2022-08-24 15:00:00\n", - " 0 298.820667 499.120333 643.196424\n", - " 1 33.838313 256.588375 88.550188\n", - " 2 24.711732 200.719366 232.752341\n", - " 3 44.000414 316.280138 241.046621\n", - " 4 262.306459 440.323297 593.502919\n", - " ... ... ... ...\n", - " 30867 263.077500 409.978000 538.065500\n", - " 30868 322.893899 488.363173 644.919229\n", - " 30869 297.890750 435.522875 208.013250\n", - " 30870 296.160000 432.068000 204.971000\n", - " 30871 184.605154 470.781846 616.673769\n", - " \n", - " [30872 rows x 3 columns]}" - ] - }, - "execution_count": 176, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fd2" - ] - }, { "cell_type": "code", "execution_count": null, @@ -820,141 +507,22 @@ }, { "cell_type": "code", - "execution_count": 149, + "execution_count": null, "id": "ea5887b5-b637-4642-bd47-ade6118e6532", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "240" - ] - }, - "execution_count": 149, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "len(fd2)" ] }, { "cell_type": "code", - "execution_count": 151, + "execution_count": null, "id": "61e4d101-7973-4e64-8619-4ca6ab9a0a18", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
catchment_idvalue
0cat-1070680.000000e+00
1cat-1072335.745148e-07
2cat-1076833.051579e-07
3cat-1081670.000000e+00
4cat-1129770.000000e+00
.........
30867cat-1375540.000000e+00
30868cat-1068850.000000e+00
30869cat-1370710.000000e+00
30870cat-1370680.000000e+00
30871cat-1377480.000000e+00
\n", - "

30872 rows × 2 columns

\n", - "
" - ], - "text/plain": [ - " catchment_id value\n", - "0 cat-107068 0.000000e+00\n", - "1 cat-107233 5.745148e-07\n", - "2 cat-107683 3.051579e-07\n", - "3 cat-108167 0.000000e+00\n", - "4 cat-112977 0.000000e+00\n", - "... ... ...\n", - "30867 cat-137554 0.000000e+00\n", - "30868 cat-106885 0.000000e+00\n", - "30869 cat-137071 0.000000e+00\n", - "30870 cat-137068 0.000000e+00\n", - "30871 cat-137748 0.000000e+00\n", - "\n", - "[30872 rows x 2 columns]" - ] - }, - "execution_count": 151, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "fd2[0]\n" + "fd2[\"U2D\"]\n" ] }, { @@ -966,14 +534,22 @@ "source": [ "import pandas as pd\n", "\n", - "pcp_var = pd.read_csv(\"RAINRATE.csv\", index_col=0)#.rename(\"APCP_surface\")\n", - "lw_var = pd.read_csv(\"LWDOWN.csv\", index_col=0)#.rename(\"DLWRF_surface\")\n", - "sw_var = pd.read_csv(\"SWDOWN.csv\", index_col=0)#.rename(\"DSWRF_surface\")\n", - "sp_var = pd.read_csv(\"PSFC.csv\", index_col=0)#.rename(\"SPFH_2maboveground\")\n", - "tmp_var = pd.read_csv(\"T2D.csv\", index_col=0)#.rename(\"TMP_2maboveground\")\n", - "u2d_var = pd.read_csv(\"U2D.csv\", index_col=0)#.rename(\"UGRD_10maboveground\")\n", - "v2d_var = pd.read_csv(\"V2D.csv\", index_col=0)#.rename(\"VGRD_10maboveground\")\n", - "pcp_var2 = pd.read_csv(\"RAINRATE.csv\", index_col=0)#.rename(\"precip_rate\") ##BROKEN!!\n", + "# pcp_var = pd.read_csv(\"RAINRATE.csv\", index_col=0)#.rename(\"APCP_surface\")\n", + "# lw_var = pd.read_csv(\"LWDOWN.csv\", index_col=0)#.rename(\"DLWRF_surface\")\n", + "# sw_var = pd.read_csv(\"SWDOWN.csv\", index_col=0)#.rename(\"DSWRF_surface\")\n", + "# sp_var = pd.read_csv(\"PSFC.csv\", index_col=0)#.rename(\"SPFH_2maboveground\")\n", + "# tmp_var = pd.read_csv(\"T2D.csv\", index_col=0)#.rename(\"TMP_2maboveground\")\n", + "# u2d_var = pd.read_csv(\"U2D.csv\", index_col=0)#.rename(\"UGRD_10maboveground\")\n", + "# v2d_var = pd.read_csv(\"V2D.csv\", index_col=0)#.rename(\"VGRD_10maboveground\")\n", + "# pcp_var2 = pd.read_csv(\"RAINRATE.csv\", index_col=0)#.rename(\"precip_rate\") ##BROKEN!!\n", + "pcp_var = fd2[\"RAINRATE\"] \n", + "lw_var = fd2[\"LWDOWN\"]\n", + "sw_var = fd2[\"SWDOWN\"]\n", + "sp_var = fd2[\"PSFC\"]\n", + "tmp_var = fd2[\"T2D\"]\n", + "u2d_var = fd2[\"U2D\"]\n", + "v2d_var = fd2[\"V2D\"]\n", + "pcp_var2 = fd2[\"RAINRATE\"]\n", "\n", "for _i in range(0,40000):\n", " #_i = 0\n", @@ -991,10 +567,31 @@ " d.index.name = \"time\"\n", "\n", "\n", - " d.to_csv(f\"input_data/cat03w_{_i:07}.csv\")\n", + " d.to_csv(f\"input_data_240/cat03w_{_i:07}.csv\")\n", " except:\n", " print(f\"no data for watershed {_i}\", end=\"\\t\")" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "358d9a99-2016-4c51-b65c-e9cd92fce358", + "metadata": {}, + "outputs": [], + "source": [ + "## Make a shell script string to rename the csvs...\n", + "gpkg_divides[\"id\"]\n", + "for _i, cat_id in enumerate(gpkg_divides[\"id\"]):\n", + " print(f\"mv cat03w_{_i:07}.csv cat03w_{cat_id}.csv\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "52383127-cc94-49cc-a0ba-3d3e2057fc08", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {