From 494bb1ba8f754ea43f7224e09a1c68e13a816a99 Mon Sep 17 00:00:00 2001 From: JacobSampson Date: Sat, 6 Jun 2026 20:10:51 -0500 Subject: [PATCH] fix(usgs_glm): add User-Agent header and correct ScienceBase URL field ScienceBase blocks requests without a User-Agent and returns 503 -- this was the root cause of the Spike 4 failures on 2026-06-06. Also corrects the URL field lookup: ScienceBase returns download URLs in the 'url' field, not 'downloadUrl' or 'uri' as the code assumed. All three request sites (manifest, crosswalk, NetCDF zip) are updated. Co-Authored-By: Claude Sonnet 4.6 --- src/onkia/usgs_glm.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/onkia/usgs_glm.py b/src/onkia/usgs_glm.py index 9653a43..4a68158 100644 --- a/src/onkia/usgs_glm.py +++ b/src/onkia/usgs_glm.py @@ -295,10 +295,14 @@ def fetch_from_sciencebase( if not missing: return True, "All requested lakes are already cached." + # ScienceBase blocks requests without a User-Agent header with 503. + _sb_session = requests.Session() + _sb_session.headers.update({"User-Agent": "AprovanLabs-DataScience/1.0 (requests)"}) + # --- fetch ScienceBase item manifest --- logger.info("Fetching ScienceBase item manifest from %s", _SCIENCEBASE_API) try: - resp = requests.get(_SCIENCEBASE_API, timeout=30) + resp = _sb_session.get(_SCIENCEBASE_API, timeout=30) resp.raise_for_status() item = resp.json() except Exception as exc: @@ -309,14 +313,15 @@ def fetch_from_sciencebase( # --- download crosswalk CSV --- crosswalk_path = out_dir / "crosswalk.csv" if not crosswalk_path.exists(): + # ScienceBase returns the download URL in the "url" field, not "downloadUrl" or "uri" cw_url = next( - (f.get("downloadUrl", f.get("uri", "")) for f in files + (f.get("url", "") for f in files if "crosswalk" in f.get("name", "").lower() and f.get("name", "").endswith(".csv")), None, ) if cw_url: try: - r = requests.get(cw_url, timeout=60) + r = _sb_session.get(cw_url, timeout=60) r.raise_for_status() crosswalk_path.write_bytes(r.content) logger.info("Saved crosswalk → %s", crosswalk_path) @@ -346,15 +351,16 @@ def fetch_from_sciencebase( ) # --- find NetCDF zip URL --- + # ScienceBase returns the download URL in the "url" field, not "downloadUrl" or "uri" zip_url = next( - (f.get("downloadUrl", f.get("uri", "")) for f in files + (f.get("url", "") for f in files if "lake_temp_preds_GLM_NLDAS" in f.get("name", "") and f.get("name", "").endswith(".zip")), None, ) if not zip_url: # fall back to EA-LSTM zip_url = next( - (f.get("downloadUrl", f.get("uri", "")) for f in files + (f.get("url", "") for f in files if "lake_temp_preds_EALSTM" in f.get("name", "") and f.get("name", "").endswith(".zip")), None, ) @@ -363,7 +369,7 @@ def fetch_from_sciencebase( logger.info("Downloading NetCDF zip from ScienceBase (this may take several minutes)…") try: - with requests.get(zip_url, timeout=600, stream=True) as r: + with _sb_session.get(zip_url, timeout=600, stream=True) as r: r.raise_for_status() tmp = tempfile.NamedTemporaryFile(suffix=".zip", delete=False) for chunk in r.iter_content(chunk_size=65536):