From 494bb1ba8f754ea43f7224e09a1c68e13a816a99 Mon Sep 17 00:00:00 2001
From: JacobSampson <jacob.samps@gmail.com>
Date: Sat, 6 Jun 2026 20:10:51 -0500
Subject: [PATCH] fix(usgs_glm): add User-Agent header and correct ScienceBase
 URL field

ScienceBase answers requests that lack a User-Agent header with HTTP
503 -- this was the root cause of the Spike 4 failures on 2026-06-06.

Also corrects the URL field lookup: ScienceBase returns download URLs
in the 'url' field, not 'downloadUrl' or 'uri' as the code assumed.
All three request sites (manifest, crosswalk, NetCDF zip) now go
through a shared session that sends the User-Agent header.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/onkia/usgs_glm.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/onkia/usgs_glm.py b/src/onkia/usgs_glm.py
index 9653a43..4a68158 100644
--- a/src/onkia/usgs_glm.py
+++ b/src/onkia/usgs_glm.py
@@ -295,10 +295,14 @@ def fetch_from_sciencebase(
     if not missing:
         return True, "All requested lakes are already cached."
 
+    # ScienceBase blocks requests without a User-Agent header with 503.
+    _sb_session = requests.Session()
+    _sb_session.headers.update({"User-Agent": "AprovanLabs-DataScience/1.0 (requests)"})
+
     # --- fetch ScienceBase item manifest ---
     logger.info("Fetching ScienceBase item manifest from %s", _SCIENCEBASE_API)
     try:
-        resp = requests.get(_SCIENCEBASE_API, timeout=30)
+        resp = _sb_session.get(_SCIENCEBASE_API, timeout=30)
         resp.raise_for_status()
         item = resp.json()
     except Exception as exc:
@@ -309,14 +313,15 @@ def fetch_from_sciencebase(
     # --- download crosswalk CSV ---
     crosswalk_path = out_dir / "crosswalk.csv"
     if not crosswalk_path.exists():
+        # ScienceBase returns the download URL in the "url" field, not "downloadUrl" or "uri"
         cw_url = next(
-            (f.get("downloadUrl", f.get("uri", "")) for f in files
+            (f.get("url", "") for f in files
              if "crosswalk" in f.get("name", "").lower() and f.get("name", "").endswith(".csv")),
             None,
         )
         if cw_url:
             try:
-                r = requests.get(cw_url, timeout=60)
+                r = _sb_session.get(cw_url, timeout=60)
                 r.raise_for_status()
                 crosswalk_path.write_bytes(r.content)
                 logger.info("Saved crosswalk → %s", crosswalk_path)
@@ -346,15 +351,16 @@ def fetch_from_sciencebase(
         )
 
     # --- find NetCDF zip URL ---
+    # ScienceBase returns the download URL in the "url" field, not "downloadUrl" or "uri"
     zip_url = next(
-        (f.get("downloadUrl", f.get("uri", "")) for f in files
+        (f.get("url", "") for f in files
          if "lake_temp_preds_GLM_NLDAS" in f.get("name", "") and f.get("name", "").endswith(".zip")),
         None,
     )
     if not zip_url:
         # fall back to EA-LSTM
         zip_url = next(
-            (f.get("downloadUrl", f.get("uri", "")) for f in files
+            (f.get("url", "") for f in files
              if "lake_temp_preds_EALSTM" in f.get("name", "") and f.get("name", "").endswith(".zip")),
             None,
         )
@@ -363,7 +369,7 @@ def fetch_from_sciencebase(
 
     logger.info("Downloading NetCDF zip from ScienceBase (this may take several minutes)…")
     try:
-        with requests.get(zip_url, timeout=600, stream=True) as r:
+        with _sb_session.get(zip_url, timeout=600, stream=True) as r:
             r.raise_for_status()
             tmp = tempfile.NamedTemporaryFile(suffix=".zip", delete=False)
             for chunk in r.iter_content(chunk_size=65536):