From 508460a427b7b5691a902a8e0ef71ed542de3a4f Mon Sep 17 00:00:00 2001
From: Jonathan St-Onge <jstonge1@uvm.edu>
Date: Wed, 4 Mar 2026 09:51:30 -0500
Subject: [PATCH 1/4] add /search-terms2 batch endpoint, parquet_hive adapter lookup

---
 backend/app/routers/datalakes.py | 328 +++++++++++++++++++++++++++++--
 1 file changed, 315 insertions(+), 13 deletions(-)

diff --git a/backend/app/routers/datalakes.py b/backend/app/routers/datalakes.py
index bbd812a..1371c5a 100644
--- a/backend/app/routers/datalakes.py
+++ b/backend/app/routers/datalakes.py
@@ -229,6 +229,304 @@ async def register_datalake(
             }
         }
 
+@router.get("/search-terms2")
+async def search_terms_batch(
+    types: str = Query(..., description="Comma-separated list of ngram terms"),
+    date: Optional[str] = Query(None, description="First system focus date (YYYY-MM-DD)"),
+    date2: Optional[str] = Query(None, description="Second system focus date (YYYY-MM-DD)"),
+    location: str = Query("wikidata:Q30", description="First system location entity ID"),
+    location2: Optional[str] = Query(None, description="Second system location entity ID (defaults to location)"),
+    granularity: str = Query("daily", description="Granularity: daily, weekly, monthly"),
+    window_size: int = Query(7, ge=0, description="Number of granularity periods before/after each focus date"),
+    db: AsyncSession = Depends(get_db_session)
+):
+    """
+    Batch sparkline lookup for multiple ngram terms across one or two systems.
+
+    Scan strategy:
+    - Temporal (date + date2 resolving to the same geo): both windows' paths are merged into ONE sparkline scan.
+    - Geographic or single system: one sparkline scan per system, since paths live in separate geo dirs.
+
+    Returns one entry per requested system ("system1"/"system2": date, location, sparkData, topArticles)
+    plus "duration" in ms. Raises HTTPException: 400 invalid input, 404 no datalake/data, 500 query failure.
+    """
+    if granularity not in ["daily", "weekly", "monthly"]:
+        raise HTTPException(status_code=400, detail="granularity must be one of: daily, weekly, monthly")
+
+    for d_str in [date, date2]:
+        if d_str:
+            try:
+                datetime.fromisoformat(d_str)
+            except ValueError as e:
+                raise HTTPException(status_code=400, detail=f"Invalid date format. Use YYYY-MM-DD: {e}")
+
+    terms = [t.strip() for t in types.split(",") if t.strip()]
+    if not terms:
+        raise HTTPException(status_code=400, detail="At least one term is required")
+
+    systems_input: Dict[str, Dict] = {}
+    if date:
+        systems_input["system1"] = {"date": date, "location": location}
+    if date2:
+        systems_input["system2"] = {"date": date2, "location": location2 or location}
+    if not systems_input:
+        raise HTTPException(status_code=400, detail="At least one of date or date2 must be provided")
+
+    query = select(Datalake).where(Datalake.dataset_id == "wikigrams")
+    result = await db.execute(query)
+    datalake = result.scalar_one_or_none()
+
+    if not datalake:
+        raise HTTPException(status_code=404, detail="Wikigrams datalake not found")
+
+    granularity_mapping = {
+        "daily": ("wikigrams", "date"),
+        "weekly": ("wikigrams_weekly", "week"),
+        "monthly": ("wikigrams_monthly", "month")
+    }
+    table_name, time_column = granularity_mapping[granularity]
+
+    has_top_articles = (
+        granularity == "daily"
+        and bool(datalake.data_schema and "top_articles" in datalake.data_schema)
+    )
+
+    try:
+        duckdb_client = get_duckdb_client()
+        conn = duckdb_client.connect()
+
+        if not datalake.tables_metadata:
+            raise HTTPException(status_code=500, detail="Datalake metadata is missing.")
+
+        if table_name not in datalake.tables_metadata:
+            available = [k for k in datalake.tables_metadata.keys() if k.startswith("wikigrams")]
+            raise HTTPException(status_code=400, detail=f"Table '{table_name}' not found. Available: {available}.")
+
+        t_paths = time.time()
+        wikigrams_path_all, adapter_path = get_parquet_paths(datalake, table_name)
+        t_paths_ms = (time.time() - t_paths) * 1000
+
+        # Build a prefix index once: parent_dir → [paths].
+        # Lookup is O(N_paths) to build, O(1) per partition — avoids the
+        # O(N_paths × N_partitions) linear scan of the old list comprehension.
+        path_prefix_index: Dict[str, List[str]] = {}
+        for p in wikigrams_path_all:
+            dir_path = p.rsplit("/", 1)[0]
+            path_prefix_index.setdefault(dir_path, []).append(p)
+
+        placeholders = ",".join("?" for _ in terms)
+        start_time = time.time()
+
+        # --- Resolve unique locations to local_geo (one adapter lookup per unique location) ---
+        unique_locations = {s["location"] for s in systems_input.values()}
+        geo_map: Dict[str, str] = {}
+        t_adapter = time.time()
+        for loc in unique_locations:
+            row = conn.execute(
+                "SELECT local_id FROM read_parquet(?) WHERE entity_id = ? LIMIT 1",
+                [adapter_path, loc]
+            ).fetchone()
+            if not row:
+                raise HTTPException(status_code=400, detail=f"Location '{loc}' not found in adapter")
+            geo_map[loc] = quote(row[0], safe='')
+        t_adapter_ms = (time.time() - t_adapter) * 1000
+
+        # --- Compute per-system window metadata ---
+        window_unit_days = {"daily": 1, "weekly": 7, "monthly": 30}[granularity]
+        effective_window = window_size * window_unit_days
+
+        per_system: Dict[str, Dict] = {}
+        t_filter = time.time()
+        for sys_key, system in systems_input.items():
+            loc = system["location"]
+            local_geo = geo_map[loc]
+            focus_date = datetime.fromisoformat(system["date"])
+            w_start = (focus_date - timedelta(days=effective_window)).strftime("%Y-%m-%d")
+            w_end = (focus_date + timedelta(days=effective_window)).strftime("%Y-%m-%d")
+            window_partitions = compute_partition_starts(w_start, w_end, granularity)
+            focus_partition = compute_partition_starts(system["date"], system["date"], granularity)[0]
+
+            # O(1) lookup per partition using prefix index
+            base = f"{datalake.data_location}/{table_name}/geo={local_geo}"
+            query_paths = []
+            for ps in window_partitions:
+                query_paths.extend(path_prefix_index.get(f"{base}/{time_column}={ps}", []))
+
+            if not query_paths:
+                raise HTTPException(status_code=404, detail=f"No data found for {sys_key} ({system['date']}, {loc})")
+
+            focus_paths = path_prefix_index.get(f"{base}/{time_column}={focus_partition}", [])
+
+            per_system[sys_key] = {
+                "loc": loc,
+                "focus_date_str": system["date"],
+                "window_partitions": window_partitions,
+                "window_set": set(window_partitions),
+                "focus_partition": focus_partition,
+                "query_paths": query_paths,
+                "focus_paths": focus_paths,
+            }
+        t_filter_ms = (time.time() - t_filter) * 1000
+
+        # --- Choose scan strategy ---
+        all_geos = {geo_map[s["location"]] for s in systems_input.values()}
+        temporal_comparison = len(systems_input) == 2 and len(all_geos) == 1
+        print(f"  setup: get_paths={t_paths_ms:.0f}ms, adapter={t_adapter_ms:.0f}ms, filter={t_filter_ms:.0f}ms | total_paths={len(wikigrams_path_all)}")
+
+        system_results: Dict[str, Dict] = {}
+
+        if temporal_comparison:
+            # TEMPORAL: same geo, different dates → merge paths, single DuckDB scan.
+            # top_articles is split into a separate query against focus-date files only
+            # so the large text column is not read from every sparkline file.
+            s1 = per_system["system1"]
+            s2 = per_system["system2"]
+            combined_paths = sorted(set(s1["query_paths"]) | set(s2["query_paths"]))
+            range_start = min(s1["window_partitions"][0], s2["window_partitions"][0])
+            range_end = max(s1["window_partitions"][-1], s2["window_partitions"][-1])
+
+            # 1. Sparklines query — no top_articles column (small columns only)
+            spark_sql = f"""
+                SELECT
+                    w.types,
+                    w.{time_column},
+                    MIN(w.rank) AS rank,
+                    SUM(w.counts) AS counts
+                FROM read_parquet(?) w
+                WHERE w.{time_column} BETWEEN ? AND ?
+                  AND w.types IN ({placeholders})
+                GROUP BY w.types, w.{time_column}
+                ORDER BY w.types, w.{time_column}
+            """
+            t_query = time.time()
+            cursor = conn.execute(spark_sql, [combined_paths, range_start, range_end] + terms)
+            t_spark_ms = (time.time() - t_query) * 1000
+
+            rows = cursor.fetchall()
+            cols = [desc[0] for desc in cursor.description]
+
+            # Initialize both systems
+            for sys_key, meta in per_system.items():
+                system_results[sys_key] = {
+                    "date": meta["focus_date_str"],
+                    "location": meta["loc"],
+                    "sparkData": {t: [] for t in terms},
+                    "topArticles": {},
+                }
+
+            # Split rows by window membership (a date can fall in both windows if they overlap)
+            for row in rows:
+                d = dict(zip(cols, row))
+                term = d["types"]
+                date_val = str(d[time_column])
+                point = {time_column: d[time_column], "rank": d["rank"], "counts": d["counts"]}
+                if date_val in s1["window_set"]:
+                    system_results["system1"]["sparkData"][term].append(point)
+                if date_val in s2["window_set"]:
+                    system_results["system2"]["sparkData"][term].append(point)
+
+            # 2. top_articles query — focus-date files only (1-2 files)
+            t_articles = time.time()
+            if has_top_articles:
+                focus_paths = sorted(set(s1["focus_paths"]) | set(s2["focus_paths"]))
+                if focus_paths:
+                    try:
+                        art_cursor = conn.execute(f"""
+                            SELECT
+                                w.types,
+                                ANY_VALUE(w.top_articles) FILTER (WHERE w.{time_column} = ?) AS top_articles_s1,
+                                ANY_VALUE(w.top_articles) FILTER (WHERE w.{time_column} = ?) AS top_articles_s2
+                            FROM read_parquet(?) w
+                            WHERE w.types IN ({placeholders})
+                            GROUP BY w.types
+                        """, [s1["focus_partition"], s2["focus_partition"], focus_paths] + terms)
+                        for row in art_cursor.fetchall():
+                            d = dict(zip([c[0] for c in art_cursor.description], row))
+                            if d.get("top_articles_s1") is not None:
+                                system_results["system1"]["topArticles"][d["types"]] = d["top_articles_s1"]
+                            if d.get("top_articles_s2") is not None:
+                                system_results["system2"]["topArticles"][d["types"]] = d["top_articles_s2"]
+                    except Exception as art_err:
+                        print(f"  top_articles query failed; sparklines still returned: {art_err}")  # best-effort
+            t_articles_ms = (time.time() - t_articles) * 1000
+
+            print(f"  temporal: {len(combined_paths)} paths, spark={t_spark_ms:.0f}ms, articles={t_articles_ms:.0f}ms ({len(s1['focus_paths'])+len(s2['focus_paths'])} focus files)")
+
+        else:
+            # GEOGRAPHIC (or single system): separate scan per system.
+            # Same split as temporal: sparklines without top_articles, then tiny articles query.
+            for sys_key, meta in per_system.items():
+                query_paths = meta["query_paths"]
+
+                # 1. Sparklines — small columns only
+                t_query = time.time()
+                cursor = conn.execute(f"""
+                    SELECT
+                        w.types,
+                        w.{time_column},
+                        MIN(w.rank) AS rank,
+                        SUM(w.counts) AS counts
+                    FROM read_parquet(?) w
+                    WHERE w.{time_column} BETWEEN ? AND ?
+                      AND w.types IN ({placeholders})
+                    GROUP BY w.types, w.{time_column}
+                    ORDER BY w.types, w.{time_column}
+                """, [query_paths, meta["window_partitions"][0], meta["window_partitions"][-1]] + terms)
+                t_query_ms = (time.time() - t_query) * 1000
+
+                rows = cursor.fetchall()
+                cols = [desc[0] for desc in cursor.description]
+
+                spark_data: Dict[str, List[Dict]] = {t: [] for t in terms}
+                for row in rows:
+                    d = dict(zip(cols, row))
+                    spark_data[d["types"]].append({
+                        time_column: d[time_column],
+                        "rank": d["rank"],
+                        "counts": d["counts"],
+                    })
+
+                # 2. top_articles — focus-date file(s) only
+                top_articles: Dict[str, Any] = {}
+                t_articles = time.time()
+                if has_top_articles and meta["focus_paths"]:
+                    try:
+                        art_cursor = conn.execute(f"""
+                            SELECT
+                                w.types,
+                                ANY_VALUE(w.top_articles) AS top_articles
+                            FROM read_parquet(?) w
+                            WHERE w.{time_column} = ?
+                              AND w.types IN ({placeholders})
+                            GROUP BY w.types
+                        """, [meta["focus_paths"], meta["focus_partition"]] + terms)
+                        for row in art_cursor.fetchall():
+                            d = dict(zip([c[0] for c in art_cursor.description], row))
+                            if d.get("top_articles") is not None:
+                                top_articles[d["types"]] = d["top_articles"]
+                    except Exception as art_err:
+                        print(f"  {sys_key}: top_articles query failed; sparklines still returned: {art_err}")  # best-effort
+                t_articles_ms = (time.time() - t_articles) * 1000
+
+                system_results[sys_key] = {
+                    "date": meta["focus_date_str"],
+                    "location": meta["loc"],
+                    "sparkData": spark_data,
+                    "topArticles": top_articles,
+                }
+                print(f"  {sys_key}: {len(query_paths)} paths, spark={t_query_ms:.0f}ms, articles={t_articles_ms:.0f}ms ({len(meta['focus_paths'])} focus files)")
+
+        duration = (time.time() - start_time) * 1000
+        print(f"searchTermsBatch total={duration:.2f}ms — {'temporal' if temporal_comparison else 'geographic'} for {len(terms)} terms × {len(systems_input)} systems")
+
+        return {**system_results, "duration": duration}
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Query execution failed: {str(e)}")
+
+
 @router.get("/{dataset_id}")
 async def get_datalake_info(
     dataset_id: str,
@@ -294,16 +592,22 @@ async def get_adapter_info(
                 detail="Datalake metadata is missing. Please re-register the datalake with proper tables_metadata."
             )
         
-        adapter_fnames = datalake.tables_metadata.get("adapter")
-
-        if not adapter_fnames:
-            raise HTTPException(
-                status_code=500,
-                detail="Missing babynames or adapter file paths. Required: tables_metadata.babynames and tables_metadata.adapter"
-            )
-
-        adapter_path = [
-            f"{datalake.data_location}/{datalake.ducklake_data_path}/main/adapter/{fname}" for fname in adapter_fnames
+        if datalake.data_format == "parquet_hive":
+            if not datalake.entity_mapping or not datalake.entity_mapping.get("path"):
+                raise HTTPException(
+                    status_code=500,
+                    detail="Missing entity_mapping.path for parquet_hive format."
+                )
+            adapter_path = [datalake.entity_mapping["path"]]
+        else:
+            adapter_fnames = datalake.tables_metadata.get("adapter")
+            if not adapter_fnames:
+                raise HTTPException(
+                    status_code=500,
+                    detail="Missing adapter file paths. Required: tables_metadata.adapter"
+                )
+            adapter_path = [
+                f"{datalake.data_location}/{datalake.ducklake_data_path}/main/adapter/{fname}" for fname in adapter_fnames
             ]
 
         # Execute comparative queries
@@ -626,15 +930,13 @@ async def get_wikigrams_top_ngrams(
                         w.types,
                         SUM(w.counts) as counts
                     FROM read_parquet(?) w
-                    LEFT JOIN read_parquet(?) a ON w.geo = a.local_id
                     WHERE w.{time_column} BETWEEN ? AND ?
-                      AND a.entity_id = ?
                     GROUP BY w.types
                     ORDER BY counts DESC
                     LIMIT ?
                 """
 
-                params = [filtered_wikigrams_path, adapter_path, date_range[0], date_range[1], location, limit]
+                params = [filtered_wikigrams_path, date_range[0], date_range[1], limit]
 
                 cursor = conn.execute(sql_query, params)
                 query_results = cursor.fetchall()

From ad6c0be517bc7400aebd98293d6a1fb458ec8057 Mon Sep 17 00:00:00 2001
From: Jonathan St-Onge <jstonge1@uvm.edu>
Date: Wed, 4 Mar 2026 15:15:31 -0500
Subject: [PATCH 2/4] allotax endpoint

---
 backend/app/main.py                  |   3 +-
 backend/app/routers/storywrangler.py | 183 +++++++++++++++++++++++++++
 2 files changed, 185 insertions(+), 1 deletion(-)
 create mode 100644 backend/app/routers/storywrangler.py

diff --git a/backend/app/main.py b/backend/app/main.py
index afe6c2b..c3dd4b1 100644
--- a/backend/app/main.py
+++ b/backend/app/main.py
@@ -13,7 +13,7 @@
 limiter = Limiter(key_func=get_remote_address)
 
 # Import routers after limiter is defined so they can use it
-from .routers import open_academic_analytics, datasets, auth, wikimedia, annotations, dark_data_survey, scisciDB, datalakes, interdisciplinarity
+from .routers import open_academic_analytics, datasets, auth, wikimedia, annotations, dark_data_survey, scisciDB, datalakes, interdisciplinarity, storywrangler
 
 app = FastAPI(
     title=settings.app_name,
@@ -89,6 +89,7 @@ async def shutdown_event():
 app.include_router(scisciDB.router, prefix="/scisciDB", tags=["scisciDB"])
 app.include_router(dark_data_survey.router, prefix="", tags=["dark-data-survey"])
 app.include_router(interdisciplinarity.router, prefix="", tags=["interdisciplinarity"])
+app.include_router(storywrangler.router, prefix="/storywrangler", tags=["storywrangler"])
 app.include_router(admin.router, prefix="/admin", tags=["admin"], include_in_schema=False)
 
 # Admin endpoints (secured with admin authentication)
diff --git a/backend/app/routers/storywrangler.py b/backend/app/routers/storywrangler.py
new file mode 100644
index 0000000..1ededc7
--- /dev/null
+++ b/backend/app/routers/storywrangler.py
@@ -0,0 +1,183 @@
+"""
+Storywrangler API — instruments from the Vermont Complex Systems Institute.
+
+Currently includes:
+  - Allotaxonometer: rank-turbulence divergence between two ngram distributions.
+"""
+
+from fastapi import APIRouter, HTTPException, Depends, Query
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy import select
+from typing import List, Optional
+from urllib.parse import quote
+
+from ..core.database import get_db_session
+from ..core.duckdb_client import get_duckdb_client
+from ..models.datalakes import Datalake
+from .datalakes import get_parquet_paths, compute_partition_starts
+
+router = APIRouter()
+
+
+def _load_ngrams(conn, datalake, table_name: str, time_column: str,
+                 date_range: List[str], location: str, limit: int) -> dict:
+    """Load the top `limit` ngrams for one (date_range, location) system from a datalake.
+
+    Returns {"types": [...], "counts": [...]} ordered by descending summed counts, ready for allotax input.
+    Raises HTTPException(400) when the location is not in the adapter or no partition file matches.
+    """
+    ngrams_path, adapter_path = get_parquet_paths(datalake, table_name)
+
+    # Resolve entity_id → local_id via adapter
+    adapter_row = conn.execute(
+        "SELECT local_id FROM read_parquet(?) WHERE entity_id = ? LIMIT 1",
+        [adapter_path, location]
+    ).fetchone()
+    if not adapter_row:
+        raise HTTPException(status_code=400, detail=f"Location '{location}' not found in adapter")
+    local_geo = quote(adapter_row[0], safe='')
+
+    # Time column → granularity; an unrecognised column falls back to "daily".
+    granularity = {"date": "daily", "week": "weekly", "month": "monthly"}.get(time_column, "daily")
+    partition_starts = compute_partition_starts(date_range[0], date_range[1], granularity)
+
+    # Match whole "/"-separated path components: a substring test would let geo=Q30 also match geo=Q300.
+    geo_segment = f"geo={local_geo}"
+    wanted_segments = {f"{time_column}={ps}" for ps in partition_starts}
+    filtered_path = []
+    for p in ngrams_path:
+        segments = p.split("/")
+        if geo_segment in segments and not wanted_segments.isdisjoint(segments):
+            filtered_path.append(p)
+
+    if not filtered_path:
+        raise HTTPException(
+            status_code=400,
+            detail=f"No data found for {date_range[0]} to {date_range[1]} / location {location}"
+        )
+
+    sql = f"""
+        SELECT w.types, SUM(w.counts) AS counts
+        FROM read_parquet(?) w
+        LEFT JOIN read_parquet(?) a ON w.geo = a.local_id
+        WHERE w.{time_column} BETWEEN ? AND ?
+          AND a.entity_id = ?
+        GROUP BY w.types
+        ORDER BY counts DESC
+        LIMIT ?
+    """
+    rows = conn.execute(sql, [filtered_path, adapter_path,
+                               date_range[0], date_range[1], location, limit]).fetchall()
+
+    types = [r[0] for r in rows]
+    counts = [float(r[1]) for r in rows]
+    return {"types": types, "counts": counts}
+
+
+@router.get("/allotax")
+async def allotax_endpoint(
+    # System 1
+    dates: str = Query(..., description="Date range for system 1, e.g. '2024-01-01,2024-01-31'"),
+    location: str = Query(..., description="Location entity ID for system 1, e.g. 'wikidata:Q30'"),
+    # System 2
+    dates2: str = Query(..., description="Date range for system 2, e.g. '2024-02-01,2024-02-28'"),
+    location2: str = Query(..., description="Location entity ID for system 2, e.g. 'wikidata:Q16'"),
+    # Dataset
+    dataset: str = Query("wikigrams", description="Datalake dataset_id to query"),
+    granularity: str = Query("daily", description="Partition granularity: daily, weekly, monthly"),
+    # Allotax params
+    alpha: float = Query(1.0, description="RTD alpha parameter"),
+    alphas: Optional[str] = Query(None, description="Comma-separated alphas for multi-alpha mode, e.g. '0.5,1.0,2.0,3.0'"),
+    # Response shaping
+    ngram_limit: int = Query(10000, ge=1, description="Max ngrams to load per system before computing"),
+    wordshift_limit: int = Query(200, ge=0, description="Truncate wordshift output to top N entries"),
+    db: AsyncSession = Depends(get_db_session),
+):
+    """Compute allotaxonometer (rank-turbulence divergence) between two ngram distributions.
+
+    Loads raw ngrams server-side from the specified datalake, runs the full allotax
+    pipeline in Rust via PyO3, and returns lean visualization data (~30-50KB).
+
+    Response shape (single alpha):
+      normalization, diamond_counts, max_delta_loss, wordshift, balance, alpha, meta
+    Response shape (multi-alpha via `alphas`):
+      balance, alpha_results[{alpha, normalization, diamond_counts, max_delta_loss, wordshift}], meta
+
+    Alpha slider pattern — precompute several alphas in one call (/allotax?alphas=0.33,0.5,1.0,2.0,3.0&...),
+    then use adaptMultiAlphaResult() from allotaxonometer-ui to drive the slider client-side.
+
+    Raises HTTPException: 400 invalid input or no data, 404 unknown dataset, 503 allotax not installed, 500 load/compute failure.
+    """
+    granularity_map = {
+        "daily": ("wikigrams", "date"),
+        "weekly": ("wikigrams_weekly", "week"),
+        "monthly": ("wikigrams_monthly", "month"),
+    }
+    if granularity not in granularity_map:
+        raise HTTPException(status_code=400, detail="granularity must be daily, weekly, or monthly")
+    table_name, time_column = granularity_map[granularity]
+
+    def parse_range(s: str) -> List[str]:
+        parts = s.split(",")
+        return [parts[0], parts[0]] if len(parts) == 1 else [parts[0], parts[1]]
+
+    dr1, dr2 = parse_range(dates), parse_range(dates2)
+
+    # Validate `alphas` before any I/O so a malformed list is a 400, not a 500 from the compute step below.
+    try:
+        alpha_list = [float(a) for a in alphas.split(",")] if alphas else []
+    except ValueError:
+        raise HTTPException(status_code=400, detail=f"alphas must be comma-separated numbers, got '{alphas}'")
+
+    result = await db.execute(select(Datalake).where(Datalake.dataset_id == dataset))
+    datalake = result.scalar_one_or_none()
+    if not datalake:
+        raise HTTPException(status_code=404, detail=f"Datalake '{dataset}' not found")
+
+    if not datalake.tables_metadata or table_name not in datalake.tables_metadata:
+        available = list(datalake.tables_metadata or {})
+        raise HTTPException(status_code=400, detail=f"Table '{table_name}' not available. Found: {available}")
+
+    try:
+        import allotax
+    except ImportError:
+        raise HTTPException(
+            status_code=503,
+            detail="allotax module not available. Install via: cd allotaxonometer-core/crates/allotax-py && maturin develop --release"
+        )
+
+    try:
+        duckdb_client = get_duckdb_client()
+        conn = duckdb_client.connect()
+
+        sys1 = _load_ngrams(conn, datalake, table_name, time_column, dr1, location, ngram_limit)
+        sys2 = _load_ngrams(conn, datalake, table_name, time_column, dr2, location2, ngram_limit)
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Data loading failed: {str(e)}")
+
+    # Compute allotax — package returns lean display data natively
+    try:
+        if alphas:
+            alpha_list = [float(a) for a in alphas.split(",")]
+            result_data = allotax.compute_allotax_multi_alpha(sys1, sys2, alpha_list)
+            for slice_ in result_data.get("alpha_results", []):
+                slice_["wordshift"] = slice_["wordshift"][:wordshift_limit]
+        else:
+            result_data = allotax.compute_allotax(sys1, sys2, alpha)
+            result_data["wordshift"] = result_data["wordshift"][:wordshift_limit]
+
+        return {
+            **result_data,
+            "meta": {
+                "system1": {"dates": dates, "location": location, "ngrams": len(sys1["types"])},
+                "system2": {"dates": dates2, "location": location2, "ngrams": len(sys2["types"])},
+                "dataset": dataset,
+                "granularity": granularity,
+            }
+        }
+
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Allotax computation failed: {str(e)}")

From 7e7fd639846243785e325c0ab68c203f2eebf6be Mon Sep 17 00:00:00 2001
From: Jonathan St-Onge <jstonge1@uvm.edu>
Date: Thu, 5 Mar 2026 07:09:22 -0500
Subject: [PATCH 3/4] bug in weekly granularity

---
 backend/app/routers/storywrangler.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/backend/app/routers/storywrangler.py b/backend/app/routers/storywrangler.py
index 1ededc7..f574611 100644
--- a/backend/app/routers/storywrangler.py
+++ b/backend/app/routers/storywrangler.py
@@ -56,6 +56,8 @@ def _load_ngrams(conn, datalake, table_name: str, time_column: str,
             detail=f"No data found for {date_range[0]} to {date_range[1]} / location {location}"
         )
 
+    # Use snapped partition boundaries in SQL — raw input dates won't match stored
+    # week/month column values (e.g. input "2024-11-07" vs stored "2024-11-04").
     sql = f"""
         SELECT w.types, SUM(w.counts) AS counts
         FROM read_parquet(?) w
@@ -67,7 +69,7 @@ def _load_ngrams(conn, datalake, table_name: str, time_column: str,
         LIMIT ?
     """
     rows = conn.execute(sql, [filtered_path, adapter_path,
-                               date_range[0], date_range[1], location, limit]).fetchall()
+                               partition_starts[0], partition_starts[-1], location, limit]).fetchall()
 
     types = [r[0] for r in rows]
     counts = [float(r[1]) for r in rows]

From 66af3a68d23d3c53d3b7b9594fc0498fc5165ebb Mon Sep 17 00:00:00 2001
From: Jonathan St-Onge <jstonge1@uvm.edu>
Date: Thu, 5 Mar 2026 13:10:56 -0500
Subject: [PATCH 4/4] move wordshift truncation into allotax core

---
 backend/app/routers/storywrangler.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/backend/app/routers/storywrangler.py b/backend/app/routers/storywrangler.py
index f574611..fd6360a 100644
--- a/backend/app/routers/storywrangler.py
+++ b/backend/app/routers/storywrangler.py
@@ -164,12 +164,10 @@ def parse_range(s: str) -> List[str]:
     try:
         if alphas:
             alpha_list = [float(a) for a in alphas.split(",")]
-            result_data = allotax.compute_allotax_multi_alpha(sys1, sys2, alpha_list)
-            for slice_ in result_data.get("alpha_results", []):
-                slice_["wordshift"] = slice_["wordshift"][:wordshift_limit]
+            result_data = allotax.compute_allotax_multi_alpha(sys1, sys2, alpha_list, wordshift_limit)
         else:
-            result_data = allotax.compute_allotax(sys1, sys2, alpha)
-            result_data["wordshift"] = result_data["wordshift"][:wordshift_limit]
+            result_data = allotax.compute_allotax(sys1, sys2, alpha, wordshift_limit)
+
 
         return {
             **result_data,