From 508460a427b7b5691a902a8e0ef71ed542de3a4f Mon Sep 17 00:00:00 2001 From: Jonathan St-Onge Date: Wed, 4 Mar 2026 09:51:30 -0500 Subject: [PATCH 1/4] typos --- backend/app/routers/datalakes.py | 328 +++++++++++++++++++++++++++++-- 1 file changed, 315 insertions(+), 13 deletions(-) diff --git a/backend/app/routers/datalakes.py b/backend/app/routers/datalakes.py index bbd812a..1371c5a 100644 --- a/backend/app/routers/datalakes.py +++ b/backend/app/routers/datalakes.py @@ -229,6 +229,304 @@ async def register_datalake( } } +@router.get("/search-terms2") +async def search_terms_batch( + types: str = Query(..., description="Comma-separated list of ngram terms"), + date: Optional[str] = Query(None, description="First system focus date (YYYY-MM-DD)"), + date2: Optional[str] = Query(None, description="Second system focus date (YYYY-MM-DD)"), + location: str = Query("wikidata:Q30", description="First system location entity ID"), + location2: Optional[str] = Query(None, description="Second system location entity ID (defaults to location)"), + granularity: str = Query("daily", description="Granularity: daily, weekly, monthly"), + window_size: int = Query(7, description="Number of granularity periods before/after each focus date"), + db: AsyncSession = Depends(get_db_session) +): + """ + Batch sparkline lookup for multiple ngram terms across one or two systems. + + Two comparison modes: + - Temporal (date + date2, same location): ONE sparkline scan — both windows' paths merged. + - Geographic (date + date2, location + location2): one sparkline scan PER SYSTEM — paths live in separate geo dirs; location2 is ignored unless date2 is set. + + Results are keyed as system1/system2 so the frontend can render both sides + without coordinating parallel calls. 
+ """ + if granularity not in ["daily", "weekly", "monthly"]: + raise HTTPException(status_code=400, detail="granularity must be one of: daily, weekly, monthly") + + for d_str in [date, date2]: + if d_str: + try: + datetime.fromisoformat(d_str) + except ValueError as e: + raise HTTPException(status_code=400, detail=f"Invalid date format. Use YYYY-MM-DD: {e}") + + terms = [t.strip() for t in types.split(",") if t.strip()] + if not terms: + raise HTTPException(status_code=400, detail="At least one term is required") + + systems_input: Dict[str, Dict] = {} + if date: + systems_input["system1"] = {"date": date, "location": location} + if date2: + systems_input["system2"] = {"date": date2, "location": location2 or location} + if not systems_input: + raise HTTPException(status_code=400, detail="At least one of date or date2 must be provided") + + query = select(Datalake).where(Datalake.dataset_id == "wikigrams") + result = await db.execute(query) + datalake = result.scalar_one_or_none() + + if not datalake: + raise HTTPException(status_code=404, detail="Wikigrams datalake not found") + + granularity_mapping = { + "daily": ("wikigrams", "date"), + "weekly": ("wikigrams_weekly", "week"), + "monthly": ("wikigrams_monthly", "month") + } + table_name, time_column = granularity_mapping[granularity] + + has_top_articles = ( + granularity == "daily" + and bool(datalake.data_schema and "top_articles" in datalake.data_schema) + ) + + try: + duckdb_client = get_duckdb_client() + conn = duckdb_client.connect() + + if not datalake.tables_metadata: + raise HTTPException(status_code=500, detail="Datalake metadata is missing.") + + if table_name not in datalake.tables_metadata: + available = [k for k in datalake.tables_metadata.keys() if k.startswith("wikigrams")] + raise HTTPException(status_code=400, detail=f"Table '{table_name}' not found. 
Available: {available}.") + + t_paths = time.time() + wikigrams_path_all, adapter_path = get_parquet_paths(datalake, table_name) + t_paths_ms = (time.time() - t_paths) * 1000 + + # Build a prefix index once: parent_dir → [paths]. + # The index is O(N_paths) to build and O(1) to query per partition — avoids the + # O(N_paths × N_partitions) linear scan of the old list comprehension. + path_prefix_index: Dict[str, List[str]] = {} + for p in wikigrams_path_all: + dir_path = p.rsplit("/", 1)[0] + path_prefix_index.setdefault(dir_path, []).append(p) + + placeholders = ",".join(["?" for _ in terms]) + start_time = time.time() + + # --- Resolve unique locations to local_geo (one adapter lookup per unique location) --- + unique_locations = {s["location"] for s in systems_input.values()} + geo_map: Dict[str, str] = {} + t_adapter = time.time() + for loc in unique_locations: + row = conn.execute( + "SELECT local_id FROM read_parquet(?) WHERE entity_id = ? LIMIT 1", + [adapter_path, loc] + ).fetchone() + if not row: + raise HTTPException(status_code=400, detail=f"Location '{loc}' not found in adapter") + geo_map[loc] = quote(row[0], safe='') + t_adapter_ms = (time.time() - t_adapter) * 1000 + + # --- Compute per-system window metadata --- + window_unit_days = {"daily": 1, "weekly": 7, "monthly": 30}[granularity] + effective_window = window_size * window_unit_days + + per_system: Dict[str, Dict] = {} + t_filter = time.time() + for sys_key, system in systems_input.items(): + loc = system["location"] + local_geo = geo_map[loc] + focus_date = datetime.fromisoformat(system["date"]) + w_start = (focus_date - timedelta(days=effective_window)).strftime("%Y-%m-%d") + w_end = (focus_date + timedelta(days=effective_window)).strftime("%Y-%m-%d") + window_partitions = compute_partition_starts(w_start, w_end, granularity) + focus_partition = compute_partition_starts(system["date"], system["date"], granularity)[0] + + # O(1) lookup per partition using prefix index + base = 
f"{datalake.data_location}/{table_name}/geo={local_geo}" + query_paths = [] + for ps in window_partitions: + query_paths.extend(path_prefix_index.get(f"{base}/{time_column}={ps}", [])) + + if not query_paths: + raise HTTPException(status_code=404, detail=f"No data found for {sys_key} ({system['date']}, {loc})") + + focus_paths = path_prefix_index.get(f"{base}/{time_column}={focus_partition}", []) + + per_system[sys_key] = { + "loc": loc, + "focus_date_str": system["date"], + "window_partitions": window_partitions, + "window_set": set(window_partitions), + "focus_partition": focus_partition, + "query_paths": query_paths, + "focus_paths": focus_paths, + } + t_filter_ms = (time.time() - t_filter) * 1000 + + # --- Choose scan strategy --- + all_geos = {geo_map[s["location"]] for s in systems_input.values()} + temporal_comparison = len(systems_input) == 2 and len(all_geos) == 1 + print(f" setup: get_paths={t_paths_ms:.0f}ms, adapter={t_adapter_ms:.0f}ms, filter={t_filter_ms:.0f}ms | total_paths={len(wikigrams_path_all)}") + + system_results: Dict[str, Dict] = {} + + if temporal_comparison: + # TEMPORAL: same geo, different dates → merge paths, single DuckDB scan. + # top_articles is split into a separate query against focus-date files only + # so the large text column is not read from all 111 sparkline files. + s1 = per_system["system1"] + s2 = per_system["system2"] + combined_paths = sorted(set(s1["query_paths"]) | set(s2["query_paths"])) + range_start = min(s1["window_partitions"][0], s2["window_partitions"][0]) + range_end = max(s1["window_partitions"][-1], s2["window_partitions"][-1]) + + # 1. Sparklines query — no top_articles column (small columns only) + spark_sql = f""" + SELECT + w.types, + w.{time_column}, + MIN(w.rank) AS rank, + SUM(w.counts) AS counts + FROM read_parquet(?) w + WHERE w.{time_column} BETWEEN ? AND ? 
+ AND w.types IN ({placeholders}) + GROUP BY w.types, w.{time_column} + ORDER BY w.types, w.{time_column} + """ + t_query = time.time() + cursor = conn.execute(spark_sql, [combined_paths, range_start, range_end] + terms) + t_spark_ms = (time.time() - t_query) * 1000 + + rows = cursor.fetchall() + cols = [desc[0] for desc in cursor.description] + + # Initialize both systems + for sys_key, meta in per_system.items(): + system_results[sys_key] = { + "date": meta["focus_date_str"], + "location": meta["loc"], + "sparkData": {t: [] for t in terms}, + "topArticles": {}, + } + + # Split rows by window membership (a date can fall in both windows if they overlap) + for row in rows: + d = dict(zip(cols, row)) + term = d["types"] + date_val = str(d[time_column]) + point = {time_column: d[time_column], "rank": d["rank"], "counts": d["counts"]} + if date_val in s1["window_set"]: + system_results["system1"]["sparkData"][term].append(point) + if date_val in s2["window_set"]: + system_results["system2"]["sparkData"][term].append(point) + + # 2. top_articles query — focus-date files only (1-2 files) + t_articles = time.time() + if has_top_articles: + focus_paths = sorted(set(s1["focus_paths"]) | set(s2["focus_paths"])) + if focus_paths: + try: + art_cursor = conn.execute(f""" + SELECT + w.types, + ANY_VALUE(w.top_articles) FILTER (WHERE w.{time_column} = ?) AS top_articles_s1, + ANY_VALUE(w.top_articles) FILTER (WHERE w.{time_column} = ?) AS top_articles_s2 + FROM read_parquet(?) 
w + WHERE w.types IN ({placeholders}) + GROUP BY w.types + """, [s1["focus_partition"], s2["focus_partition"], focus_paths] + terms) + for row in art_cursor.fetchall(): + d = dict(zip([c[0] for c in art_cursor.description], row)) + if d.get("top_articles_s1") is not None: + system_results["system1"]["topArticles"][d["types"]] = d["top_articles_s1"] + if d.get("top_articles_s2") is not None: + system_results["system2"]["topArticles"][d["types"]] = d["top_articles_s2"] + except Exception: + pass # top_articles unavailable; sparklines already populated + t_articles_ms = (time.time() - t_articles) * 1000 + + print(f" temporal: {len(combined_paths)} paths, spark={t_spark_ms:.0f}ms, articles={t_articles_ms:.0f}ms ({len(s1['focus_paths'])+len(s2['focus_paths'])} focus files)") + + else: + # GEOGRAPHIC (or single system): separate scan per system. + # Same split as temporal: sparklines without top_articles, then tiny articles query. + for sys_key, meta in per_system.items(): + query_paths = meta["query_paths"] + + # 1. Sparklines — small columns only + t_query = time.time() + cursor = conn.execute(f""" + SELECT + w.types, + w.{time_column}, + MIN(w.rank) AS rank, + SUM(w.counts) AS counts + FROM read_parquet(?) w + WHERE w.{time_column} BETWEEN ? AND ? + AND w.types IN ({placeholders}) + GROUP BY w.types, w.{time_column} + ORDER BY w.types, w.{time_column} + """, [query_paths, meta["window_partitions"][0], meta["window_partitions"][-1]] + terms) + t_query_ms = (time.time() - t_query) * 1000 + + rows = cursor.fetchall() + cols = [desc[0] for desc in cursor.description] + + spark_data: Dict[str, List[Dict]] = {t: [] for t in terms} + for row in rows: + d = dict(zip(cols, row)) + spark_data[d["types"]].append({ + time_column: d[time_column], + "rank": d["rank"], + "counts": d["counts"], + }) + + # 2. 
top_articles — focus-date file(s) only + top_articles: Dict[str, Any] = {} + t_articles = time.time() + if has_top_articles and meta["focus_paths"]: + try: + art_cursor = conn.execute(f""" + SELECT + w.types, + ANY_VALUE(w.top_articles) AS top_articles + FROM read_parquet(?) w + WHERE w.{time_column} = ? + AND w.types IN ({placeholders}) + GROUP BY w.types + """, [meta["focus_paths"], meta["focus_partition"]] + terms) + for row in art_cursor.fetchall(): + d = dict(zip([c[0] for c in art_cursor.description], row)) + if d.get("top_articles") is not None: + top_articles[d["types"]] = d["top_articles"] + except Exception: + pass # top_articles unavailable; sparklines already populated + t_articles_ms = (time.time() - t_articles) * 1000 + + system_results[sys_key] = { + "date": meta["focus_date_str"], + "location": meta["loc"], + "sparkData": spark_data, + "topArticles": top_articles, + } + print(f" {sys_key}: {len(query_paths)} paths, spark={t_query_ms:.0f}ms, articles={t_articles_ms:.0f}ms ({len(meta['focus_paths'])} focus files)") + + duration = (time.time() - start_time) * 1000 + print(f"searchTermsBatch total={duration:.2f}ms — {'temporal' if temporal_comparison else 'geographic'} for {len(terms)} terms × {len(systems_input)} systems") + + return {**system_results, "duration": duration} + + except HTTPException: + raise + except Exception as e: + raise HTTPException(status_code=500, detail=f"Query execution failed: {str(e)}") + + @router.get("/{dataset_id}") async def get_datalake_info( dataset_id: str, @@ -294,16 +592,22 @@ async def get_adapter_info( detail="Datalake metadata is missing. Please re-register the datalake with proper tables_metadata." ) - adapter_fnames = datalake.tables_metadata.get("adapter") - - if not adapter_fnames: - raise HTTPException( - status_code=500, - detail="Missing babynames or adapter file paths. 
Required: tables_metadata.babynames and tables_metadata.adapter" - ) - - adapter_path = [ - f"{datalake.data_location}/{datalake.ducklake_data_path}/main/adapter/{fname}" for fname in adapter_fnames + if datalake.data_format == "parquet_hive": + if not datalake.entity_mapping or not datalake.entity_mapping.get("path"): + raise HTTPException( + status_code=500, + detail="Missing entity_mapping.path for parquet_hive format." + ) + adapter_path = [datalake.entity_mapping["path"]] + else: + adapter_fnames = datalake.tables_metadata.get("adapter") + if not adapter_fnames: + raise HTTPException( + status_code=500, + detail="Missing adapter file paths. Required: tables_metadata.adapter" + ) + adapter_path = [ + f"{datalake.data_location}/{datalake.ducklake_data_path}/main/adapter/{fname}" for fname in adapter_fnames ] # Execute comparative queries @@ -626,15 +930,13 @@ async def get_wikigrams_top_ngrams( w.types, SUM(w.counts) as counts FROM read_parquet(?) w - LEFT JOIN read_parquet(?) a ON w.geo = a.local_id WHERE w.{time_column} BETWEEN ? AND ? - AND a.entity_id = ? GROUP BY w.types ORDER BY counts DESC LIMIT ? 
""" - params = [filtered_wikigrams_path, adapter_path, date_range[0], date_range[1], location, limit] + params = [filtered_wikigrams_path, date_range[0], date_range[1], limit] cursor = conn.execute(sql_query, params) query_results = cursor.fetchall() From ad6c0be517bc7400aebd98293d6a1fb458ec8057 Mon Sep 17 00:00:00 2001 From: Jonathan St-Onge Date: Wed, 4 Mar 2026 15:15:31 -0500 Subject: [PATCH 2/4] allotax endpoint --- backend/app/main.py | 3 +- backend/app/routers/storywrangler.py | 183 +++++++++++++++++++++++++++ 2 files changed, 185 insertions(+), 1 deletion(-) create mode 100644 backend/app/routers/storywrangler.py diff --git a/backend/app/main.py b/backend/app/main.py index afe6c2b..c3dd4b1 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -13,7 +13,7 @@ limiter = Limiter(key_func=get_remote_address) # Import routers after limiter is defined so they can use it -from .routers import open_academic_analytics, datasets, auth, wikimedia, annotations, dark_data_survey, scisciDB, datalakes, interdisciplinarity +from .routers import open_academic_analytics, datasets, auth, wikimedia, annotations, dark_data_survey, scisciDB, datalakes, interdisciplinarity, storywrangler app = FastAPI( title=settings.app_name, @@ -89,6 +89,7 @@ async def shutdown_event(): app.include_router(scisciDB.router, prefix="/scisciDB", tags=["scisciDB"]) app.include_router(dark_data_survey.router, prefix="", tags=["dark-data-survey"]) app.include_router(interdisciplinarity.router, prefix="", tags=["interdisciplinarity"]) +app.include_router(storywrangler.router, prefix="/storywrangler", tags=["storywrangler"]) app.include_router(admin.router, prefix="/admin", tags=["admin"], include_in_schema=False) # Admin endpoints (secured with admin authentication) diff --git a/backend/app/routers/storywrangler.py b/backend/app/routers/storywrangler.py new file mode 100644 index 0000000..1ededc7 --- /dev/null +++ b/backend/app/routers/storywrangler.py @@ -0,0 +1,183 @@ +""" +Storywrangler API — 
instruments from the Vermont Complex Systems Institute. + +Currently includes: + - Allotaxonometer: rank-turbulence divergence between two ngram distributions. +""" + +from fastapi import APIRouter, HTTPException, Depends, Query +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy import select +from typing import List, Optional +from urllib.parse import quote + +from ..core.database import get_db_session +from ..core.duckdb_client import get_duckdb_client +from ..models.datalakes import Datalake +from .datalakes import get_parquet_paths, compute_partition_starts + +router = APIRouter() + + +def _load_ngrams(conn, datalake, table_name: str, time_column: str, + date_range: List[str], location: str, limit: int) -> dict: + """Load top ngrams for one (date_range, location) system from a datalake. + + Returns a dict with "types" and "counts" lists ready for allotax input. + """ + ngrams_path, adapter_path = get_parquet_paths(datalake, table_name) + + # Resolve entity_id → local_id via adapter + adapter_row = conn.execute( + "SELECT local_id FROM read_parquet(?) WHERE entity_id = ? 
LIMIT 1", + [adapter_path, location] + ).fetchone() + if not adapter_row: + raise HTTPException(status_code=400, detail=f"Location '{location}' not found in adapter") + local_geo = quote(adapter_row[0], safe='') + + # Filter to relevant partition directories + granularity = {v[1]: k for k, v in { + "daily": ("wikigrams", "date"), + "weekly": ("wikigrams_weekly", "week"), + "monthly": ("wikigrams_monthly", "month"), + }.items()}.get(time_column, "daily") + partition_starts = compute_partition_starts(date_range[0], date_range[1], granularity) + + filtered_path = [ + p for p in ngrams_path + if any(f"{time_column}={ps}" in p for ps in partition_starts) + and f"geo={local_geo}" in p + ] + + if not filtered_path: + raise HTTPException( + status_code=400, + detail=f"No data found for {date_range[0]} to {date_range[1]} / location {location}" + ) + + sql = f""" + SELECT w.types, SUM(w.counts) AS counts + FROM read_parquet(?) w + LEFT JOIN read_parquet(?) a ON w.geo = a.local_id + WHERE w.{time_column} BETWEEN ? AND ? + AND a.entity_id = ? + GROUP BY w.types + ORDER BY counts DESC + LIMIT ? + """ + rows = conn.execute(sql, [filtered_path, adapter_path, + date_range[0], date_range[1], location, limit]).fetchall() + + types = [r[0] for r in rows] + counts = [float(r[1]) for r in rows] + return {"types": types, "counts": counts} + + +@router.get("/allotax") +async def allotax_endpoint( + # System 1 + dates: str = Query(..., description="Date range for system 1, e.g. '2024-01-01,2024-01-31'"), + location: str = Query(..., description="Location entity ID for system 1, e.g. 'wikidata:Q30'"), + # System 2 + dates2: str = Query(..., description="Date range for system 2, e.g. '2024-02-01,2024-02-28'"), + location2: str = Query(..., description="Location entity ID for system 2, e.g. 
'wikidata:Q16'"), + # Dataset + dataset: str = Query("wikigrams", description="Datalake dataset_id to query"), + granularity: str = Query("daily", description="Partition granularity: daily, weekly, monthly"), + # Allotax params + alpha: float = Query(1.0, description="RTD alpha parameter"), + alphas: Optional[str] = Query(None, description="Comma-separated alphas for multi-alpha mode, e.g. '0.5,1.0,2.0,3.0'"), + # Response shaping + ngram_limit: int = Query(10000, description="Max ngrams to load per system before computing"), + wordshift_limit: int = Query(200, description="Truncate wordshift output to top N entries"), + db: AsyncSession = Depends(get_db_session), +): + """Compute allotaxonometer (rank-turbulence divergence) between two ngram distributions. + + Loads raw ngrams server-side from the specified datalake, runs the full allotax + pipeline in Rust via PyO3, and returns lean visualization data (~30-50KB). + + Response shape (single alpha): + normalization, diamond_counts, max_delta_loss, wordshift, balance, alpha, meta + + Response shape (multi-alpha via `alphas`): + balance, alpha_results[{alpha, normalization, diamond_counts, max_delta_loss, wordshift}], meta + + Alpha slider pattern — precompute a discrete set of alphas in one call: + /allotax?alphas=0.33,0.5,1.0,2.0,3.0&... + Then use adaptMultiAlphaResult() from allotaxonometer-ui to drive the slider client-side. 
+ """ + if granularity not in ("daily", "weekly", "monthly"): + raise HTTPException(status_code=400, detail="granularity must be daily, weekly, or monthly") + + granularity_map = { + "daily": ("wikigrams", "date"), + "weekly": ("wikigrams_weekly", "week"), + "monthly": ("wikigrams_monthly", "month"), + } + table_name, time_column = granularity_map[granularity] + + def parse_range(s: str) -> List[str]: + parts = s.split(",") + return [parts[0], parts[0]] if len(parts) == 1 else [parts[0], parts[1]] + + dr1 = parse_range(dates) + dr2 = parse_range(dates2) + + # Look up datalake + result = await db.execute(select(Datalake).where(Datalake.dataset_id == dataset)) + datalake = result.scalar_one_or_none() + if not datalake: + raise HTTPException(status_code=404, detail=f"Datalake '{dataset}' not found") + + if not datalake.tables_metadata or table_name not in datalake.tables_metadata: + available = list(datalake.tables_metadata.keys()) if datalake.tables_metadata else [] + raise HTTPException( + status_code=400, + detail=f"Table '{table_name}' not available. Found: {available}" + ) + + try: + import allotax + except ImportError: + raise HTTPException( + status_code=503, + detail="allotax module not available. 
Install via: cd allotaxonometer-core/crates/allotax-py && maturin develop --release" + ) + + try: + duckdb_client = get_duckdb_client() + conn = duckdb_client.connect() + + sys1 = _load_ngrams(conn, datalake, table_name, time_column, dr1, location, ngram_limit) + sys2 = _load_ngrams(conn, datalake, table_name, time_column, dr2, location2, ngram_limit) + + except HTTPException: + raise + except Exception as e: + raise HTTPException(status_code=500, detail=f"Data loading failed: {str(e)}") + + # Compute allotax — package returns lean display data natively + try: + if alphas: + alpha_list = [float(a) for a in alphas.split(",")] + result_data = allotax.compute_allotax_multi_alpha(sys1, sys2, alpha_list) + for slice_ in result_data.get("alpha_results", []): + slice_["wordshift"] = slice_["wordshift"][:wordshift_limit] + else: + result_data = allotax.compute_allotax(sys1, sys2, alpha) + result_data["wordshift"] = result_data["wordshift"][:wordshift_limit] + + return { + **result_data, + "meta": { + "system1": {"dates": dates, "location": location, "ngrams": len(sys1["types"])}, + "system2": {"dates": dates2, "location": location2, "ngrams": len(sys2["types"])}, + "dataset": dataset, + "granularity": granularity, + } + } + + except Exception as e: + raise HTTPException(status_code=500, detail=f"Allotax computation failed: {str(e)}") From 7e7fd639846243785e325c0ab68c203f2eebf6be Mon Sep 17 00:00:00 2001 From: Jonathan St-Onge Date: Thu, 5 Mar 2026 07:09:22 -0500 Subject: [PATCH 3/4] bug in weekly granularity --- backend/app/routers/storywrangler.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backend/app/routers/storywrangler.py b/backend/app/routers/storywrangler.py index 1ededc7..f574611 100644 --- a/backend/app/routers/storywrangler.py +++ b/backend/app/routers/storywrangler.py @@ -56,6 +56,8 @@ def _load_ngrams(conn, datalake, table_name: str, time_column: str, detail=f"No data found for {date_range[0]} to {date_range[1]} / location {location}" 
) + # Use snapped partition boundaries in SQL — raw input dates won't match stored + # week/month column values (e.g. input "2024-11-07" vs stored "2024-11-04"). sql = f""" SELECT w.types, SUM(w.counts) AS counts FROM read_parquet(?) w @@ -67,7 +69,7 @@ def _load_ngrams(conn, datalake, table_name: str, time_column: str, LIMIT ? """ rows = conn.execute(sql, [filtered_path, adapter_path, - date_range[0], date_range[1], location, limit]).fetchall() + partition_starts[0], partition_starts[-1], location, limit]).fetchall() types = [r[0] for r in rows] counts = [float(r[1]) for r in rows] From 66af3a68d23d3c53d3b7b9594fc0498fc5165ebb Mon Sep 17 00:00:00 2001 From: Jonathan St-Onge Date: Thu, 5 Mar 2026 13:10:56 -0500 Subject: [PATCH 4/4] truncated in allotax core --- backend/app/routers/storywrangler.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/backend/app/routers/storywrangler.py b/backend/app/routers/storywrangler.py index f574611..fd6360a 100644 --- a/backend/app/routers/storywrangler.py +++ b/backend/app/routers/storywrangler.py @@ -164,12 +164,10 @@ def parse_range(s: str) -> List[str]: try: if alphas: alpha_list = [float(a) for a in alphas.split(",")] - result_data = allotax.compute_allotax_multi_alpha(sys1, sys2, alpha_list) - for slice_ in result_data.get("alpha_results", []): - slice_["wordshift"] = slice_["wordshift"][:wordshift_limit] + result_data = allotax.compute_allotax_multi_alpha(sys1, sys2, alpha_list, wordshift_limit) else: - result_data = allotax.compute_allotax(sys1, sys2, alpha) - result_data["wordshift"] = result_data["wordshift"][:wordshift_limit] + result_data = allotax.compute_allotax(sys1, sys2, alpha, wordshift_limit) + return { **result_data,