diff --git a/src/paperscout/sources.py b/src/paperscout/sources.py index ec598e4..e9b09a0 100644 --- a/src/paperscout/sources.py +++ b/src/paperscout/sources.py @@ -6,10 +6,11 @@ import logging import re import time -from collections.abc import Iterable +from collections.abc import Iterable, Mapping from dataclasses import dataclass from datetime import date, datetime, timedelta, timezone from email.utils import parsedate_to_datetime +from types import MappingProxyType import httpx @@ -28,7 +29,15 @@ class WG21Index: - """Fetch, cache, and parse the wg21.link paper index.""" + """Fetch, cache, and parse the wg21.link paper index. + + This class is designed for single-threaded async use. Do not access + from the Bolt daemon thread. + + Likewise, do not access from the MessageQueue sender thread or any + other sync worker — ``refresh()`` mutates internal state and would + race with cross-thread reads on ``papers`` / ``_max_rev``. + """ def __init__(self, pool, cfg: Settings | None = None): self._cfg = cfg or settings @@ -132,15 +141,27 @@ def effective_frontier( return nums[i] return nums[0] - def latest_revision(self, number: int) -> int | None: - """Highest revision R*n* seen in the index for P-number *number*.""" - rev = self._max_rev.get(number) + def get_max_revision(self, paper_number: int) -> int | None: + """Highest revision *R*n* seen in the index for P-number *paper_number*. + + Returns ``None`` if the number is unknown or only a sentinel ``-1`` + revision is recorded (treated as no published revision in the index). + """ + rev = self._max_rev.get(paper_number) return rev if rev is not None and rev >= 0 else None def known_p_numbers(self) -> set[int]: """Set of all P-numbers present in the wg21.link index.""" return set(self._max_rev.keys()) + def get_known_paper_ids(self) -> frozenset[str]: + """Immutable snapshot of all paper ids currently in the index.""" + return frozenset(self.papers) + + def get_papers_snapshot(self) -> Mapping[str, Paper]: + """Read-only mapping copy of ``papers`` (not the live dict object).""" + return MappingProxyType(dict(self.papers)) + # ═══════════════════════════════════════════════════════════════════════════ # ISO Paper Prober @@ -251,6 +272,7 @@ async def run_cycle(self) -> list[ProbeHit]: t0 = time.monotonic() urls = self._build_probe_list() + known_ids = self.index.get_known_paper_ids() hot_count = sum(1 for u in urls if u[1] in (Tier.WATCHLIST, Tier.FRONTIER, Tier.RECENT)) cold_count = sum(1 for u in urls if u[1] == Tier.COLD) slice_idx = (self._cycle - 1) % self.cfg.cold_cycle_divisor @@ -273,7 +295,7 @@ async def run_cycle(self) -> list[ProbeHit]: follow_redirects=True, ) as client: tasks = [ - self._probe_one(client, sem, url, prefix, num, rev, ext, tier) + self._probe_one(client, sem, url, prefix, num, rev, ext, tier, known_ids) for url, tier, prefix, num, rev, ext in urls ] results = await asyncio.gather(*tasks, return_exceptions=True) @@ -340,7 +362,7 @@ def _hot_numbers(self, frontier: int) -> tuple[set[int], set[int]]: # Recently active papers if self.cfg.hot_lookback_months > 0: cutoff = date.today() - timedelta(days=int(self.cfg.hot_lookback_months * 30.44)) - for p in self.index.papers.values(): + for p in self.index.get_papers_snapshot().values(): if p.prefix != "P" or p.number is None or not p.date or p.date == "unknown": continue try: @@ -376,7 +398,7 @@ def _build_hot_list( # Known hot: probe D prefix, latest+1 .. latest+hot_revision_depth for num in sorted(hot_known): tier = self._tier_label(num, watchlist_set, frontier_range) - latest = self.index.latest_revision(num) + latest = self.index.get_max_revision(num) if latest is None: latest = -1 for rev in range(latest + 1, latest + self.cfg.hot_revision_depth + 1): @@ -414,7 +436,7 @@ def _build_cold_slice( for num in sorted(cold_known): if num % self.cfg.cold_cycle_divisor != slice_idx: continue - latest = self.index.latest_revision(num) + latest = self.index.get_max_revision(num) if latest is None: continue for rev in range(latest + 1, latest + self.cfg.cold_revision_depth + 1): @@ -446,13 +468,15 @@ async def _probe_one( rev: int, ext: str, tier: Tier, + known_ids: frozenset[str] | None = None, ) -> ProbeHit | None: if self.state.is_discovered(url): log.debug("SKIP disc %s", url) self._stats["skipped_discovered"] += 1 return None paper_id = f"{prefix}{num:04d}R{rev}" - if paper_id in self.index.papers: + ids = known_ids if known_ids is not None else self.index.get_known_paper_ids() + if paper_id in ids: log.debug("SKIP idx %s", paper_id) self._stats["skipped_in_index"] += 1 return None diff --git a/tests/test_sources.py b/tests/test_sources.py index 71d50e9..30bdae9 100644 --- a/tests/test_sources.py +++ b/tests/test_sources.py @@ -249,11 +249,32 @@ def test_effective_frontier_extra_isolated_high_still_filtered(self, fake_pool): index._parse_and_index({"P0100R0": {"title": "T"}, "P0120R0": {"title": "T"}}) assert index.effective_frontier(gap_threshold=50, extra_p_numbers={9999}) == 120 - def test_latest_revision_known(self, populated_index): - assert populated_index.latest_revision(2300) == 10 + def test_get_max_revision_known(self, populated_index): + assert populated_index.get_max_revision(2300) == 10 - def test_latest_revision_unknown(self, populated_index): - assert populated_index.latest_revision(9999) is None + def test_get_max_revision_unknown(self, populated_index): + assert populated_index.get_max_revision(9999) is None + + def test_get_papers_snapshot_and_known_ids_are_independent_snapshots(self, fake_pool): + from types import MappingProxyType + + index = WG21Index(fake_pool) + index.papers = index._parse_and_index({"P1000R0": {"title": "T"}}) + snap = index.get_papers_snapshot() + assert snap is not index.papers + assert isinstance(snap, MappingProxyType) + assert dict(snap) == dict(index.papers) + with pytest.raises(TypeError): + snap["X"] = Paper(id="X") + + frozen = index.get_known_paper_ids() + assert isinstance(frozen, frozenset) + with pytest.raises(AttributeError): + frozen.add("X") + + index.papers = index._parse_and_index({}) + assert "P1000R0" in snap + assert snap["P1000R0"].id == "P1000R0" def test_parse_ignores_non_dict_entries(self, fake_pool): index = WG21Index(fake_pool) @@ -667,7 +688,7 @@ def test_build_hot_list_explicit_ranges_update_frontier_range(self, fake_pool): assert any(r[3] == 200 and r[1] == "frontier" for r in urls) def test_build_hot_list_latest_none_uses_minus_one(self, fake_pool): - """Known hot numbers with latest_revision=None should start from R0.""" + """Known hot numbers with get_max_revision None should start from R0.""" prober, index, _ = self._make_prober( fake_pool, watchlist_nums=[9999], @@ -677,7 +698,7 @@ def test_build_hot_list_latest_none_uses_minus_one(self, fake_pool): hot_revision_depth=1, gap_max_rev=0, ) - # Add 9999 to _max_rev so it's "known" but with latest_revision=None + # Add 9999 to _max_rev so it's "known" but with get_max_revision None index._max_rev = {9999: -1, 99: 0, 100: 0} index._sorted_p_nums = [99, 100, 9999] frontier = 100 @@ -688,7 +709,7 @@ def test_build_hot_list_latest_none_uses_minus_one(self, fake_pool): assert 0 in revisions # latest=-1 → start_rev=0 def test_cold_known_skips_when_latest_none(self, fake_pool): - """cold_known paper with latest_revision=None should be silently skipped.""" + """cold_known paper with get_max_revision None should be silently skipped.""" prober, index, _ = self._make_prober( fake_pool, hot_lookback_months=0, @@ -697,7 +718,7 @@ def test_cold_known_skips_when_latest_none(self, fake_pool): cold_cycle_divisor=1, cold_revision_depth=1, ) - # 4 has _max_rev=-1 → latest_revision=None; 5 is normal + # 4 has _max_rev=-1 → get_max_revision None; 5 is normal # With no frontier window and no watchlist, both are cold_known index._max_rev = {4: -1, 5: 0} index._sorted_p_nums = [4, 5] @@ -705,7 +726,7 @@ def test_cold_known_skips_when_latest_none(self, fake_pool): hot_known, hot_unknown = prober._hot_numbers(frontier) urls = prober._build_cold_slice(1, frontier, hot_known, hot_unknown) cold_nums = {r[3] for r in urls if r[1] == "cold"} - assert 4 not in cold_nums # skipped because latest_revision=None + assert 4 not in cold_nums # skipped because get_max_revision is None assert 5 in cold_nums # normally probed async def test_probe_one_bad_last_modified_header(self, fake_pool):