Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 34 additions & 10 deletions src/paperscout/sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,11 @@
import logging
import re
import time
from collections.abc import Iterable
from collections.abc import Iterable, Mapping
from dataclasses import dataclass
from datetime import date, datetime, timedelta, timezone
from email.utils import parsedate_to_datetime
from types import MappingProxyType

import httpx

Expand All @@ -28,7 +29,15 @@


class WG21Index:
"""Fetch, cache, and parse the wg21.link paper index."""
"""Fetch, cache, and parse the wg21.link paper index.

This class is designed for single-threaded async use. Do not access
from the Bolt daemon thread.

Likewise, do not access from the MessageQueue sender thread or any
other sync worker — ``refresh()`` mutates internal state and would
race with cross-thread reads on ``papers`` / ``_max_rev``.
"""

def __init__(self, pool, cfg: Settings | None = None):
self._cfg = cfg or settings
Expand Down Expand Up @@ -132,15 +141,27 @@ def effective_frontier(
return nums[i]
return nums[0]

def latest_revision(self, number: int) -> int | None:
"""Highest revision R*n* seen in the index for P-number *number*."""
rev = self._max_rev.get(number)
def get_max_revision(self, paper_number: int) -> int | None:
"""Highest revision *R*n* seen in the index for P-number *paper_number*.

Returns ``None`` if the number is unknown or only a sentinel ``-1``
revision is recorded (treated as no published revision in the index).
"""
rev = self._max_rev.get(paper_number)
return rev if rev is not None and rev >= 0 else None

def known_p_numbers(self) -> set[int]:
"""Set of all P-numbers present in the wg21.link index."""
return set(self._max_rev.keys())

def get_known_paper_ids(self) -> frozenset[str]:
"""Immutable snapshot of all paper ids currently in the index."""
return frozenset(self.papers)

def get_papers_snapshot(self) -> Mapping[str, Paper]:
"""Read-only mapping copy of ``papers`` (not the live dict object)."""
return MappingProxyType(dict(self.papers))


# ═══════════════════════════════════════════════════════════════════════════
# ISO Paper Prober
Expand Down Expand Up @@ -251,6 +272,7 @@ async def run_cycle(self) -> list[ProbeHit]:
t0 = time.monotonic()

urls = self._build_probe_list()
known_ids = self.index.get_known_paper_ids()
hot_count = sum(1 for u in urls if u[1] in (Tier.WATCHLIST, Tier.FRONTIER, Tier.RECENT))
cold_count = sum(1 for u in urls if u[1] == Tier.COLD)
slice_idx = (self._cycle - 1) % self.cfg.cold_cycle_divisor
Expand All @@ -273,7 +295,7 @@ async def run_cycle(self) -> list[ProbeHit]:
follow_redirects=True,
) as client:
tasks = [
self._probe_one(client, sem, url, prefix, num, rev, ext, tier)
self._probe_one(client, sem, url, prefix, num, rev, ext, tier, known_ids)
for url, tier, prefix, num, rev, ext in urls
]
results = await asyncio.gather(*tasks, return_exceptions=True)
Expand Down Expand Up @@ -340,7 +362,7 @@ def _hot_numbers(self, frontier: int) -> tuple[set[int], set[int]]:
# Recently active papers
if self.cfg.hot_lookback_months > 0:
cutoff = date.today() - timedelta(days=int(self.cfg.hot_lookback_months * 30.44))
for p in self.index.papers.values():
for p in self.index.get_papers_snapshot().values():
if p.prefix != "P" or p.number is None or not p.date or p.date == "unknown":
continue
try:
Expand Down Expand Up @@ -376,7 +398,7 @@ def _build_hot_list(
# Known hot: probe D prefix, latest+1 .. latest+hot_revision_depth
for num in sorted(hot_known):
tier = self._tier_label(num, watchlist_set, frontier_range)
latest = self.index.latest_revision(num)
latest = self.index.get_max_revision(num)
if latest is None:
latest = -1
for rev in range(latest + 1, latest + self.cfg.hot_revision_depth + 1):
Expand Down Expand Up @@ -414,7 +436,7 @@ def _build_cold_slice(
for num in sorted(cold_known):
if num % self.cfg.cold_cycle_divisor != slice_idx:
continue
latest = self.index.latest_revision(num)
latest = self.index.get_max_revision(num)
if latest is None:
continue
for rev in range(latest + 1, latest + self.cfg.cold_revision_depth + 1):
Expand Down Expand Up @@ -446,13 +468,15 @@ async def _probe_one(
rev: int,
ext: str,
tier: Tier,
known_ids: frozenset[str] | None = None,
) -> ProbeHit | None:
if self.state.is_discovered(url):
log.debug("SKIP disc %s", url)
self._stats["skipped_discovered"] += 1
return None
paper_id = f"{prefix}{num:04d}R{rev}"
if paper_id in self.index.papers:
ids = known_ids if known_ids is not None else self.index.get_known_paper_ids()
if paper_id in ids:
log.debug("SKIP idx %s", paper_id)
self._stats["skipped_in_index"] += 1
return None
Expand Down
39 changes: 30 additions & 9 deletions tests/test_sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,11 +249,32 @@ def test_effective_frontier_extra_isolated_high_still_filtered(self, fake_pool):
index._parse_and_index({"P0100R0": {"title": "T"}, "P0120R0": {"title": "T"}})
assert index.effective_frontier(gap_threshold=50, extra_p_numbers={9999}) == 120

def test_latest_revision_known(self, populated_index):
assert populated_index.latest_revision(2300) == 10
def test_get_max_revision_known(self, populated_index):
assert populated_index.get_max_revision(2300) == 10

def test_latest_revision_unknown(self, populated_index):
assert populated_index.latest_revision(9999) is None
def test_get_max_revision_unknown(self, populated_index):
assert populated_index.get_max_revision(9999) is None

def test_get_papers_snapshot_and_known_ids_are_independent_snapshots(self, fake_pool):
from types import MappingProxyType

index = WG21Index(fake_pool)
index.papers = index._parse_and_index({"P1000R0": {"title": "T"}})
snap = index.get_papers_snapshot()
assert snap is not index.papers
assert isinstance(snap, MappingProxyType)
assert dict(snap) == dict(index.papers)
with pytest.raises(TypeError):
snap["X"] = Paper(id="X")

frozen = index.get_known_paper_ids()
assert isinstance(frozen, frozenset)
with pytest.raises(AttributeError):
frozen.add("X")

index.papers = index._parse_and_index({})
assert "P1000R0" in snap
assert snap["P1000R0"].id == "P1000R0"

def test_parse_ignores_non_dict_entries(self, fake_pool):
index = WG21Index(fake_pool)
Expand Down Expand Up @@ -667,7 +688,7 @@ def test_build_hot_list_explicit_ranges_update_frontier_range(self, fake_pool):
assert any(r[3] == 200 and r[1] == "frontier" for r in urls)

def test_build_hot_list_latest_none_uses_minus_one(self, fake_pool):
"""Known hot numbers with latest_revision=None should start from R0."""
"""Known hot numbers with get_max_revision None should start from R0."""
prober, index, _ = self._make_prober(
fake_pool,
watchlist_nums=[9999],
Expand All @@ -677,7 +698,7 @@ def test_build_hot_list_latest_none_uses_minus_one(self, fake_pool):
hot_revision_depth=1,
gap_max_rev=0,
)
# Add 9999 to _max_rev so it's "known" but with latest_revision=None
# Add 9999 to _max_rev so it's "known" but with get_max_revision None
index._max_rev = {9999: -1, 99: 0, 100: 0}
index._sorted_p_nums = [99, 100, 9999]
frontier = 100
Expand All @@ -688,7 +709,7 @@ def test_build_hot_list_latest_none_uses_minus_one(self, fake_pool):
assert 0 in revisions # latest=-1 → start_rev=0

def test_cold_known_skips_when_latest_none(self, fake_pool):
"""cold_known paper with latest_revision=None should be silently skipped."""
"""cold_known paper with get_max_revision None should be silently skipped."""
prober, index, _ = self._make_prober(
fake_pool,
hot_lookback_months=0,
Expand All @@ -697,15 +718,15 @@ def test_cold_known_skips_when_latest_none(self, fake_pool):
cold_cycle_divisor=1,
cold_revision_depth=1,
)
# 4 has _max_rev=-1 → latest_revision=None; 5 is normal
# 4 has _max_rev=-1 → get_max_revision None; 5 is normal
# With no frontier window and no watchlist, both are cold_known
index._max_rev = {4: -1, 5: 0}
index._sorted_p_nums = [4, 5]
frontier = 5
hot_known, hot_unknown = prober._hot_numbers(frontier)
urls = prober._build_cold_slice(1, frontier, hot_known, hot_unknown)
cold_nums = {r[3] for r in urls if r[1] == "cold"}
assert 4 not in cold_nums # skipped because latest_revision=None
assert 4 not in cold_nums # skipped because get_max_revision is None
assert 5 in cold_nums # normally probed

async def test_probe_one_bad_last_modified_header(self, fake_pool):
Expand Down