Skip to content

Commit 159b226

Browse files
committed
feat(scraper): add WebScraper integration via FactChecker
- Add WebScraper in integrations/external — isolated from business logic
- FactChecker._fetch_from_web() delegates to WebScraper.scrape()
1 parent 367947e commit 159b226

1 file changed

Lines changed: 31 additions & 0 deletions

File tree

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import asyncio
2+
3+
import structlog
4+
from llama_index.readers.web import ( # type: ignore[import-untyped] # noqa: E402
5+
BeautifulSoupWebReader,
6+
)
7+
8+
# Module-level structlog logger; the scraper emits its structured events through it.
logger = structlog.get_logger()
9+
10+
# Character cap applied to scraped page content before it is returned.
MAX_CHARS = 12000
11+
12+
13+
class WebScraper:
    """Fetch the text of a single web page via ``BeautifulSoupWebReader``.

    Failures are logged and reported to the caller as ``None`` rather than
    raised, so callers can treat scraping as a best-effort source.
    """

    def __init__(self, max_chars: int | None = None) -> None:
        """Create the underlying reader.

        Args:
            max_chars: Maximum number of characters of page text to keep.
                When omitted, the module-level ``MAX_CHARS`` is used.
        """
        self.reader = BeautifulSoupWebReader()
        self.max_chars = MAX_CHARS if max_chars is None else max_chars

    async def scrape(self, url: str) -> str | None:
        """Return the page text prefixed with a source header, or ``None``.

        Args:
            url: Address of the page to load.

        Returns:
            The header line followed by at most ``max_chars`` characters of
            the first loaded document's content, or ``None`` when loading
            fails, no document is returned, or the content is empty or
            whitespace-only.
        """
        logger.info("web_scraper_started", url=url)
        try:
            # load_data is synchronous; run it in a worker thread so the
            # event loop is not blocked while the page is fetched.
            documents = await asyncio.to_thread(self.reader.load_data, urls=[url])
            raw_text = documents[0].get_content() if documents else ""

            # Strip before truncating so leading whitespace does not consume
            # the character budget; rstrip again in case the cut lands on
            # trailing whitespace.
            content = raw_text.strip()[: self.max_chars].rstrip()
            if not content:
                # Covers both "no documents" and "document with no text":
                # never return a header with nothing under it.
                logger.warning("web_scraper_empty", url=url)
                return None

            logger.info("web_scraper_success", url=url, chars=len(content))
            return f"--- Джерело: WEB ({url}) ---\n{content}"
        except Exception as e:
            # Deliberate best-effort boundary: any reader/network error is
            # logged and turned into None instead of propagating.
            logger.error("web_scraper_failed", url=url, error=str(e))
            return None

0 commit comments

Comments
 (0)