File tree Expand file tree Collapse file tree
backend/integrations/external Expand file tree Collapse file tree Original file line number Diff line number Diff line change 1+ import asyncio
2+
3+ import structlog
4+ from llama_index .readers .web import ( # type: ignore[import-untyped] # noqa: E402
5+ BeautifulSoupWebReader ,
6+ )
7+
# Module-level structlog logger used for the scraper's start/success/failure events.
logger = structlog.get_logger()

# Upper bound on the number of characters of page text kept from a scraped document.
MAX_CHARS = 12_000
11+
12+
13+ class WebScraper :
14+ def __init__ (self ) -> None :
15+ self .reader = BeautifulSoupWebReader ()
16+
17+ async def scrape (self , url : str ) -> str | None :
18+ """Повертає текст сторінки або None при помилці/порожньому результаті."""
19+ logger .info ("web_scraper_started" , url = url )
20+ try :
21+ documents = await asyncio .to_thread (self .reader .load_data , urls = [url ])
22+ if not documents :
23+ logger .warning ("web_scraper_empty" , url = url )
24+ return None
25+
26+ content = documents [0 ].get_content ()[:MAX_CHARS ].strip ()
27+ logger .info ("web_scraper_success" , url = url , chars = len (content ))
28+ return f"--- Джерело: WEB ({ url } ) ---\n { content } "
29+ except Exception as e :
30+ logger .error ("web_scraper_failed" , url = url , error = str (e ))
31+ return None
You can’t perform that action at this time.
0 commit comments