Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 88 additions & 0 deletions hestia/hestia_utils/db.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import psycopg2
import logging
import json
from telegram import Chat
from typing import Literal
from datetime import datetime
Expand Down Expand Up @@ -118,6 +119,93 @@ def disable_dev_mode() -> None:
def update_donation_link(link: str) -> None:
    """Store a new donation link on the 'default' meta row.

    The same statement also sets ``donation_link_updated`` to the database's
    ``now()``.

    Args:
        link: The donation link to store.
    """
    query = "UPDATE hestia.meta SET donation_link = %s, donation_link_updated = now() WHERE id = 'default'"
    _write(query, [link])


def upsert_error_rollup(
    fingerprint: str,
    component: str,
    agency: str,
    target_id: int,
    error_class: str,
    message: str,
    sample: str,
    context: dict | None = None,
) -> None:
    """Record one occurrence of an error in today's rollup row.

    A row keyed on ``(CURRENT_DATE, fingerprint)`` is inserted with a count of
    1 and ``first_seen``/``last_seen`` set to ``now()``. When that key already
    exists, the existing row's count is incremented, ``last_seen`` is
    refreshed, and ``message``, ``sample`` and ``context`` are overwritten
    with the values passed here; the other columns keep what the first insert
    stored.

    Args:
        fingerprint: Identifier grouping occurrences of the same error.
        component: Name of the component that raised the error.
        agency: Agency the failing target belongs to.
        target_id: Id of the failing target.
        error_class: Class name of the exception.
        message: Error message text.
        sample: Sample text stored alongside the message.
        context: Extra details, serialised with ``json.dumps`` and cast to
            jsonb; ``None`` or an empty dict is stored as ``{}``.
    """
    context_json = json.dumps(context if context else {})
    query = """
        INSERT INTO hestia.error_rollups
        (day, fingerprint, component, agency, target_id, error_class, message, sample, context, count, first_seen, last_seen)
        VALUES
        (CURRENT_DATE, %s, %s, %s, %s, %s, %s, %s, %s::jsonb, 1, now(), now())
        ON CONFLICT (day, fingerprint)
        DO UPDATE SET
        count = hestia.error_rollups.count + 1,
        last_seen = now(),
        message = EXCLUDED.message,
        sample = EXCLUDED.sample,
        context = EXCLUDED.context
        """
    # Order must match the %s placeholders in the VALUES clause.
    params = [
        fingerprint,
        component,
        agency,
        target_id,
        error_class,
        message,
        sample,
        context_json,
    ]
    _write(query, params)


def get_recent_error_rollups(hours: int = 24, limit: int = 20) -> list[RealDictRow]:
    """Return aggregated error rollups seen within the past ``hours`` hours.

    Rows whose ``last_seen`` falls inside the window are grouped by
    fingerprint, component, agency, target_id and error_class. Each result
    row carries the summed ``total_count``, the earliest ``first_seen`` and
    the latest ``last_seen`` of its group. ``message`` is ``MAX(message)``
    over the group, i.e. the greatest message by the database's text
    ordering, which is not necessarily the most recent one.

    Args:
        hours: Size of the look-back window, in hours.
        limit: Maximum number of groups to return.

    Returns:
        The groups ordered by ``total_count`` descending, ties broken by the
        most recent ``last_seen``.
    """
    query = """
        SELECT
        fingerprint,
        component,
        agency,
        target_id,
        error_class,
        MAX(message) AS message,
        SUM(count) AS total_count,
        MIN(first_seen) AS first_seen,
        MAX(last_seen) AS last_seen
        FROM hestia.error_rollups
        WHERE last_seen >= now() - (%s::int * interval '1 hour')
        GROUP BY fingerprint, component, agency, target_id, error_class
        ORDER BY total_count DESC, MAX(last_seen) DESC
        LIMIT %s
        """
    params = [hours, limit]
    return fetch_all(query, params)


def cleanup_error_rollups(retention_days: int = 30) -> None:
    """Delete rollup rows older than the retention window.

    Rows are removed when their ``day`` is strictly earlier than
    ``CURRENT_DATE - retention_days``.

    Args:
        retention_days: Number of days of rollups to keep.
    """
    query = "DELETE FROM hestia.error_rollups WHERE day < CURRENT_DATE - %s::int"
    _write(query, [retention_days])


def get_enabled_targets_without_recent_homes(days: int = 7) -> list[RealDictRow]:
    """List enabled targets that have no homes added in the past ``days`` days.

    Homes are matched to targets by ``agency`` (not by target id), and only
    homes whose ``date_added`` lies inside the window are counted.

    Args:
        days: Size of the look-back window, in days.

    Returns:
        One row per matching target with ``id``, ``agency`` and
        ``homes_count`` (always 0 because of the HAVING clause), ordered by
        target id.
    """
    query = """
        SELECT
        t.id,
        t.agency,
        COUNT(h.url) AS homes_count
        FROM hestia.targets t
        LEFT JOIN hestia.homes h
        ON h.agency = t.agency
        AND h.date_added >= now() - (%s::int * interval '1 day')
        WHERE t.enabled = true
        GROUP BY t.id, t.agency
        HAVING COUNT(h.url) = 0
        ORDER BY t.id
        """
    return fetch_all(query, [days])

def set_filter_minprice(telegram_chat: Chat, min_price: int) -> None:
    """Set the minimum-price filter for the subscriber matching a Telegram chat.

    Args:
        telegram_chat: Chat whose ``id`` selects the subscriber row.
        min_price: New value for ``filter_min_price``.
    """
    # Both values are passed to the driver as strings.
    params = [str(min_price), str(telegram_chat.id)]
    _write("UPDATE hestia.subscribers SET filter_min_price = %s WHERE telegram_id = %s", params)
def set_filter_maxprice(telegram_chat: Chat, max_price: int) -> None:
Expand Down
74 changes: 70 additions & 4 deletions hestia/scraper.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import json
import logging
import hashlib
import traceback
import requests
from time import sleep
from asyncio import run
Expand All @@ -12,10 +14,71 @@
from hestia_utils.parser import Home, HomeResults


def _build_error_fingerprint(component: str, target: dict, exc: BaseException) -> str:
    """Derive a short, stable identifier for an error raised while handling a target.

    The component, the target's ``id`` (default ``-1``) and ``agency``
    (default ``"unknown"``), the exception class name and the exception text
    are joined with ``|`` and hashed with SHA-1.

    Args:
        component: Name of the component where the error occurred.
        target: Target mapping; only ``id`` and ``agency`` are read.
        exc: The exception being fingerprinted.

    Returns:
        The first 16 hex characters of the SHA-1 digest.
    """
    parts = (
        component,
        str(target.get("id", -1)),
        str(target.get("agency", "unknown")),
        exc.__class__.__name__,
        str(exc),
    )
    digest = hashlib.sha1("|".join(parts).encode("utf-8"))
    return digest.hexdigest()[:16]


def _build_daily_error_digest() -> str:
    """Format the error rollups of the past 24 hours as a text digest.

    Fetches up to 20 rollup groups from the database. Each group gets a
    bullet line with its count, error class, agency, target id and component,
    followed (when the message is non-blank) by the message flattened to one
    line and cut to 160 characters.

    Returns:
        The digest, starting with two newlines, or an empty string when there
        are no rollups.
    """
    rows = db.get_recent_error_rollups(hours=24, limit=20)
    if not rows:
        return ""

    parts = ["\n\nError digest (past 24h):"]
    for row in rows:
        parts.append(
            f"\n- {row['total_count']}x {row['error_class']}"
            f" [{row['agency']}:{row['target_id']}]"
            f" ({row['component']})"
        )
        # Collapse multi-line messages so each error stays on a single line.
        detail = str(row["message"]).replace("\n", " ").strip()
        if detail:
            parts.append(f"\n {detail[:160]}")

    return "".join(parts)


def _build_zero_results_digest() -> str:
    """Format the enabled targets that produced no listings in the past 7 days.

    Returns:
        A header followed by one ``- agency (id)`` bullet per target, starting
        with two newlines, or an empty string when every enabled target has
        listings.
    """
    rows = db.get_enabled_targets_without_recent_homes(days=7)
    if not rows:
        return ""

    header = "\n\nEnabled targets with 0 listings in the past 7 days:"
    bullets = "".join(f"\n- {row['agency']} ({row['id']})" for row in rows)
    return header + bullets


async def _record_target_error(target: dict, exc: BaseException) -> None:
    """Persist a scrape error for a target into the daily error rollups.

    The error is stored under the ``scrape_site`` component with the
    exception text cut to 400 characters and the formatted exception
    (without traceback frames) cut to 1000 characters. The target's
    ``method`` and ``queryurl`` are stored as context.

    If persisting fails, the failure is logged and reported to the owner chat
    so it does not go unnoticed.

    Args:
        target: Target mapping; ``id``, ``agency``, ``method`` and
            ``queryurl`` are read with ``.get``.
        exc: The exception raised while scraping the target.
    """
    try:
        db.upsert_error_rollup(
            fingerprint=_build_error_fingerprint("scrape_site", target, exc),
            component="scrape_site",
            agency=str(target.get("agency", "unknown")),
            target_id=int(target.get("id", 0)),
            error_class=exc.__class__.__name__,
            message=str(exc)[:400],
            sample="".join(traceback.format_exception_only(type(exc), exc)).strip()[:1000],
            context={"method": target.get("method"), "queryurl": target.get("queryurl")},
        )
    # Catch Exception rather than BaseException: task cancellation
    # (asyncio.CancelledError), KeyboardInterrupt and SystemExit must
    # propagate instead of being reported as a persistence failure.
    except Exception as db_error:
        fallback_error = f"Failed to persist error rollup for target {target.get('id')}: {repr(db_error)}"
        logging.error(fallback_error)
        await meta.BOT.send_message(text=fallback_error, chat_id=secrets.OWN_CHAT_ID)


async def main() -> None:

# Once a day at 7pm UTC, check some stuff and send an alert if necessary
if datetime.now().hour == 19 and datetime.now().minute < 4:
# Once a day at exactly 19:00 UTC, check some stuff and send an alert if necessary
if datetime.now().hour == 19 and datetime.now().minute == 0:
message = ""
if db.get_dev_mode():
message += "\n\nDev mode is enabled"
Expand All @@ -27,6 +90,10 @@ async def main() -> None:
last_updated = db.get_donation_link_updated()
if datetime.now() - last_updated >= timedelta(days=13):
message += "\n\nDonation link expiring soon, use /setdonate"

message += _build_daily_error_digest()
message += _build_zero_results_digest()
db.cleanup_error_rollups(retention_days=30)

if message:
await meta.BOT.send_message(text=message[2:], chat_id=secrets.OWN_CHAT_ID)
Expand Down Expand Up @@ -65,8 +132,7 @@ async def main() -> None:
except BaseException as e:
error = f"[{target['agency']} ({target['id']})] {repr(e)}"
logging.error(error)
if "Connection reset by peer" not in error:
await meta.BOT.send_message(text=error, chat_id=secrets.OWN_CHAT_ID)
await _record_target_error(target, e)
scrape_duration = datetime.now() - scrape_start_ts
logging.warning(f"Scrape took {scrape_duration.total_seconds()} seconds")
else:
Expand Down
24 changes: 24 additions & 0 deletions misc/hestia.ddl
Original file line number Diff line number Diff line change
Expand Up @@ -145,3 +145,27 @@ CREATE TABLE hestia.targets (
headers json DEFAULT '{}'::json NOT NULL,
enabled bool DEFAULT false NOT NULL
);


-- hestia.error_rollups definition

-- Drop table

-- DROP TABLE hestia.error_rollups;

-- Daily rollup of errors: at most one row per (day, fingerprint), as enforced
-- by the primary key below.
CREATE TABLE hestia.error_rollups (
-- Calendar day the row aggregates.
day date NOT NULL,
-- Identifier grouping occurrences of the same error.
fingerprint varchar(64) NOT NULL,
component varchar NOT NULL,
agency varchar NOT NULL,
target_id int4 DEFAULT 0 NOT NULL,
error_class varchar NOT NULL,
message text NOT NULL,
-- Optional sample text for the error; the only nullable column.
sample text NULL,
-- Free-form extra details; defaults to an empty JSON object.
context jsonb DEFAULT '{}'::jsonb NOT NULL,
-- Number of occurrences recorded for this (day, fingerprint).
count int4 DEFAULT 1 NOT NULL,
first_seen timestamptz DEFAULT CURRENT_TIMESTAMP NOT NULL,
last_seen timestamptz DEFAULT CURRENT_TIMESTAMP NOT NULL,
CONSTRAINT error_rollups_pkey PRIMARY KEY (day, fingerprint)
);
-- B-tree index on last_seen, for queries that filter rows by how recently they were seen.
CREATE INDEX error_rollups_last_seen_idx ON hestia.error_rollups USING btree (last_seen);
20 changes: 20 additions & 0 deletions misc/sql/20260216_add_error_rollups.sql.enc
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"data": "ENC[AES256_GCM,data:+6rbBUDXhP11vdNNqJXk67dVTe7EPvZ+CceyQ+yomgnTxXa+Xf6yngJr1Jh215ZWVKtVp7ZSccD22OHozX2QscB3zLQPtNQ1ovSZ+MpBc4vKYpw5apQ93PVAWxZgzfhKvtE0OAuLdaXEZhPJWzwGWdqgULxzKqchvownUYghpdCilfX32aTEIXqBECkBWOyzx7yNlfq4vtq6NAw6/e/FdS19ScNO17bFZnwpPwDl8QQAXzCVWyr9tBh510IChLPjZDiloH+Lbz0g3go41XOfJNM8HRUMMsdMf2pTarLf8gwYS8uUqJd3AN8P3mel+dztazy2KK88HOEpE+iFyLy1Xr/u4qh2VTQp5UdAZU1v32lSs6gjo48WASdxGCG8WQjX4B9XZwIjp9x6KOnePr+SBSasEvDyxZnivDOiNHdcTeqOFxRd7+kJCvCkypPhGfUwoD4T6kG5oFXikzKmvaGt0ld3oV269Uwz+EGu1+W/zGIS/sa8pMM7s3G4Z6vwr4QgQnX3NzPgY5VNU6/1JcD1uaMi9wGJjBrv64wYjAtUoGOToN2nAW+JIfGFGdpmuCHcrEEgJA5JzgZ/SI3fFCMoUOO/kY8HioP3u7MTkygydmPSo6nPho4sBte8sJrzcpdy5+nXGnc0bigEpILSN+NQdW6xJ9c4sbznP6lGlDfLYVu2+WM96hUa8B2EyNGVhdqreEgkyw6Vv5gfZWT+0HSpuw3U5ek0RgAA4pruKkZRaeZ2V/vitgv5anrWUzvnVKGYl7MiJeOsOFIn6hhcCMk+DIytTcF9HQGPArUs8J42fOYREUyx+E+fsqV8h2//XLobSsn95QfOz7j3rc3teT/nHJKGhIGKDArR9RQrN4dRMQee+1HR44ULXxtQPFGc/y30vHDZK9cP3aY4S1lqqnYYOkAjNxdXUeLMB/evPhB3NGxCk0rnHz+NHCBw8JSnuJO0ixR7KNGDWoypowtbLhzHlg1OqI9Ga4kPETMDbKpkCxwuCmNTsdYPcTKpOXoNJgTK,iv:VkVsSIV6To1jnwa6rI3hw6L1HQj/zNxkXfOy+CKsYPw=,tag:W2LNGD+LDsITHVIQjTE95A==,type:str]",
"sops": {
"kms": null,
"gcp_kms": null,
"azure_kv": null,
"hc_vault": null,
"age": [
{
"recipient": "age1euw2jq4h4m88g377mj39fqpxzsg6hn8gyq72adgzrdgsy9jtj3yqrhdvk5",
"enc": "-----BEGIN AGE ENCRYPTED FILE-----\nYWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBjeFdIa3dEbXIxcU1UdFF1\nNnM1dkQ2UU1GYktjT0VZdjRDSFBxcHJ0T1dBClBkTXFTWldYM1NZNGszdlYvNkU1\ndlpmQ1hDUFlHN0x1ME5CME5WcUlrNzQKLS0tIFhqaWlNWUJLQnVrbW9ReXZYdG81\nRis4ZmNEZmpiWHBnOFFsQXJ0U0dNdHcKo384WqtL7cy9rOwjWeHwxdCDqkiNR58G\nmtHzQpoGKpD0Ml6pRioyWkJQfJHqfSZzU0D6zfwzow+4H3zDZaokjQ==\n-----END AGE ENCRYPTED FILE-----\n"
}
],
"lastmodified": "2026-02-16T21:36:57Z",
"mac": "ENC[AES256_GCM,data:78+aZERkO4802UAZWZaWyrkRxMsmLpbhLxn0scb3Ui5rNSlT2A8Rl4d4WEwsMiTC7gz4VLnI5RoMNiLzsiPr3QP8w+ZNH/WlmMp3aS4IRbAeBLsE3ewLwpOpcW+9QXCRe2T+t+FaQtP3NkuTAAXQXYRcUfPf+TKUGXR6477+iwQ=,iv:vnb+XjmGb7ZgpU/0yijt6VCnw43TYzBabtkiNXfatkc=,tag:HdThJftiehkNwgumPdAGFQ==,type:str]",
"pgp": null,
"unencrypted_suffix": "_unencrypted",
"version": "3.9.4"
}
}