-
Notifications
You must be signed in to change notification settings - Fork 0
Open
Description
Summary
Add methods to validate schemas against their test URLs and track health status.
Design
SchemaHealth Dataclass
# fetcharoo/schemas/health.py
from dataclasses import dataclass
from datetime import datetime
from typing import Optional, Literal
@dataclass
class SchemaHealth:
    """Result of schema validation."""

    # Name of the schema this result describes.
    schema_name: str
    # Overall verdict; `is_healthy` is True only for 'healthy'.
    status: Literal['healthy', 'degraded', 'broken']
    # Timestamp taken when the result was built.
    last_validated: datetime
    # URL the validation ran against.
    test_url: str
    # Number of PDFs the schema is expected to find.
    expected_pdfs: int
    # Number of PDFs actually found.
    found_pdfs: int
    # Error text when validation could not be completed; None otherwise.
    error: Optional[str] = None

    @property
    def is_healthy(self) -> bool:
        """Return True only when ``status`` is ``'healthy'``."""
        return self.status == 'healthy'


# Validation Method
# In SiteSchema class
class SiteSchema:
    # ... existing fields ...

    def validate(self, timeout: int = 30) -> SchemaHealth:
        """
        Test if schema still works against its test URL.

        Args:
            timeout: Passed through to ``find_pdfs_from_webpage``.

        Returns:
            SchemaHealth with status and details:
            'healthy' when at least ``expected_min_pdfs`` PDFs are found,
            'degraded' when some but fewer are found, and 'broken' when
            none are found, no ``test_url`` is configured, or the lookup
            raises (the exception text is stored in ``error``).
        """
        found = 0
        error = None

        if not self.test_url:
            status = 'broken'
            error = 'No test_url configured'
        else:
            try:
                pdfs = find_pdfs_from_webpage(
                    self.test_url,
                    recursion_depth=self.recommended_depth,
                    timeout=timeout,
                    deduplicate=True
                )
                found = len(pdfs)
                if found >= self.expected_min_pdfs:
                    status = 'healthy'
                elif found > 0:
                    status = 'degraded'
                else:
                    status = 'broken'
            except Exception as e:
                # A health check reports failures as a 'broken' result
                # instead of propagating them to the caller.
                status = 'broken'
                found = 0
                error = str(e)

        return SchemaHealth(
            schema_name=self.name,
            status=status,
            last_validated=datetime.now(),
            # A missing test URL is always reported as the empty string.
            test_url=self.test_url or '',
            expected_pdfs=self.expected_min_pdfs,
            found_pdfs=found,
            error=error
        )


# Registry Validation
# fetcharoo/schemas/registry.py
def validate_all_schemas(timeout: int = 30) -> Dict[str, SchemaHealth]:
    """Validate all registered schemas.

    Args:
        timeout: Forwarded to each schema's ``validate`` call.

    Returns:
        Mapping of each name in ``_SCHEMAS`` to the result of validating
        the corresponding schema.
    """
    return {
        name: schema.validate(timeout=timeout)
        for name, schema in _SCHEMAS.items()
    }
def get_healthy_schemas() -> List[SiteSchema]:
    """Get only schemas that are currently healthy."""
    # Could cache health results
    # Placeholder: the body is intentionally left unimplemented here.
    ...


# Usage
from fetcharoo.schemas import get_schema, validate_all_schemas

# Validate single schema
schema = get_schema('springer_book')
health = schema.validate()
print(f"{schema.name}: {health.status}")

# Validate all
results = validate_all_schemas()
for name, health in results.items():
    print(f"{name}: {health.status} ({health.found_pdfs}/{health.expected_pdfs} PDFs)")


# Tasks
- Create `SchemaHealth` dataclass
- Add `validate()` method to `SiteSchema`
- Add `validate_all_schemas()` to registry
- Handle schemas without `test_url`
- Add timeout parameter for validation
- Export health types from `fetcharoo.schemas`
- Add unit tests (with mocked responses)
Acceptance Criteria
- `schema.validate()` returns accurate health status
- Returns 'healthy' when >= expected PDFs found
- Returns 'degraded' when some but fewer PDFs found
- Returns 'broken' on errors or 0 PDFs
- `validate_all_schemas()` tests all registered schemas
Dependencies
- Create SiteSchema base dataclass #11 (SiteSchema base class)
- Implement schema registry with auto-detection #12 (Schema registry)
- Add built-in schemas for common sites (Springer, arXiv) #13 (Built-in schemas with test URLs)
Part of
Parent issue: #10
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels