-
Notifications
You must be signed in to change notification settings - Fork 0
refactor: simplifies claim fetch logic #31
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,77 @@ | ||
| #!/usr/bin/env python3 | ||
| """Helper functions for scripts""" | ||
|
|
||
| import json | ||
| import demjson3 | ||
| import os | ||
| import re | ||
| import sys | ||
| import requests | ||
| import yaml | ||
| from typing import Tuple | ||
| from urllib.request import Request, urlopen | ||
|
|
||
def get_text_from_url(url: str) -> str:
    """Fetch ``url`` and return the response body as text.

    The body is decoded with the charset declared in the response headers,
    falling back to UTF-8 when none is declared or the declared name is not
    a codec Python knows. Errors raised by ``urlopen`` and decoding errors
    propagate to the caller.
    """
    with urlopen(url, timeout=60) as r:
        raw = r.read()
        charset = r.headers.get_content_charset() or "utf-8"
    try:
        return raw.decode(charset)
    except LookupError:
        # Unknown codec name in the Content-Type header: use the previous default.
        return raw.decode("utf-8")
|
semmet95 marked this conversation as resolved.
|
||
|
|
||
def get_text_from_file(path: str) -> str:
    """Return the full contents of the UTF-8 text file at ``path``.

    Errors from ``open`` and decoding errors propagate to the caller.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()
|
|
||
def get_oapi_spec():
    """Load and return the parsed ``oapi.yaml`` document.

    The file is looked up one directory above the directory that contains
    this module. Errors from ``open`` and from the YAML parser propagate.
    """
    base_dir = os.path.dirname(os.path.dirname(__file__))
    oapi_path = os.path.join(base_dir, "oapi.yaml")

    with open(oapi_path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)
|
|
||
def get_sources(base_url: str, api_key: str):
    """Fetch all sources from the API.

    Sends ``GET {base_url}/api/v1/sources`` with the ``X-API-Key`` header.

    Returns:
        The decoded JSON body on HTTP 200. ``None`` when the request fails at
        the transport level, the status is not 200, or the body is not valid
        JSON; the reason is printed to stderr.
    """
    endpoint = f"{base_url}/api/v1/sources"
    headers = {"X-API-Key": api_key}
    try:
        response = requests.get(endpoint, headers=headers, timeout=90)
    except requests.RequestException as e:
        # Timeouts, DNS failures, refused connections and similar transport
        # errors all derive from RequestException.
        print(f"Error: failed to get all sources: {e}", file=sys.stderr)
        return None
    if response.status_code != 200:
        print(f"Error: failed to get all sources: {response.status_code}", file=sys.stderr)
        return None
    try:
        return response.json()
    except ValueError as e:
        # A 200 response whose body is not JSON is reported like any other failure.
        print(f"Error: failed to get all sources: {e}", file=sys.stderr)
        return None
|
|
||
def cleanup_json_str(json_str: str) -> str:
    """Normalize a model-produced JSON-like string into strict JSON text.

    Steps: unwrap a fenced code block if one is present, rewrite Python
    ``None``/``False``/``True`` values that follow a colon, replace backticks
    with single quotes, then parse leniently with ``demjson3`` and re-serialize
    with ``json.dumps``. Parse errors from ``demjson3.decode`` propagate.
    """
    # models often return the json string wrapped in a code block or with incompatible values
    fenced = re.search(r'```(?:json)?\s*(.*?)\s*```', json_str, re.DOTALL)
    if fenced:
        json_str = fenced.group(1)

    # Replace all Python boolean and None values
    # NOTE(review): the pattern only looks for a preceding colon, so it also
    # rewrites text such as "status: None" inside a string value.
    for py_literal, json_literal in (("None", "null"), ("False", "false"), ("True", "true")):
        json_str = re.sub(rf':\s*{py_literal}\b', f': {json_literal}', json_str)

    json_str = json_str.replace("`", "'").strip()
    return json.dumps(demjson3.decode(json_str))
|
|
||
def clean_filepath(path: str, replace: str = "_") -> str:
    """Turn ``path`` into a string that is safe to use as a file or directory name.

    Every character other than ASCII letters, digits, ``_`` and ``-`` is
    replaced by ``replace``; consecutive occurrences of ``replace`` are
    collapsed into one, and occurrences at either end are removed.

    ``replace`` is treated as a literal token, so multi-character values are
    collapsed and trimmed as a unit. An empty ``replace`` simply deletes the
    disallowed characters.
    """
    # A callable replacement keeps backslashes in ``replace`` literal.
    cleaned = re.sub(r'[^a-zA-Z0-9_-]', lambda _m: replace, path)
    if not replace:
        return cleaned
    token = f'(?:{re.escape(replace)})'
    cleaned = re.sub(f'{token}+', lambda _m: replace, cleaned)
    return re.sub(f'^{token}+|{token}+$', '', cleaned)
|
|
||
def post_request(endpoint: str, headers: dict, payload: dict, timeout: int) -> Tuple[int, str]:
    """Send ``payload`` as a JSON request body to ``endpoint``.

    Args:
        endpoint: Target URL.
        headers: Request headers; ``None`` is treated as no headers. A
            ``Content-Type: application/json`` header is added when the caller
            did not supply a Content-Type.
        payload: Object serialized with ``json.dumps``.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        ``(status, body)`` on success. ``(0, "")`` when the request cannot be
        built or sent, or the response cannot be read; the error is printed to
        stderr. HTTP error statuses surface as exceptions from ``urlopen`` and
        therefore also yield ``(0, "")``.
    """
    data = json.dumps(payload).encode("utf-8")
    request_headers = dict(headers or {})
    if not any(name.lower() == "content-type" for name in request_headers):
        request_headers["Content-Type"] = "application/json"

    try:
        # Request() itself raises ValueError for a malformed URL, so it belongs
        # inside the guarded section.
        req = Request(url=endpoint, data=data, headers=request_headers)
        with urlopen(req, timeout=timeout) as r:
            status = r.getcode()
            body = r.read().decode("utf-8")
    except Exception as e:
        print(f"Error making request to {endpoint}: {e}", file=sys.stderr)
        return 0, ""

    return status, body
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,164 @@ | ||
| #!/usr/bin/env python3 | ||
|
|
||
| import glob | ||
| import json | ||
| import os | ||
| import sys | ||
| import yaml | ||
|
|
||
| import helper | ||
| import newsdata_io | ||
| import openrouter | ||
|
|
||
# Required configuration: a missing variable raises KeyError when the module is loaded.
API_BASE_URL = os.environ["API_BASE_URL"]
API_KEY = os.environ["API_KEY"]
|
|
||
# Both values can be overridden through environment variables of the same name.
# NOTE(review): the default URL path contains "falsifialbe"; confirm it matches
# the directory name in the skills repository before changing it.
FALSIFIABLE_CLAIM_SKILL_URL = os.getenv(
    "FALSIFIABLE_CLAIM_SKILL_URL",
    "https://raw.githubusercontent.com/semmet95/agent-skills/refs/heads/main/determine-falsifialbe-claim/SKILL.md"
)
# Adjacent string literals are concatenated as-is, so each sentence carries its
# own trailing space.
CLAIM_FILTER_PROMPT = os.getenv(
    "CLAIM_FILTER_PROMPT",
    (
        "Use web search tool to visit the link for each article, access the content and then assess if it is a falsifiable claim. "
        "Out of these 10 articles, only return 1 article that best fits the falsifiable claim criterion. "
        "Prefer claims that have been made by the news source directly. "
        "Keep the json structure of the claims the same as the original schema in the input. Do not add, remove, or modify any key or value in the json string. "
        "Only output the plain json array string that I can safely unmarshal. "
        "Do not format the string. Do not output anything else."
    )
)
|
|
||
def update_claim_fields(srcDigest: str, claim):
    """Add the ingestion fields to ``claim`` in place.

    Sets ``sourceUriDigest`` to ``srcDigest`` and copies ``description`` to
    ``summary`` and ``link`` to ``uri``.

    Returns:
        ``True`` when the claim was updated. ``False`` when ``claim`` is not a
        dict or lacks ``description`` or ``link``; the claim is then left
        untouched.
    """
    if not isinstance(claim, dict) or "description" not in claim or "link" not in claim:
        return False
    claim["sourceUriDigest"] = srcDigest
    claim["summary"] = claim["description"]
    claim["uri"] = claim["link"]
    return True
|
Comment on lines
+32
to
+35
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Guard required keys before mutating claim objects. Line 34 and Line 35 assume `claim` is a dict that contains `description` and `link`; if the model returns a malformed item, the lookup raises and the run stops.
Proposed fix:
def update_claim_fields(srcDigest: str, claim):
+ if not isinstance(claim, dict):
+ return False
+ if "description" not in claim or "link" not in claim:
+ return False
claim["sourceUriDigest"] = srcDigest
claim["summary"] = claim["description"]
claim["uri"] = claim["link"]
+ return True
@@
for claim in new_claims:
- update_claim_fields(srcUriDigest, claim)
+ if not update_claim_fields(srcUriDigest, claim):
+ continue
new_claim = True
Also applies to: 57-59
🤖 Prompt for AI Agents |
||
|
|
||
def get_claim_docs(claims_dir: str):
    """Load every non-empty YAML document found under ``claims_dir``.

    Files ending in ``.yaml`` or ``.yml`` are searched recursively. A file that
    cannot be read or parsed is reported on stderr and skipped.

    Returns:
        A list with the parsed content of each non-empty file.
    """
    # Find all YAML files in claims directory and subdirectories
    yaml_files = [
        path
        for pattern in ("*.yaml", "*.yml")
        for path in glob.glob(os.path.join(claims_dir, "**", pattern), recursive=True)
    ]

    claims_array = []
    for yaml_file in yaml_files:
        try:
            with open(yaml_file, 'r', encoding='utf-8') as f:
                claim_data = yaml.safe_load(f)
        except (OSError, ValueError, yaml.YAMLError) as e:
            # ValueError covers undecodable bytes (UnicodeDecodeError).
            print(f"Error loading YAML file {yaml_file}: {e}", file=sys.stderr)
            continue

        if claim_data:  # Only add if file is not empty
            claims_array.append(claim_data)

    return claims_array
|
|
||
def get_new_claims(all_claim_docs, new_claims, srcUriDigest):
    """Return the claims in ``new_claims`` that are not already known.

    Each new claim is first passed through ``update_claim_fields``. It is then
    treated as a duplicate when its ``uri`` or its ``title`` equals that of a
    document in ``all_claim_docs`` or of a claim accepted earlier in the same
    call. Claims that are malformed (missing keys, wrong type, unhashable
    ``uri``/``title``) are reported on stderr and skipped.

    ``all_claim_docs`` is not modified.
    """
    # Build the lookup sets once instead of rescanning every document per claim.
    known_uris = set()
    known_titles = set()
    for claim_doc in all_claim_docs:
        if not isinstance(claim_doc, dict):
            continue
        try:
            if "uri" in claim_doc:
                known_uris.add(claim_doc["uri"])
            if "title" in claim_doc:
                known_titles.add(claim_doc["title"])
        except TypeError:
            # Unhashable value: such a document cannot match a claim key anyway.
            continue

    unique_claims = []
    for claim in new_claims:
        try:
            updated = update_claim_fields(srcUriDigest, claim)
            uri, title = claim["uri"], claim["title"]
            duplicate = uri in known_uris or title in known_titles
        except (KeyError, TypeError) as e:
            print(f"Warning: skipping malformed claim {claim!r}: {e}", file=sys.stderr)
            continue

        # An explicit False from update_claim_fields means the claim could not
        # be updated; any other result is treated as success.
        if updated is False:
            print(f"Warning: skipping malformed claim {claim!r}", file=sys.stderr)
            continue
        if duplicate:
            continue

        known_uris.add(uri)
        known_titles.add(title)
        unique_claims.append(claim)

    return unique_claims
|
|
||
def create_claim_docs(claims: list, srcName: str):
    """Write one YAML document per claim under ``claims/<source>/<title>.yaml``.

    Only the keys listed in the ``example`` of the ``ClaimInput`` schema from
    the OpenAPI spec are kept, each converted with ``str``. The directory name
    is derived from ``srcName`` and the file name from the claim title, both
    lower-cased, cut to 30 characters and passed through
    ``helper.clean_filepath``. Existing files are never overwritten.

    A claim that lacks one of the expected keys is reported on stderr and
    skipped; the remaining claims are still written.
    """
    # NOTE(review): the output path is relative to the current working
    # directory, whereas main() reads existing claims from a path relative to
    # this script. They coincide only when run from the repository root.
    claim_input_schema = helper.get_oapi_spec()['components']['schemas']['ClaimInput']
    claim_example = claim_input_schema.get('example')
    if not claim_example:
        print("Error: ClaimInput schema has no example to derive claim fields from", file=sys.stderr)
        return

    # Dedicated dumper so the str representer does not alter PyYAML's global
    # Dumper for every other yaml.dump call in the process.
    class _QuotedStrDumper(yaml.Dumper):
        pass

    # Custom representer to force double quotes around strings
    def quoted_str_representer(dumper, data):
        return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='"')

    _QuotedStrDumper.add_representer(str, quoted_str_representer)

    # The directory depends only on the source, so compute it once.
    dirname = helper.clean_filepath(srcName.lower()[:30])

    for claim in claims:
        # keep only relevant fields in the claims
        try:
            claim_doc = {key: str(claim[key]) for key in claim_example}
        except (KeyError, TypeError) as e:
            print(f"Warning: skipping claim without expected field {e}: {claim!r}", file=sys.stderr)
            continue

        filename = helper.clean_filepath(claim_doc["title"].lower()[:30])
        filename = f"{filename}.yaml"

        # Create file path
        file_path = os.path.join("claims", dirname, filename)

        # Write claim_doc to YAML file
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        try:
            # Mode 'x' fails if the file exists: avoid overwriting existing files
            f = open(file_path, 'x', encoding='utf-8')
        except FileExistsError:
            print(f"Warning: claim file with name : {file_path} already exists", file=sys.stderr)
            continue
        with f:
            yaml.dump(
                claim_doc,
                f,
                Dumper=_QuotedStrDumper,
                default_flow_style=False,
                allow_unicode=True,
                width=float('inf'),
            )

        print(f"Created claim document: {file_path}")
|
|
||
# TODO: maintain a list of sources for which domainUrlNewsData fields needs to be updated
def main():
    """Fetch candidate claims for every source and store the new ones as YAML.

    For each source returned by the API: fetch articles through
    ``newsdata_io``, ask the model (with the falsifiable-claim skill and the
    web search tool) to pick the claims to keep, parse the answer, drop claims
    that are already stored, and write the rest with ``create_claim_docs``.

    Exits with status 1 when the sources or the skill text cannot be fetched.
    A failure for a single source is reported on stderr and that source is
    skipped.
    """
    claims_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "claims"))
    all_claim_docs = get_claim_docs(claims_dir)

    sources = helper.get_sources(API_BASE_URL, API_KEY)
    if sources is None:
        print("Error: failed to fetch all sources", file=sys.stderr)
        sys.exit(1)

    # Fetch falsifiable claim skill
    try:
        falsifiable_claim_skill = helper.get_text_from_url(FALSIFIABLE_CLAIM_SKILL_URL)
    except Exception as e:
        print(f"Error: failed to fetch skill from {FALSIFIABLE_CLAIM_SKILL_URL}: {e}", file=sys.stderr)
        sys.exit(1)

    for source in sources:
        # Prefer the NewsData-specific domain when one is set; a missing, None
        # or empty value falls back to the source uri.
        domain_url = source.get("domainUrlNewsData") or source["uri"]

        claims = newsdata_io.get_claims(domain_url)
        if not claims:
            continue

        # keep only those articles that can be classified as falsifiable claims
        req_content = (
            "Following is a list of 10 articles published by the same news outlet. Each article is represented by a json string type element in the array"
            f"\n\n{claims}\n\n"
            f"{CLAIM_FILTER_PROMPT}"
        )
        filtered_claims = openrouter.req_w_addons(req_content, skill=falsifiable_claim_skill, tools=[openrouter.WEB_SEARCH_TOOL])
        if not filtered_claims:
            print(f"Error: failed to filter claims for source {source['name']}", file=sys.stderr)
            continue

        try:
            filtered_claims_list = json.loads(helper.cleanup_json_str(filtered_claims))
        except Exception as e:
            print(f"Error: failed to cleanup and unmarshal claims json string {filtered_claims}: {e}", file=sys.stderr)
            continue

        # The model output is untrusted: valid JSON is not necessarily an array
        # of objects, which is what the steps below index into.
        if not isinstance(filtered_claims_list, list):
            print(f"Error: expected claims array for {source['name']}", file=sys.stderr)
            continue
        filtered_claims_list = [c for c in filtered_claims_list if isinstance(c, dict)]

        if not filtered_claims_list:
            print(f"Error: no claims found for {source['name']} in: {filtered_claims}", file=sys.stderr)
            continue

        # list of new claims to be ingested
        new_unique_claims = get_new_claims(all_claim_docs, filtered_claims_list, source["uriDigest"])
        create_claim_docs(new_unique_claims, source["name"])
        # Keep the dedup baseline current so later sources in this run are
        # compared against the claims just created.
        all_claim_docs.extend(new_unique_claims)
|
semmet95 marked this conversation as resolved.
Comment on lines
+159
to
+161
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Deduplication baseline is stale across sources in the same run. `all_claim_docs` is loaded once before the loop over sources and is not updated afterwards, so a claim created for one source is not seen when deduplicating the sources that follow.
Proposed fix:
  new_unique_claims = get_new_claims(all_claim_docs, filtered_claims_list, source["uriDigest"])
  create_claim_docs(new_unique_claims, source["name"])
+ all_claim_docs.extend(new_unique_claims)
🤖 Prompt for AI Agents |
||
|
|
||
# Script entry point: main() returns None on success, which sys.exit treats as
# exit status 0.
if __name__ == "__main__":
    sys.exit(main())
Uh oh!
There was an error while loading. Please reload this page.