From ef2336b5eaecb51daf83f9ded9e018556412dcfc Mon Sep 17 00:00:00 2001 From: Amit Singh Date: Wed, 13 May 2026 19:10:17 +0530 Subject: [PATCH 01/11] feat: adds script to fetch claims for added sources Signed-off-by: Amit Singh --- scripts/newsdata_io.py | 60 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 scripts/newsdata_io.py diff --git a/scripts/newsdata_io.py b/scripts/newsdata_io.py new file mode 100644 index 0000000..06b40ae --- /dev/null +++ b/scripts/newsdata_io.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +import os +import sys +import requests + +# Query parameters constants +CATEGORY = "environment,technology,world" +LANGUAGE = "en" +REMOVE_DUPLICATE = "1" +SIZE = "10" +DATATYPE = "news,research,analysis,pressRelease" + +def get_sources(base_url: str, api_key: str): + endpoint = f"{base_url}/api/v1/sources" + headers = {"X-API-Key": api_key} + response = requests.get(endpoint, headers=headers, timeout=90) + if response.status_code != 200: + print(f"Error: Received status code {response.status_code}") + exit(1) + return response.json() + +def get_claims(base_url: str, api_key: str, src_domain_url: str): + endpoint = f"{base_url}/latest" + params = { + "category": CATEGORY, + "language": LANGUAGE, + "removeduplicate": REMOVE_DUPLICATE, + "size": SIZE, + "datatype": DATATYPE, + "apikey": api_key, + "domainurl": src_domain_url + } + response = requests.get(endpoint, params=params, timeout=10) + if response.status_code != 200: + print(f"Error: Received status code {response.status_code}") + exit(1) + return response.json() + +def main(): + base_url = os.environ["API_BASE_URL"] + api_key = os.environ["API_KEY"] + news_data_base_url = os.environ["NEWSDATA_API_BASE_URL"] + news_data_api_key = os.environ["NEWSDATA_API_KEY"] + + sources = get_sources(base_url, api_key) + src_to_patch = set() + for source in sources: + domain_url = source["uri"] + if source["domainUrlNewsData"] != "": + domain_url = source["domainUrlNewsData"] + else: + src_to_patch.add(source["uriDigest"]) + + claims = get_claims(news_data_base_url, news_data_api_key, domain_url) + print(claims) + break + +if __name__ == "__main__": + sys.exit(main()) \ No newline at end of file From c90651d208a36be24bf33b8aaef85ae5221b3b99 Mon Sep 17 00:00:00 2001 From: Amit Singh Date: Thu, 14 May 2026 22:29:49 +0530 Subject: [PATCH 02/11] feat: updates filtered claims fields to make it compatible with claim model Signed-off-by: Amit Singh --- scripts/newsdata_io.py | 142 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 140 insertions(+), 2 deletions(-) diff --git a/scripts/newsdata_io.py b/scripts/newsdata_io.py index 06b40ae..92febbd 100644 --- a/scripts/newsdata_io.py +++ b/scripts/newsdata_io.py @@ -1,7 +1,11 @@ #!/usr/bin/env python3 +import json import os +import re import sys +from typing import Tuple +from urllib.request import Request, urlopen import requests # Query parameters constants @@ -11,6 +15,18 @@ SIZE = "10" DATATYPE = "news,research,analysis,pressRelease" +FALSIFIABLE_CLAIM_SKILL_URL = "https://raw.githubusercontent.com/semmet95/agent-skills/refs/heads/main/determine-falsifialbe-claim/SKILL.md" +CLAIM_PER_SOURCE = 2 +FREE_MODELS_DOC = [ + "google/gemma-4-31b-it:free", + "nvidia/nemotron-3-nano-omni-30b-a3b-reasoning:free", + "google/gemma-4-26b-a4b-it:free", +] + +def fetch_web_text(url: str) -> str: + with urlopen(url, timeout=60) as r: + return r.read().decode("utf-8") + def get_sources(base_url: str, api_key: str): endpoint = f"{base_url}/api/v1/sources" headers = {"X-API-Key": api_key} @@ -20,6 +36,94 @@ def get_sources(base_url: str, api_key: str): exit(1) return response.json() +def post_openrouter(base_url: str, api_key: str, payload: dict) -> Tuple[int, str]: + data = json.dumps(payload).encode("utf-8") + req = Request( + f"{base_url}/chat/completions", + data=data, + headers={ + "Content-Type": "application/json", + "Authorization": f"Bearer {api_key}", + }, + ) + + status = -1 + try: + with urlopen(req, timeout=60) as r: + status = r.getcode() + body = r.read().decode("utf-8") + except Exception as e: + print(f"Error making request to OpenRouter API: {e}", file=sys.stderr) + body = "" + + return status, body + +def req_openrouter(base_url: str, api_key: str, payload: dict) -> str: + status, body = post_openrouter(base_url, api_key, payload) + + if status == 200: + try: + data = json.loads(body) + except Exception as e: + print(f"Failed to parse JSON response: {e}", file=sys.stderr) + print(body) + sys.exit(1) + + # Extract assistant reply + reply = None + try: + reply = data["choices"][0]["message"]["content"] + return reply + except Exception as e: + print(f"Failed to access key [choices][0][message][content]: {e}", file=sys.stderr) + sys.exit(1) + else: + print(f"Openrouter response status: {status}", file=sys.stderr) + return "" + +def filter_claims(base_url: str, api_key: str, falsifiable_claim_skill: str, claims): + filter_prompt = ( + "Following is a list of 10 articles published by the same news outlet. Each article is represented by a json string type element in the array\n\n" + f"{claims}" + "\n\nUse web search tool to visit the link for each article, access the content and then assess if it is a falsifiable claim." + "\nOut of these 10 articles, only return the 2 article that best fit the falsifiable claim criterion." + "Prefer claims that have been made by the news source directly" + "Keep the json structure of the claims the same as the input. Do not add or remove any field." + "Only output the plain json array string that I can directly load into a Python dict." + "Do not format the string. Do not output anything else." + ) + + filtered_claims = "" + for model in FREE_MODELS_DOC: + payload = { + "model": model, + "messages": [ + {"role": "system", "content": falsifiable_claim_skill}, + { + "role": "user", + "content": filter_prompt + }, + ], + "tools": [ + {"type": "openrouter:web_search"} + ] + } + + filtered_claims = req_openrouter(base_url, api_key, payload) + if filtered_claims != "": + break + + if filtered_claims == "": + print("Error: All models failed.", file=sys.stderr) + sys.exit(1) + + # models often return the json string wrapped in a code block + match = re.search(r'```(?:json)?\s*(.*?)\s*```', filtered_claims, re.DOTALL) + if match: + return json.loads(match.group(1)) + + return json.loads(filtered_claims) + def get_claims(base_url: str, api_key: str, src_domain_url: str): endpoint = f"{base_url}/latest" params = { @@ -35,15 +139,41 @@ def get_claims(base_url: str, api_key: str, src_domain_url: str): if response.status_code != 200: print(f"Error: Received status code {response.status_code}") exit(1) - return response.json() + return response.json()["results"] + + # filtered_claims = [] + # for claim in claims: + # filtered_claims.append({ + # "uri": claim["claim"], + # "claimDate": claim["claimDate"], + # "claimReviewDate": claim["claimReviewDate"], + # "claimReviewRating": claim["claimReviewRating"], + # "claimReviewUrl": claim["claimReviewUrl"], + # "claimReviewPublisher": claim["claimReviewPublisher"], + # }) + +def update_claim_fields(srcDigest: str, claim): + claim["sourceUriDigest"] = srcDigest + claim["summary"] = claim["description"] + claim["uri"] = claim["link"] def main(): base_url = os.environ["API_BASE_URL"] api_key = os.environ["API_KEY"] news_data_base_url = os.environ["NEWSDATA_API_BASE_URL"] news_data_api_key = os.environ["NEWSDATA_API_KEY"] + openrouter_api_key = os.environ["OPENROUTER_API_KEY"] + openrouter_base_url = os.environ["OPENROUTER_API_BASE_URL"] sources = get_sources(base_url, api_key) + + # Fetch falsifiable claim skill + try: + falsifiable_claim_skill = fetch_web_text(FALSIFIABLE_CLAIM_SKILL_URL) + except Exception as e: + print(f"Error: failed to fetch skill from {FALSIFIABLE_CLAIM_SKILL_URL}: {e}", file=sys.stderr) + sys.exit(1) + src_to_patch = set() for source in sources: domain_url = source["uri"] @@ -53,7 +183,15 @@ def main(): src_to_patch.add(source["uriDigest"]) claims = get_claims(news_data_base_url, news_data_api_key, domain_url) - print(claims) + + # keep only those articles that can be classified as falsifiable claims + filtered_claims = filter_claims(openrouter_base_url, openrouter_api_key, falsifiable_claim_skill, claims) + + # keep only relevant fields in the claims + for claim in filtered_claims: + update_claim_fields(source["uriDigest"], claim) + + print(filtered_claims) break if __name__ == "__main__": From ee69dc11705beb18b53c912145894c898b157fa1 Mon Sep 17 00:00:00 2001 From: Amit Singh Date: Thu, 14 May 2026 22:58:58 +0530 Subject: [PATCH 03/11] feat: adds logic to load all the claim docs in the repo Signed-off-by: Amit Singh --- scripts/newsdata_io.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/scripts/newsdata_io.py b/scripts/newsdata_io.py index 92febbd..a0e099e 100644 --- a/scripts/newsdata_io.py +++ b/scripts/newsdata_io.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 +import glob import json import os import re @@ -7,6 +8,7 @@ from typing import Tuple from urllib.request import Request, urlopen import requests +import yaml # Query parameters constants CATEGORY = "environment,technology,world" @@ -157,6 +159,26 @@ def update_claim_fields(srcDigest: str, claim): claim["summary"] = claim["description"] claim["uri"] = claim["link"] +def get_claim_docs(): + claims_dir = os.path.join(os.path.dirname(__file__), "..", "claims") + claims_dir = os.path.abspath(claims_dir) + + # Find all YAML files in claims directory and subdirectories + yaml_files = glob.glob(os.path.join(claims_dir, "**", "*.yaml"), recursive=True) + yaml_files.extend(glob.glob(os.path.join(claims_dir, "**", "*.yml"), recursive=True)) + + claims_array = [] + for yaml_file in yaml_files: + try: + with open(yaml_file, 'r') as f: + claim_data = yaml.safe_load(f) + if claim_data: # Only add if file is not empty + claims_array.append(claim_data) + except Exception as e: + print(f"Error loading YAML file {yaml_file}: {e}", file=sys.stderr) + + return claims_array + def main(): base_url = os.environ["API_BASE_URL"] api_key = os.environ["API_KEY"] @@ -165,6 +187,7 @@ def main(): openrouter_api_key = os.environ["OPENROUTER_API_KEY"] openrouter_base_url = os.environ["OPENROUTER_API_BASE_URL"] + claim_docs = get_claim_docs() sources = get_sources(base_url, api_key) # Fetch falsifiable claim skill From c19d2d34c7f06b87cbe18eb58a764bd9434829ca Mon Sep 17 00:00:00 2001 From: Amit Singh Date: Thu, 14 May 2026 23:52:20 +0530 Subject: [PATCH 04/11] feat: adds logic to create claim yaml docs Signed-off-by: Amit Singh --- scripts/newsdata_io.py | 68 +++++++++++++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 17 deletions(-) diff --git a/scripts/newsdata_io.py b/scripts/newsdata_io.py index a0e099e..39ba77a 100644 --- a/scripts/newsdata_io.py +++ b/scripts/newsdata_io.py @@ -26,7 +26,7 @@ ] def fetch_web_text(url: str) -> str: - with urlopen(url, timeout=60) as r: + with urlopen(url, timeout=10) as r: return r.read().decode("utf-8") def get_sources(base_url: str, api_key: str): @@ -91,7 +91,7 @@ def filter_claims(base_url: str, api_key: str, falsifiable_claim_skill: str, cla "\nOut of these 10 articles, only return the 2 article that best fit the falsifiable claim criterion." "Prefer claims that have been made by the news source directly" "Keep the json structure of the claims the same as the input. Do not add or remove any field." - "Only output the plain json array string that I can directly load into a Python dict." + "Only output the plain json array string that I can safely unmarshal." "Do not format the string. Do not output anything else." ) @@ -115,11 +115,16 @@ def filter_claims(base_url: str, api_key: str, falsifiable_claim_skill: str, cla if filtered_claims != "": break - if filtered_claims == "": + if filtered_claims == None or filtered_claims == "": print("Error: All models failed.", file=sys.stderr) sys.exit(1) - # models often return the json string wrapped in a code block + # models often return the json string wrapped in a code block or with incompatible values + # Replace all Python boolean and None values + filtered_claims = re.sub(r':\s*None\b', ': null', filtered_claims) + filtered_claims = re.sub(r':\s*False\b', ': false', filtered_claims) + filtered_claims = re.sub(r':\s*True\b', ': true', filtered_claims) + match = re.search(r'```(?:json)?\s*(.*?)\s*```', filtered_claims, re.DOTALL) if match: return json.loads(match.group(1)) @@ -143,17 +148,6 @@ def get_claims(base_url: str, api_key: str, src_domain_url: str): exit(1) return response.json()["results"] - # filtered_claims = [] - # for claim in claims: - # filtered_claims.append({ - # "uri": claim["claim"], - # "claimDate": claim["claimDate"], - # "claimReviewDate": claim["claimReviewDate"], - # "claimReviewRating": claim["claimReviewRating"], - # "claimReviewUrl": claim["claimReviewUrl"], - # "claimReviewPublisher": claim["claimReviewPublisher"], - # }) - def update_claim_fields(srcDigest: str, claim): claim["sourceUriDigest"] = srcDigest claim["summary"] = claim["description"] @@ -179,6 +173,42 @@ def get_claim_docs(): return claims_array +def is_claim_new(claim) -> bool: + claim_docs = get_claim_docs() + + for claim_doc in claim_docs: + if claim["uri"] == claim_doc["uri"] or claim["title"] == claim_doc["title"]: + return False + + return True + +def create_claim_docs(claims: list): + with open('oapi.yaml', 'r') as f: + oapi_spec = yaml.safe_load(f) + + claim_input_schema = oapi_spec['components']['schemas']['ClaimInput'] + claim_example = claim_input_schema.get('example') + + for claim in claims: + claim_doc = {} + for key in claim_example.keys(): + claim_doc[key] = claim[key] + + title = claim_doc["title"] + filename = title.replace(" ", "_") + if len(filename) > 30: + filename = filename[:30] + filename = f"{filename}.yaml" + + # Create file path + file_path = os.path.join(claims_dir, filename) + + # Write claim_doc to YAML file + with open(file_path, 'w') as f: + yaml.dump(claim_doc, f, default_flow_style=False) + + print(f"Created claim document: {file_path}") + def main(): base_url = os.environ["API_BASE_URL"] api_key = os.environ["API_KEY"] @@ -187,7 +217,6 @@ def main(): openrouter_api_key = os.environ["OPENROUTER_API_KEY"] openrouter_base_url = os.environ["OPENROUTER_API_BASE_URL"] - claim_docs = get_claim_docs() sources = get_sources(base_url, api_key) # Fetch falsifiable claim skill @@ -210,11 +239,16 @@ def main(): # keep only those articles that can be classified as falsifiable claims filtered_claims = filter_claims(openrouter_base_url, openrouter_api_key, falsifiable_claim_skill, claims) + # list of new claims to be ingested + new_claims = [] # keep only relevant fields in the claims for claim in filtered_claims: update_claim_fields(source["uriDigest"], claim) + if is_claim_new(claim): + new_claims.append(claim) + + create_claim_docs(new_claims) - print(filtered_claims) break if __name__ == "__main__": From bbcb9a222a28329eaef432bc5983212140014c17 Mon Sep 17 00:00:00 2001 From: Amit Singh Date: Fri, 15 May 2026 11:02:41 +0530 Subject: [PATCH 05/11] fix: makes openrouter response process logic more robust Signed-off-by: Amit Singh --- scripts/newsdata_io.py | 57 ++++++++++++++++++++++++++---------------- scripts/openrouter.py | 3 +++ 2 files changed, 39 insertions(+), 21 deletions(-) diff --git a/scripts/newsdata_io.py b/scripts/newsdata_io.py index 39ba77a..04b2e2f 100644 --- a/scripts/newsdata_io.py +++ b/scripts/newsdata_io.py @@ -20,9 +20,13 @@ FALSIFIABLE_CLAIM_SKILL_URL = "https://raw.githubusercontent.com/semmet95/agent-skills/refs/heads/main/determine-falsifialbe-claim/SKILL.md" CLAIM_PER_SOURCE = 2 FREE_MODELS_DOC = [ + "openai/gpt-oss-120b:free", "google/gemma-4-31b-it:free", "nvidia/nemotron-3-nano-omni-30b-a3b-reasoning:free", "google/gemma-4-26b-a4b-it:free", + "nvidia/nemotron-nano-12b-v2-vl:free", + "z-ai/glm-4.5-air:free", + "openrouter/free" ] def fetch_web_text(url: str) -> str: @@ -51,7 +55,7 @@ def post_openrouter(base_url: str, api_key: str, payload: dict) -> Tuple[int, st status = -1 try: - with urlopen(req, timeout=60) as r: + with urlopen(req, timeout=120) as r: status = r.getcode() body = r.read().decode("utf-8") except Exception as e: @@ -69,7 +73,7 @@ def req_openrouter(base_url: str, api_key: str, payload: dict) -> str: except Exception as e: print(f"Failed to parse JSON response: {e}", file=sys.stderr) print(body) - sys.exit(1) + return "" # Extract assistant reply reply = None @@ -78,7 +82,7 @@ def req_openrouter(base_url: str, api_key: str, payload: dict) -> str: return reply except Exception as e: print(f"Failed to access key [choices][0][message][content]: {e}", file=sys.stderr) - sys.exit(1) + return "" else: print(f"Openrouter response status: {status}", file=sys.stderr) return "" @@ -96,6 +100,7 @@ def filter_claims(base_url: str, api_key: str, falsifiable_claim_skill: str, cla ) filtered_claims = "" + filtered_claims_list = [] for model in FREE_MODELS_DOC: payload = { "model": model, @@ -112,24 +117,31 @@ def filter_claims(base_url: str, api_key: str, falsifiable_claim_skill: str, cla } filtered_claims = req_openrouter(base_url, api_key, payload) - if filtered_claims != "": + if filtered_claims != None and filtered_claims != "": + # models often return the json string wrapped in a code block or with incompatible values + filtered_claims = filtered_claims.replace("'", '"') + # Replace all Python boolean and None values + filtered_claims = re.sub(r':\s*None\b', ': null', filtered_claims) + filtered_claims = re.sub(r':\s*False\b', ': false', filtered_claims) + filtered_claims = re.sub(r':\s*True\b', ': true', filtered_claims) + + match = re.search(r'```(?:json)?\s*(.*?)\s*```', filtered_claims, re.DOTALL) + if match: + filtered_claims = match.group(1) + + try: + filtered_claims_list = json.loads(filtered_claims) + except Exception as e: + print(f"Error: failed to unmarshal claims json string {filtered_claims}: {e}", file=sys.stderr) + continue break - if filtered_claims == None or filtered_claims == "": + if filtered_claims_list == None or len(filtered_claims_list) == 0: print("Error: All models failed.", file=sys.stderr) sys.exit(1) - # models often return the json string wrapped in a code block or with incompatible values - # Replace all Python boolean and None values - filtered_claims = re.sub(r':\s*None\b', ': null', filtered_claims) - filtered_claims = re.sub(r':\s*False\b', ': false', filtered_claims) - filtered_claims = re.sub(r':\s*True\b', ': true', filtered_claims) - - match = re.search(r'```(?:json)?\s*(.*?)\s*```', filtered_claims, re.DOTALL) - if match: - return json.loads(match.group(1)) - - return json.loads(filtered_claims) + + return filtered_claims_list def get_claims(base_url: str, api_key: str, src_domain_url: str): endpoint = f"{base_url}/latest" @@ -182,7 +194,7 @@ def is_claim_new(claim) -> bool: return True -def create_claim_docs(claims: list): +def create_claim_docs(claims: list, srcName: str): with open('oapi.yaml', 'r') as f: oapi_spec = yaml.safe_load(f) @@ -194,14 +206,17 @@ def create_claim_docs(claims: list): for key in claim_example.keys(): claim_doc[key] = claim[key] - title = claim_doc["title"] - filename = title.replace(" ", "_") + filename = claim_doc["title"].replace(" ", "_").lower() if len(filename) > 30: filename = filename[:30] filename = f"{filename}.yaml" + + dirname = srcName.replace(" ", "_").lower() + if len(dirname) > 30: + dirname = dirname[:30] # Create file path - file_path = os.path.join(claims_dir, filename) + file_path = os.path.join("claims", dirname, filename) # Write claim_doc to YAML file with open(file_path, 'w') as f: @@ -247,7 +262,7 @@ def main(): if is_claim_new(claim): new_claims.append(claim) - create_claim_docs(new_claims) + create_claim_docs(new_claims, source["name"]) break diff --git a/scripts/openrouter.py b/scripts/openrouter.py index 277c1e9..4d0d464 100644 --- a/scripts/openrouter.py +++ b/scripts/openrouter.py @@ -26,9 +26,12 @@ MD_PROCESSING_SKILL_URL = "https://raw.githubusercontent.com/semmet95/agent-skills/refs/heads/main/md-processing/SKILL.md" SOURCE_QUESTION = "What are the top 10 latest most popular news outlets in the world listed in this document? Only output URLs of these news outlets separated by new lines. Do not output anything else." FREE_MODELS_DOC = [ + "openai/gpt-oss-120b:free", "google/gemma-4-31b-it:free", "nvidia/nemotron-3-nano-omni-30b-a3b-reasoning:free", "google/gemma-4-26b-a4b-it:free", + "nvidia/nemotron-nano-12b-v2-vl:free", + "z-ai/glm-4.5-air:free" ] From e93560b8a3c31251e6f4829270794cb7b0f8efe4 Mon Sep 17 00:00:00 2001 From: Amit Singh Date: Fri, 15 May 2026 11:07:59 +0530 Subject: [PATCH 06/11] chore: adds example for claim input component Signed-off-by: Amit Singh --- oapi.yaml | 5 +++++ scripts/newsdata_io.py | 17 ++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/oapi.yaml b/oapi.yaml index cebe347..f616c2d 100644 --- a/oapi.yaml +++ b/oapi.yaml @@ -613,6 +613,11 @@ components: x-oapi-codegen-extra-tags: binding: required validate: httpsurl + example: + sourceUriDigest: "8649a4126fb4fc9a750f432b729c8477398cf28ca241403b2cd36a6dc841f441" + summary: "The unemployment rate decreased by 2% in Q4 2024" + title: "Unemployment Rate Decrease" + uri: "https://www.nytimes.com/2024/unemployment-report" Claim: type: object diff --git a/scripts/newsdata_io.py b/scripts/newsdata_io.py index 04b2e2f..d7fe858 100644 --- a/scripts/newsdata_io.py +++ b/scripts/newsdata_io.py @@ -194,6 +194,12 @@ def is_claim_new(claim) -> bool: return True +def clean_filepath(path: str, replace: str = "_") -> str: + cleaned = re.sub(r'[^a-zA-Z0-9_-]', replace, path) + cleaned = re.sub(f'{re.escape(replace)}+', replace, cleaned) + cleaned = cleaned.strip(replace) + return cleaned + def create_claim_docs(claims: list, srcName: str): with open('oapi.yaml', 'r') as f: oapi_spec = yaml.safe_load(f) @@ -204,14 +210,14 @@ def create_claim_docs(claims: list, srcName: str): for claim in claims: claim_doc = {} for key in claim_example.keys(): - claim_doc[key] = claim[key] + claim_doc[key] = str(claim[key]) - filename = claim_doc["title"].replace(" ", "_").lower() + filename = clean_filepath(claim_doc["title"].lower()) if len(filename) > 30: filename = filename[:30] filename = f"{filename}.yaml" - dirname = srcName.replace(" ", "_").lower() + dirname = clean_filepath(srcName.lower()) if len(dirname) > 30: dirname = dirname[:30] @@ -219,8 +225,9 @@ def create_claim_docs(claims: list, srcName: str): file_path = os.path.join("claims", dirname, filename) # Write claim_doc to YAML file - with open(file_path, 'w') as f: - yaml.dump(claim_doc, f, default_flow_style=False) + os.makedirs(os.path.dirname(file_path), exist_ok=True) + with open(file_path, 'w', encoding='utf-8') as f: + yaml.dump(claim_doc, f, default_flow_style=False, allow_unicode=True, width=float('inf')) print(f"Created claim document: {file_path}") From 814982d57a153bdfb04e144cd6925ec22db9539f Mon Sep 17 00:00:00 2001 From: Amit Singh Date: Fri, 15 May 2026 12:33:30 +0530 Subject: [PATCH 07/11] fix: formats yaml output file Signed-off-by: Amit Singh --- scripts/newsdata_io.py | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/scripts/newsdata_io.py b/scripts/newsdata_io.py index d7fe858..e6431cb 100644 --- a/scripts/newsdata_io.py +++ b/scripts/newsdata_io.py @@ -5,6 +5,7 @@ import os import re import sys +import time from typing import Tuple from urllib.request import Request, urlopen import requests @@ -20,13 +21,15 @@ FALSIFIABLE_CLAIM_SKILL_URL = "https://raw.githubusercontent.com/semmet95/agent-skills/refs/heads/main/determine-falsifialbe-claim/SKILL.md" CLAIM_PER_SOURCE = 2 FREE_MODELS_DOC = [ - "openai/gpt-oss-120b:free", + "openai/gpt-oss-120b", + "mistralai/mistral-medium-3-5", + "moonshotai/kimi-k2-0905", "google/gemma-4-31b-it:free", + "qwen/qwen3-32b", + "google/gemini-3.1-flash-lite", + "deepseek/deepseek-v4-flash:free", "nvidia/nemotron-3-nano-omni-30b-a3b-reasoning:free", - "google/gemma-4-26b-a4b-it:free", - "nvidia/nemotron-nano-12b-v2-vl:free", - "z-ai/glm-4.5-air:free", - "openrouter/free" + "cohere/command-a" ] def fetch_web_text(url: str) -> str: @@ -101,6 +104,7 @@ def filter_claims(base_url: str, api_key: str, falsifiable_claim_skill: str, cla filtered_claims = "" filtered_claims_list = [] + ctr = 1 for model in FREE_MODELS_DOC: payload = { "model": model, @@ -136,6 +140,10 @@ def filter_claims(base_url: str, api_key: str, falsifiable_claim_skill: str, cla continue break + # delay before sending the request again + time.sleep(30*ctr) + ctr += 1 + if filtered_claims_list == None or len(filtered_claims_list) == 0: print("Error: All models failed.", file=sys.stderr) sys.exit(1) @@ -156,8 +164,8 @@ def get_claims(base_url: str, api_key: str, src_domain_url: str): } response = requests.get(endpoint, params=params, timeout=10) if response.status_code != 200: - print(f"Error: Received status code {response.status_code}") - exit(1) + print(f"Error: couldn't fetch claims for {src_domain_url}: {response.status_code}") + return None return response.json()["results"] def update_claim_fields(srcDigest: str, claim): @@ -207,19 +215,27 @@ def create_claim_docs(claims: list, srcName: str): claim_input_schema = oapi_spec['components']['schemas']['ClaimInput'] claim_example = claim_input_schema.get('example') + # Custom representer to force double quotes around strings + def quoted_str_representer(dumper, data): + return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='"') + + yaml.add_representer(str, quoted_str_representer) + for claim in claims: claim_doc = {} for key in claim_example.keys(): claim_doc[key] = str(claim[key]) - filename = clean_filepath(claim_doc["title"].lower()) + filename = claim_doc["title"].lower() if len(filename) > 30: filename = filename[:30] + filename = clean_filepath(filename) filename = f"{filename}.yaml" - dirname = clean_filepath(srcName.lower()) + dirname = srcName.lower() if len(dirname) > 30: dirname = dirname[:30] + dirname = clean_filepath(dirname) # Create file path file_path = os.path.join("claims", dirname, filename) @@ -257,6 +273,8 @@ def main(): src_to_patch.add(source["uriDigest"]) claims = get_claims(news_data_base_url, news_data_api_key, domain_url) + if claims == None: + continue # keep only those articles that can be classified as falsifiable claims filtered_claims = filter_claims(openrouter_base_url, openrouter_api_key, falsifiable_claim_skill, claims) @@ -270,8 +288,6 @@ def main(): new_claims.append(claim) create_claim_docs(new_claims, source["name"]) - - break if __name__ == "__main__": sys.exit(main()) \ No newline at end of file From acb36029eab66b4f1a17a3aa5b2d653407405969 Mon Sep 17 00:00:00 2001 From: Amit Singh Date: Sun, 17 May 2026 20:09:37 +0530 Subject: [PATCH 08/11] testing with llmrouter Signed-off-by: Amit Singh --- scripts/newsdata_io.py | 45 +++++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/scripts/newsdata_io.py b/scripts/newsdata_io.py index e6431cb..b10708f 100644 --- a/scripts/newsdata_io.py +++ b/scripts/newsdata_io.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import glob +import demjson3 import json import os import re @@ -20,18 +21,26 @@ FALSIFIABLE_CLAIM_SKILL_URL = "https://raw.githubusercontent.com/semmet95/agent-skills/refs/heads/main/determine-falsifialbe-claim/SKILL.md" CLAIM_PER_SOURCE = 2 +# FREE_MODELS_DOC = [ +# "openai/gpt-oss-120b", +# "mistralai/mistral-medium-3-5", +# "moonshotai/kimi-k2-0905", +# "google/gemma-4-31b-it:free", +# "qwen/qwen3-32b", +# "google/gemini-3.1-flash-lite", +# "deepseek/deepseek-v4-flash:free", +# "nvidia/nemotron-3-nano-omni-30b-a3b-reasoning:free", +# "cohere/command-a" +# ] + FREE_MODELS_DOC = [ - "openai/gpt-oss-120b", - "mistralai/mistral-medium-3-5", - "moonshotai/kimi-k2-0905", - "google/gemma-4-31b-it:free", - "qwen/qwen3-32b", - "google/gemini-3.1-flash-lite", - "deepseek/deepseek-v4-flash:free", - "nvidia/nemotron-3-nano-omni-30b-a3b-reasoning:free", - "cohere/command-a" + "embercloud/glm-4.7-flash", + "zai/glm-4.7-flash", + "zai/glm-4.6v-flash", + "zai/glm-4.5-flash" ] + def fetch_web_text(url: str) -> str: with urlopen(url, timeout=10) as r: return r.read().decode("utf-8") @@ -115,14 +124,15 @@ def filter_claims(base_url: str, api_key: str, falsifiable_claim_skill: str, cla "content": filter_prompt }, ], - "tools": [ - {"type": "openrouter:web_search"} - ] + # "tools": [ + # # {"type": "openrouter:web_search"} + # {"type": "web_search"} + # ] } filtered_claims = req_openrouter(base_url, api_key, payload) if filtered_claims != None and filtered_claims != "": - # models often return the json string wrapped in a code block or with incompatible values + # models often return the json string wrapped in a code block or with incompatible values filtered_claims = filtered_claims.replace("'", '"') # Replace all Python boolean and None values filtered_claims = re.sub(r':\s*None\b', ': null', filtered_claims) @@ -132,8 +142,9 @@ def filter_claims(base_url: str, api_key: str, falsifiable_claim_skill: str, cla match = re.search(r'```(?:json)?\s*(.*?)\s*```', filtered_claims, re.DOTALL) if match: filtered_claims = match.group(1) - try: + filtered_claims = filtered_claims.replace("`", "'") + filtered_claims = json.dumps(demjson3.decode(filtered_claims)) filtered_claims_list = json.loads(filtered_claims) except Exception as e: print(f"Error: failed to unmarshal claims json string {filtered_claims}: {e}", file=sys.stderr) @@ -252,8 +263,10 @@ def main(): api_key = os.environ["API_KEY"] news_data_base_url = os.environ["NEWSDATA_API_BASE_URL"] news_data_api_key = os.environ["NEWSDATA_API_KEY"] - openrouter_api_key = os.environ["OPENROUTER_API_KEY"] - openrouter_base_url = os.environ["OPENROUTER_API_BASE_URL"] + # openrouter_api_key = os.environ["OPENROUTER_API_KEY"] + # openrouter_base_url = os.environ["OPENROUTER_API_BASE_URL"] + openrouter_api_key = os.environ["LLM_GATEWAY_API_KEY"] + openrouter_base_url = "https://api.llmgateway.io/v1" sources = get_sources(base_url, api_key) From 112106991d486287e1d953aa62ad397d11bf6fde Mon Sep 17 00:00:00 2001 From: Amit Singh Date: Sun, 17 May 2026 21:40:32 +0530 Subject: [PATCH 09/11] chore: removes google news source Signed-off-by: Amit Singh --- scripts/newsdata_io.py | 38 ++++++++++++++++---------------------- sources/Google-News.yaml | 4 ---- 2 files changed, 16 insertions(+), 26 deletions(-) delete mode 100644 sources/Google-News.yaml diff --git a/scripts/newsdata_io.py b/scripts/newsdata_io.py index b10708f..633e2b2 100644 --- a/scripts/newsdata_io.py +++ b/scripts/newsdata_io.py @@ -22,22 +22,15 @@ FALSIFIABLE_CLAIM_SKILL_URL = "https://raw.githubusercontent.com/semmet95/agent-skills/refs/heads/main/determine-falsifialbe-claim/SKILL.md" CLAIM_PER_SOURCE = 2 # FREE_MODELS_DOC = [ -# "openai/gpt-oss-120b", -# "mistralai/mistral-medium-3-5", -# "moonshotai/kimi-k2-0905", # "google/gemma-4-31b-it:free", -# "qwen/qwen3-32b", -# "google/gemini-3.1-flash-lite", # "deepseek/deepseek-v4-flash:free", # "nvidia/nemotron-3-nano-omni-30b-a3b-reasoning:free", -# "cohere/command-a" +# "cohere/command-a", +# "openrouter/free" # ] FREE_MODELS_DOC = [ - "embercloud/glm-4.7-flash", - "zai/glm-4.7-flash", - "zai/glm-4.6v-flash", - "zai/glm-4.5-flash" + "openrouter/free" ] @@ -124,16 +117,15 @@ def filter_claims(base_url: str, api_key: str, falsifiable_claim_skill: str, cla "content": filter_prompt }, ], - # "tools": [ - # # {"type": "openrouter:web_search"} - # {"type": "web_search"} - # ] + "tools": [ + {"type": "openrouter:web_search"} + ] } filtered_claims = req_openrouter(base_url, api_key, payload) - if filtered_claims != None and filtered_claims != "": + if filtered_claims != None and filtered_claims != "" and filtered_claims != "[]": # models often return the json string wrapped in a code block or with incompatible values - filtered_claims = filtered_claims.replace("'", '"') + filtered_claims = filtered_claims.strip().replace("'", '"') # Replace all Python boolean and None values filtered_claims = re.sub(r':\s*None\b', ': null', filtered_claims) filtered_claims = re.sub(r':\s*False\b', ': false', filtered_claims) @@ -149,7 +141,9 @@ def filter_claims(base_url: str, api_key: str, falsifiable_claim_skill: str, cla except Exception as e: print(f"Error: failed to unmarshal claims json string {filtered_claims}: {e}", file=sys.stderr) continue - break + + if filtered_claims_list != None and len(filtered_claims_list) != 0: + break # delay before sending the request again time.sleep(30*ctr) @@ -263,10 +257,10 @@ def main(): api_key = os.environ["API_KEY"] news_data_base_url = os.environ["NEWSDATA_API_BASE_URL"] news_data_api_key = os.environ["NEWSDATA_API_KEY"] - # openrouter_api_key = os.environ["OPENROUTER_API_KEY"] - # openrouter_base_url = os.environ["OPENROUTER_API_BASE_URL"] - openrouter_api_key = os.environ["LLM_GATEWAY_API_KEY"] - openrouter_base_url = "https://api.llmgateway.io/v1" + openrouter_api_key = os.environ["OPENROUTER_API_KEY"] + openrouter_base_url = os.environ["OPENROUTER_API_BASE_URL"] + # openrouter_api_key = os.environ["LLM_GATEWAY_API_KEY"] + # openrouter_base_url = "https://api.llmgateway.io/v1" sources = get_sources(base_url, api_key) @@ -286,7 +280,7 @@ def main(): src_to_patch.add(source["uriDigest"]) claims = get_claims(news_data_base_url, news_data_api_key, domain_url) - if claims == None: + if claims == None or len(claims) == 0: continue # keep only those articles that can be classified as falsifiable claims diff --git a/sources/Google-News.yaml b/sources/Google-News.yaml deleted file mode 100644 index 8771194..0000000 --- a/sources/Google-News.yaml +++ /dev/null @@ -1,4 +0,0 @@ -name: "Google News" -summary: "Google News is a news aggregator service that organizes and provides news from various publishers worldwide." -tags: "global" -uri: "https://news.google.com" From bd4fad0bdd99d253ef39782271c01b0e1cf1567e Mon Sep 17 00:00:00 2001 From: Amit Singh Date: Sun, 17 May 2026 22:43:49 +0530 Subject: [PATCH 10/11] chore: updates free model list Signed-off-by: Amit Singh --- scripts/newsdata_io.py | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/scripts/newsdata_io.py b/scripts/newsdata_io.py index 633e2b2..0070f47 100644 --- a/scripts/newsdata_io.py +++ b/scripts/newsdata_io.py @@ -21,15 +21,9 @@ FALSIFIABLE_CLAIM_SKILL_URL = "https://raw.githubusercontent.com/semmet95/agent-skills/refs/heads/main/determine-falsifialbe-claim/SKILL.md" CLAIM_PER_SOURCE = 2 -# FREE_MODELS_DOC = [ -# "google/gemma-4-31b-it:free", -# "deepseek/deepseek-v4-flash:free", -# "nvidia/nemotron-3-nano-omni-30b-a3b-reasoning:free", -# "cohere/command-a", -# "openrouter/free" -# ] - FREE_MODELS_DOC = [ + "google/gemma-4-31b-it:free", + "nvidia/nemotron-3-nano-omni-30b-a3b-reasoning:free", "openrouter/free" ] @@ -60,7 +54,7 @@ def post_openrouter(base_url: str, api_key: str, payload: dict) -> Tuple[int, st status = -1 try: - with urlopen(req, timeout=120) as r: + with urlopen(req, timeout=180) as r: status = r.getcode() body = r.read().decode("utf-8") except Exception as e: @@ -97,9 +91,9 @@ def filter_claims(base_url: str, api_key: str, falsifiable_claim_skill: str, cla "Following is a list of 10 articles published by the same news outlet. Each article is represented by a json string type element in the array\n\n" f"{claims}" "\n\nUse web search tool to visit the link for each article, access the content and then assess if it is a falsifiable claim." - "\nOut of these 10 articles, only return the 2 article that best fit the falsifiable claim criterion." + "\nOut of these 10 articles, only return 1 article that best fits the falsifiable claim criterion." "Prefer claims that have been made by the news source directly" - "Keep the json structure of the claims the same as the input. Do not add or remove any field." + "Keep the json structure of the claims the same as the original schema in the input. Do not add remove, or modify any key or value in the json string." "Only output the plain json array string that I can safely unmarshal." "Do not format the string. Do not output anything else." ) @@ -124,18 +118,18 @@ def filter_claims(base_url: str, api_key: str, falsifiable_claim_skill: str, cla filtered_claims = req_openrouter(base_url, api_key, payload) if filtered_claims != None and filtered_claims != "" and filtered_claims != "[]": - # models often return the json string wrapped in a code block or with incompatible values - filtered_claims = filtered_claims.strip().replace("'", '"') # Replace all Python boolean and None values filtered_claims = re.sub(r':\s*None\b', ': null', filtered_claims) filtered_claims = re.sub(r':\s*False\b', ': false', filtered_claims) filtered_claims = re.sub(r':\s*True\b', ': true', filtered_claims) + # models often return the json string wrapped in a code block or with incompatible values match = re.search(r'```(?:json)?\s*(.*?)\s*```', filtered_claims, re.DOTALL) if match: filtered_claims = match.group(1) try: filtered_claims = filtered_claims.replace("`", "'") + filtered_claims = filtered_claims.strip().replace("'", "\'") filtered_claims = json.dumps(demjson3.decode(filtered_claims)) filtered_claims_list = json.loads(filtered_claims) except Exception as e: @@ -259,8 +253,6 @@ def main(): news_data_api_key = os.environ["NEWSDATA_API_KEY"] openrouter_api_key = os.environ["OPENROUTER_API_KEY"] openrouter_base_url = os.environ["OPENROUTER_API_BASE_URL"] - # openrouter_api_key = os.environ["LLM_GATEWAY_API_KEY"] - # openrouter_base_url = "https://api.llmgateway.io/v1" sources = get_sources(base_url, api_key) @@ -283,6 +275,8 @@ def main(): if claims == None or len(claims) == 0: continue + # add gaps between openrouter api requests + time.sleep(30) # keep only those articles that can be classified as falsifiable claims filtered_claims = filter_claims(openrouter_base_url, openrouter_api_key, falsifiable_claim_skill, claims) From 42b8328202b4a6b2db9106cedba1eec20a6147d2 Mon Sep 17 00:00:00 2001 From: Amit Singh Date: Mon, 18 May 2026 09:17:20 +0530 Subject: [PATCH 11/11] chore: adds new claims for sources Signed-off-by: Amit Singh --- claims/{al-jazeera => al_jazeera}/eu-sa-trade-deal.yaml | 0 claims/{al-jazeera => al_jazeera}/us-tarrifs.yaml | 0 claims/al_jazeera/zimbabwe_s_diaspora_reshapes_r.yaml | 4 ++++ claims/bbc/russia_s_shadow_fleet_ships_de.yaml | 4 ++++ claims/{ht => hindustan_times}/china-zero-tarrif.yaml | 0 claims/{ht => hindustan_times}/india-gdp.yaml | 0 claims/hindustan_times/yogi_reduces_convoy_size_in_go.yaml | 4 ++++ claims/india_times/rrb_alp_recruitment_2026_regi.yaml | 4 ++++ claims/ndtv/ebola_outbreak_in_congo_kills.yaml | 4 ++++ .../china-dutch-sanctions.yaml | 0 .../congo_ebola_outbreak_constant.yaml | 4 ++++ .../moore-threads-share.yaml | 0 claims/the_guardian/timmy_the_whale_confirmed_dead.yaml | 4 ++++ claims/{nyt => the_new_york_times}/ai-backlash.yaml | 0 claims/the_new_york_times/f_licien_kabuga_dies_an_accus.yaml | 4 ++++ claims/{nyt => the_new_york_times}/pentagon-google-ai.yaml | 0 claims/yahoo_finance/pet_valu_q1_earnings_call_high.yaml | 4 ++++ requirements.txt | 4 +++- 18 files changed, 39 insertions(+), 1 deletion(-) rename claims/{al-jazeera => al_jazeera}/eu-sa-trade-deal.yaml (100%) rename claims/{al-jazeera => al_jazeera}/us-tarrifs.yaml (100%) create mode 100644 claims/al_jazeera/zimbabwe_s_diaspora_reshapes_r.yaml create mode 100644 claims/bbc/russia_s_shadow_fleet_ships_de.yaml rename claims/{ht => hindustan_times}/china-zero-tarrif.yaml (100%) rename claims/{ht => hindustan_times}/india-gdp.yaml (100%) create mode 100644 claims/hindustan_times/yogi_reduces_convoy_size_in_go.yaml create mode 100644 claims/india_times/rrb_alp_recruitment_2026_regi.yaml create mode 100644 claims/ndtv/ebola_outbreak_in_congo_kills.yaml rename claims/{scmp => south_china_morning_post}/china-dutch-sanctions.yaml (100%) create mode 100644 claims/south_china_morning_post/congo_ebola_outbreak_constant.yaml rename claims/{scmp => south_china_morning_post}/moore-threads-share.yaml (100%) create mode 100644 claims/the_guardian/timmy_the_whale_confirmed_dead.yaml rename claims/{nyt => the_new_york_times}/ai-backlash.yaml (100%) create mode 100644 claims/the_new_york_times/f_licien_kabuga_dies_an_accus.yaml rename claims/{nyt => the_new_york_times}/pentagon-google-ai.yaml (100%) create mode 100644 claims/yahoo_finance/pet_valu_q1_earnings_call_high.yaml diff --git a/claims/al-jazeera/eu-sa-trade-deal.yaml b/claims/al_jazeera/eu-sa-trade-deal.yaml similarity index 100% rename from claims/al-jazeera/eu-sa-trade-deal.yaml rename to claims/al_jazeera/eu-sa-trade-deal.yaml diff --git a/claims/al-jazeera/us-tarrifs.yaml b/claims/al_jazeera/us-tarrifs.yaml similarity index 100% rename from claims/al-jazeera/us-tarrifs.yaml rename to claims/al_jazeera/us-tarrifs.yaml diff --git a/claims/al_jazeera/zimbabwe_s_diaspora_reshapes_r.yaml b/claims/al_jazeera/zimbabwe_s_diaspora_reshapes_r.yaml new file mode 100644 index 0000000..72ce1e5 --- /dev/null +++ b/claims/al_jazeera/zimbabwe_s_diaspora_reshapes_r.yaml @@ -0,0 +1,4 @@ +"sourceUriDigest": "d77fe702137b27c8c8f04538dde3a2a48a1c9849d5e116e58aec80b229f4632f" +"summary": "Zimbabwean digital influencers appear to influence diaspora investment decisions through social media." +"title": "Zimbabwe’s diaspora reshapes real estate and farming investment trends" +"uri": "https://www.aljazeera.com/features/2026/5/16/zimbabwes-diaspora-reshapes-real-estate-and-farming-investment-trends" diff --git a/claims/bbc/russia_s_shadow_fleet_ships_de.yaml b/claims/bbc/russia_s_shadow_fleet_ships_de.yaml new file mode 100644 index 0000000..f7c10fc --- /dev/null +++ b/claims/bbc/russia_s_shadow_fleet_ships_de.yaml @@ -0,0 +1,4 @@ +"sourceUriDigest": "ff03f7306b36e74d5dad4a7b18f821862b90afd53cc6f01dc7fd3b07796b1404" +"summary": "Almost 200 so-called Russian \"shadow fleet\" vessels have entered UK waters since the prime minister threatened to intercept them nearly seven weeks ago, BBC Verify analysis suggests." +"title": "Russia's shadow fleet ships defying PM's threat and entering UK waters" +"uri": "https://www.bbc.com/news/articles/cn8pvgw802no" diff --git a/claims/ht/china-zero-tarrif.yaml b/claims/hindustan_times/china-zero-tarrif.yaml similarity index 100% rename from claims/ht/china-zero-tarrif.yaml rename to claims/hindustan_times/china-zero-tarrif.yaml diff --git a/claims/ht/india-gdp.yaml b/claims/hindustan_times/india-gdp.yaml similarity index 100% rename from claims/ht/india-gdp.yaml rename to claims/hindustan_times/india-gdp.yaml diff --git a/claims/hindustan_times/yogi_reduces_convoy_size_in_go.yaml b/claims/hindustan_times/yogi_reduces_convoy_size_in_go.yaml new file mode 100644 index 0000000..a9ddfdc --- /dev/null +++ b/claims/hindustan_times/yogi_reduces_convoy_size_in_go.yaml @@ -0,0 +1,4 @@ +"sourceUriDigest": "16e7233814ff26725174c87e80e6da033de4feb7ea3fa5a904ef75dbc17541f3" +"summary": "Earlier in the week, he had asked ministers to show exemplary public conduct, reduce vehicles in their fleets by 50%, and develop a new work culture based on austerity and energy conservation in Uttar Pradesh." +"title": "Yogi reduces convoy size in Gorakhpur amid austerity drive" +"uri": "https://www.hindustantimes.com/cities/lucknow-news/yogi-reduces-convoy-size-in-gorakhpur-amid-austerity-drive-101778959831426.html" diff --git a/claims/india_times/rrb_alp_recruitment_2026_regi.yaml b/claims/india_times/rrb_alp_recruitment_2026_regi.yaml new file mode 100644 index 0000000..b914efa --- /dev/null +++ b/claims/india_times/rrb_alp_recruitment_2026_regi.yaml @@ -0,0 +1,4 @@ +"sourceUriDigest": "6472f88917d973b804e99cf32fcb87269d2557252739bc7bc137d8aa2da7b581" +"summary": "Indian Railways is recruiting 11,127 Assistant Loco Pilots. Applications are open from May 15 to June 14, 2026. Candidates need an ITI certificate or engineering diploma. The selection involves computer-based tests, an aptitude test, document verification, and a medical exam. Selected candidates will earn ₹19,900 basic pay. This is a significant opportunity for aspiring railway professionals." +"title": "RRB ALP recruitment 2026: Registration underway for 11,127 posts; check direct link to apply here" +"uri": "https://timesofindia.indiatimes.com/education/news/rrb-alp-recruitment-2026-registration-underway-for-11127-posts-check-direct-link-to-apply-here/articleshow/131149032.cms" diff --git a/claims/ndtv/ebola_outbreak_in_congo_kills.yaml b/claims/ndtv/ebola_outbreak_in_congo_kills.yaml new file mode 100644 index 0000000..de6283c --- /dev/null +++ b/claims/ndtv/ebola_outbreak_in_congo_kills.yaml @@ -0,0 +1,4 @@ +"sourceUriDigest": "bc65553a778a1c832967d1ff1fb318597ff5d87f5f935f6e76820cc514fb287f" +"summary": "A new Ebola outbreak in the Democratic Republic of Congo that has caused scores of deaths has a \"very high lethality rate\" and no vaccine nor specific treatment..." +"title": "Ebola Outbreak In Congo Kills 80; Health Minister Warns Strain Has No Vaccine" +"uri": "https://www.ndtv.com/health/ebola-outbreak-in-congo-kills-80-health-minister-warns-strain-has-no-vaccine-11505320" diff --git a/claims/scmp/china-dutch-sanctions.yaml b/claims/south_china_morning_post/china-dutch-sanctions.yaml similarity index 100% rename from claims/scmp/china-dutch-sanctions.yaml rename to claims/south_china_morning_post/china-dutch-sanctions.yaml diff --git a/claims/south_china_morning_post/congo_ebola_outbreak_constant.yaml b/claims/south_china_morning_post/congo_ebola_outbreak_constant.yaml new file mode 100644 index 0000000..74e9f93 --- /dev/null +++ b/claims/south_china_morning_post/congo_ebola_outbreak_constant.yaml @@ -0,0 +1,4 @@ +"sourceUriDigest": "3295a82ed433fc6cfd4595d65a877b3d3bf55b313f467f2b584658e0df6ec3ee" +"summary": "At least 80 deaths have been reported in Congo’s new Ebola disease outbreak in the eastern Ituri province, authorities said, as health workers raced to intensify screening and contact tracing to contain the disease on Saturday. Officials first announced the outbreak on Friday, with 65 deaths and 246 suspected cases. In Ituri’s capital, Bunia, locals recounted their fears amid constant burials. “Every day, people are dying ... and this has been going on for about a week. In a single day, we bury..." +"title": "Congo Ebola outbreak: constant burials as death toll reaches 80" +"uri": "https://www.scmp.com/news/world/africa/article/3353823/congo-ebola-outbreak-constant-burials-death-toll-reaches-80" diff --git a/claims/scmp/moore-threads-share.yaml b/claims/south_china_morning_post/moore-threads-share.yaml similarity index 100% rename from claims/scmp/moore-threads-share.yaml rename to claims/south_china_morning_post/moore-threads-share.yaml diff --git a/claims/the_guardian/timmy_the_whale_confirmed_dead.yaml b/claims/the_guardian/timmy_the_whale_confirmed_dead.yaml new file mode 100644 index 0000000..e9ac352 --- /dev/null +++ b/claims/the_guardian/timmy_the_whale_confirmed_dead.yaml @@ -0,0 +1,4 @@ +"sourceUriDigest": "619af5b3da55bc1a969dade318bdb7cfa47011fe471235e97e162388450c01a2" +"summary": "Humpback had been found deceased on Friday after rescue attempt criticised as ‘pure animal cruelty’ Timmy the whale has been confirmed dead by Danish authorities two weeks after the beached humpback was transported to the North Sea in a rescue attempt criticised as “pure animal cruelty”. Denmark’s Environmental Protection Agency said a whale had been found dead on Friday near ​the small ⁠island of Anholt in the Kattegat, a broad strait between Denmark and Sweden, and confirmed it was Timmy on Saturday. Continue reading..." +"title": "Timmy the whale confirmed dead by Danish authorities" +"uri": "https://www.theguardian.com/environment/2026/may/16/timmy-the-whale-confirmed-dead-by-danish-authorities" diff --git a/claims/nyt/ai-backlash.yaml b/claims/the_new_york_times/ai-backlash.yaml similarity index 100% rename from claims/nyt/ai-backlash.yaml rename to claims/the_new_york_times/ai-backlash.yaml diff --git a/claims/the_new_york_times/f_licien_kabuga_dies_an_accus.yaml b/claims/the_new_york_times/f_licien_kabuga_dies_an_accus.yaml new file mode 100644 index 0000000..8a83c34 --- /dev/null +++ b/claims/the_new_york_times/f_licien_kabuga_dies_an_accus.yaml @@ -0,0 +1,4 @@ +"sourceUriDigest": "f217d823160d39ca379a09044e1242ac641c8b1d79a1d6f499c59f08a1b67500" +"summary": "One of his country’s richest businessmen, he fled arrest and escaped prosecution for allegedly financing and directing the bloodletting that cost 800,000 lives in 1994." +"title": "Félicien Kabuga Dies; an Accused Mastermind of Rwanda’s Genocide" +"uri": "https://www.nytimes.com/2026/05/16/world/africa/felicien-kabuga-dead.html" diff --git a/claims/nyt/pentagon-google-ai.yaml b/claims/the_new_york_times/pentagon-google-ai.yaml similarity index 100% rename from claims/nyt/pentagon-google-ai.yaml rename to claims/the_new_york_times/pentagon-google-ai.yaml diff --git a/claims/yahoo_finance/pet_valu_q1_earnings_call_high.yaml b/claims/yahoo_finance/pet_valu_q1_earnings_call_high.yaml new file mode 100644 index 0000000..b52a3dd --- /dev/null +++ b/claims/yahoo_finance/pet_valu_q1_earnings_call_high.yaml @@ -0,0 +1,4 @@ +"sourceUriDigest": "844adf50f74b30951a08a382fffc9f1a904acd2d62e062db41d2647c72f904a4" +"summary": "Pet Valu (TSE:PET) reported higher first-quarter 2026 revenue but lower profitability as Canadian pet owners leaned more heavily into promotions and value..." +"title": "Pet Valu Q1 Earnings Call Highlights" +"uri": "https://finance.yahoo.com/markets/stocks/articles/pet-valu-q1-earnings-call-160610682.html" diff --git a/requirements.txt b/requirements.txt index 4261feb..9131a18 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,6 @@ pyyaml jsonschema referencing -firecrawl-py \ No newline at end of file +firecrawl-py +requests +demjson3 \ No newline at end of file