diff --git a/claims/al-jazeera/eu-sa-trade-deal.yaml b/claims/al_jazeera/eu-sa-trade-deal.yaml similarity index 100% rename from claims/al-jazeera/eu-sa-trade-deal.yaml rename to claims/al_jazeera/eu-sa-trade-deal.yaml diff --git a/claims/al-jazeera/us-tarrifs.yaml b/claims/al_jazeera/us-tarrifs.yaml similarity index 100% rename from claims/al-jazeera/us-tarrifs.yaml rename to claims/al_jazeera/us-tarrifs.yaml diff --git a/claims/al_jazeera/zimbabwe_s_diaspora_reshapes_r.yaml b/claims/al_jazeera/zimbabwe_s_diaspora_reshapes_r.yaml new file mode 100644 index 0000000..72ce1e5 --- /dev/null +++ b/claims/al_jazeera/zimbabwe_s_diaspora_reshapes_r.yaml @@ -0,0 +1,4 @@ +"sourceUriDigest": "d77fe702137b27c8c8f04538dde3a2a48a1c9849d5e116e58aec80b229f4632f" +"summary": "Zimbabwean digital influencers appear to influence diaspora investment decisions through social media." +"title": "Zimbabwe’s diaspora reshapes real estate and farming investment trends" +"uri": "https://www.aljazeera.com/features/2026/5/16/zimbabwes-diaspora-reshapes-real-estate-and-farming-investment-trends" diff --git a/claims/bbc/russia_s_shadow_fleet_ships_de.yaml b/claims/bbc/russia_s_shadow_fleet_ships_de.yaml new file mode 100644 index 0000000..f7c10fc --- /dev/null +++ b/claims/bbc/russia_s_shadow_fleet_ships_de.yaml @@ -0,0 +1,4 @@ +"sourceUriDigest": "ff03f7306b36e74d5dad4a7b18f821862b90afd53cc6f01dc7fd3b07796b1404" +"summary": "Almost 200 so-called Russian \"shadow fleet\" vessels have entered UK waters since the prime minister threatened to intercept them nearly seven weeks ago, BBC Verify analysis suggests." 
+"title": "Russia's shadow fleet ships defying PM's threat and entering UK waters" +"uri": "https://www.bbc.com/news/articles/cn8pvgw802no" diff --git a/claims/ht/china-zero-tarrif.yaml b/claims/hindustan_times/china-zero-tarrif.yaml similarity index 100% rename from claims/ht/china-zero-tarrif.yaml rename to claims/hindustan_times/china-zero-tarrif.yaml diff --git a/claims/ht/india-gdp.yaml b/claims/hindustan_times/india-gdp.yaml similarity index 100% rename from claims/ht/india-gdp.yaml rename to claims/hindustan_times/india-gdp.yaml diff --git a/claims/hindustan_times/yogi_reduces_convoy_size_in_go.yaml b/claims/hindustan_times/yogi_reduces_convoy_size_in_go.yaml new file mode 100644 index 0000000..a9ddfdc --- /dev/null +++ b/claims/hindustan_times/yogi_reduces_convoy_size_in_go.yaml @@ -0,0 +1,4 @@ +"sourceUriDigest": "16e7233814ff26725174c87e80e6da033de4feb7ea3fa5a904ef75dbc17541f3" +"summary": "Earlier in the week, he had asked ministers to show exemplary public conduct, reduce vehicles in their fleets by 50%, and develop a new work culture based on austerity and energy conservation in Uttar Pradesh." +"title": "Yogi reduces convoy size in Gorakhpur amid austerity drive" +"uri": "https://www.hindustantimes.com/cities/lucknow-news/yogi-reduces-convoy-size-in-gorakhpur-amid-austerity-drive-101778959831426.html" diff --git a/claims/india_times/rrb_alp_recruitment_2026_regi.yaml b/claims/india_times/rrb_alp_recruitment_2026_regi.yaml new file mode 100644 index 0000000..b914efa --- /dev/null +++ b/claims/india_times/rrb_alp_recruitment_2026_regi.yaml @@ -0,0 +1,4 @@ +"sourceUriDigest": "6472f88917d973b804e99cf32fcb87269d2557252739bc7bc137d8aa2da7b581" +"summary": "Indian Railways is recruiting 11,127 Assistant Loco Pilots. Applications are open from May 15 to June 14, 2026. Candidates need an ITI certificate or engineering diploma. The selection involves computer-based tests, an aptitude test, document verification, and a medical exam. 
Selected candidates will earn ₹19,900 basic pay. This is a significant opportunity for aspiring railway professionals." +"title": "RRB ALP recruitment 2026: Registration underway for 11,127 posts; check direct link to apply here" +"uri": "https://timesofindia.indiatimes.com/education/news/rrb-alp-recruitment-2026-registration-underway-for-11127-posts-check-direct-link-to-apply-here/articleshow/131149032.cms" diff --git a/claims/ndtv/ebola_outbreak_in_congo_kills.yaml b/claims/ndtv/ebola_outbreak_in_congo_kills.yaml new file mode 100644 index 0000000..de6283c --- /dev/null +++ b/claims/ndtv/ebola_outbreak_in_congo_kills.yaml @@ -0,0 +1,4 @@ +"sourceUriDigest": "bc65553a778a1c832967d1ff1fb318597ff5d87f5f935f6e76820cc514fb287f" +"summary": "A new Ebola outbreak in the Democratic Republic of Congo that has caused scores of deaths has a \"very high lethality rate\" and no vaccine nor specific treatment..." +"title": "Ebola Outbreak In Congo Kills 80; Health Minister Warns Strain Has No Vaccine" +"uri": "https://www.ndtv.com/health/ebola-outbreak-in-congo-kills-80-health-minister-warns-strain-has-no-vaccine-11505320" diff --git a/claims/scmp/china-dutch-sanctions.yaml b/claims/south_china_morning_post/china-dutch-sanctions.yaml similarity index 100% rename from claims/scmp/china-dutch-sanctions.yaml rename to claims/south_china_morning_post/china-dutch-sanctions.yaml diff --git a/claims/south_china_morning_post/congo_ebola_outbreak_constant.yaml b/claims/south_china_morning_post/congo_ebola_outbreak_constant.yaml new file mode 100644 index 0000000..74e9f93 --- /dev/null +++ b/claims/south_china_morning_post/congo_ebola_outbreak_constant.yaml @@ -0,0 +1,4 @@ +"sourceUriDigest": "3295a82ed433fc6cfd4595d65a877b3d3bf55b313f467f2b584658e0df6ec3ee" +"summary": "At least 80 deaths have been reported in Congo’s new Ebola disease outbreak in the eastern Ituri province, authorities said, as health workers raced to intensify screening and contact tracing to contain the disease on 
Saturday. Officials first announced the outbreak on Friday, with 65 deaths and 246 suspected cases. In Ituri’s capital, Bunia, locals recounted their fears amid constant burials. “Every day, people are dying ... and this has been going on for about a week. In a single day, we bury..." +"title": "Congo Ebola outbreak: constant burials as death toll reaches 80" +"uri": "https://www.scmp.com/news/world/africa/article/3353823/congo-ebola-outbreak-constant-burials-death-toll-reaches-80" diff --git a/claims/scmp/moore-threads-share.yaml b/claims/south_china_morning_post/moore-threads-share.yaml similarity index 100% rename from claims/scmp/moore-threads-share.yaml rename to claims/south_china_morning_post/moore-threads-share.yaml diff --git a/claims/the_guardian/timmy_the_whale_confirmed_dead.yaml b/claims/the_guardian/timmy_the_whale_confirmed_dead.yaml new file mode 100644 index 0000000..e9ac352 --- /dev/null +++ b/claims/the_guardian/timmy_the_whale_confirmed_dead.yaml @@ -0,0 +1,4 @@ +"sourceUriDigest": "619af5b3da55bc1a969dade318bdb7cfa47011fe471235e97e162388450c01a2" +"summary": "Humpback had been found deceased on Friday after rescue attempt criticised as ‘pure animal cruelty’ Timmy the whale has been confirmed dead by Danish authorities two weeks after the beached humpback was transported to the North Sea in a rescue attempt criticised as “pure animal cruelty”. Denmark’s Environmental Protection Agency said a whale had been found dead on Friday near ​the small ⁠island of Anholt in the Kattegat, a broad strait between Denmark and Sweden, and confirmed it was Timmy on Saturday. Continue reading..." 
+"title": "Timmy the whale confirmed dead by Danish authorities" +"uri": "https://www.theguardian.com/environment/2026/may/16/timmy-the-whale-confirmed-dead-by-danish-authorities" diff --git a/claims/nyt/ai-backlash.yaml b/claims/the_new_york_times/ai-backlash.yaml similarity index 100% rename from claims/nyt/ai-backlash.yaml rename to claims/the_new_york_times/ai-backlash.yaml diff --git a/claims/the_new_york_times/f_licien_kabuga_dies_an_accus.yaml b/claims/the_new_york_times/f_licien_kabuga_dies_an_accus.yaml new file mode 100644 index 0000000..8a83c34 --- /dev/null +++ b/claims/the_new_york_times/f_licien_kabuga_dies_an_accus.yaml @@ -0,0 +1,4 @@ +"sourceUriDigest": "f217d823160d39ca379a09044e1242ac641c8b1d79a1d6f499c59f08a1b67500" +"summary": "One of his country’s richest businessmen, he fled arrest and escaped prosecution for allegedly financing and directing the bloodletting that cost 800,000 lives in 1994." +"title": "Félicien Kabuga Dies; an Accused Mastermind of Rwanda’s Genocide" +"uri": "https://www.nytimes.com/2026/05/16/world/africa/felicien-kabuga-dead.html" diff --git a/claims/nyt/pentagon-google-ai.yaml b/claims/the_new_york_times/pentagon-google-ai.yaml similarity index 100% rename from claims/nyt/pentagon-google-ai.yaml rename to claims/the_new_york_times/pentagon-google-ai.yaml diff --git a/claims/yahoo_finance/pet_valu_q1_earnings_call_high.yaml b/claims/yahoo_finance/pet_valu_q1_earnings_call_high.yaml new file mode 100644 index 0000000..b52a3dd --- /dev/null +++ b/claims/yahoo_finance/pet_valu_q1_earnings_call_high.yaml @@ -0,0 +1,4 @@ +"sourceUriDigest": "844adf50f74b30951a08a382fffc9f1a904acd2d62e062db41d2647c72f904a4" +"summary": "Pet Valu (TSE:PET) reported higher first-quarter 2026 revenue but lower profitability as Canadian pet owners leaned more heavily into promotions and value..." 
+"title": "Pet Valu Q1 Earnings Call Highlights" +"uri": "https://finance.yahoo.com/markets/stocks/articles/pet-valu-q1-earnings-call-160610682.html" diff --git a/oapi.yaml b/oapi.yaml index cebe347..f616c2d 100644 --- a/oapi.yaml +++ b/oapi.yaml @@ -613,6 +613,11 @@ components: x-oapi-codegen-extra-tags: binding: required validate: httpsurl + example: + sourceUriDigest: "8649a4126fb4fc9a750f432b729c8477398cf28ca241403b2cd36a6dc841f441" + summary: "The unemployment rate decreased by 2% in Q4 2024" + title: "Unemployment Rate Decrease" + uri: "https://www.nytimes.com/2024/unemployment-report" Claim: type: object diff --git a/requirements.txt b/requirements.txt index 4261feb..9131a18 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,6 @@ pyyaml jsonschema referencing -firecrawl-py \ No newline at end of file +firecrawl-py +requests +demjson3 \ No newline at end of file
diff --git a/scripts/newsdata_io.py b/scripts/newsdata_io.py
new file mode 100644
index 0000000..0070f47
--- /dev/null
+++ b/scripts/newsdata_io.py
@@ -0,0 +1,391 @@
+#!/usr/bin/env python3
+"""Ingest falsifiable claims published by the registered news sources.
+
+For every source returned by the sources API, the latest articles are fetched
+from the news data API, an OpenRouter model selects the article that best
+qualifies as a falsifiable claim, and every selected claim that is not yet
+stored under ``claims/`` is written there as a YAML document.
+"""
+
+import glob
+import json
+import os
+import re
+import sys
+import time
+from typing import Tuple
+from urllib.error import HTTPError
+from urllib.request import Request, urlopen
+
+import demjson3
+import requests
+import yaml
+
+# Query parameters constants
+CATEGORY = "environment,technology,world"
+LANGUAGE = "en"
+REMOVE_DUPLICATE = "1"
+SIZE = "10"
+DATATYPE = "news,research,analysis,pressRelease"
+
+FALSIFIABLE_CLAIM_SKILL_URL = "https://raw.githubusercontent.com/semmet95/agent-skills/refs/heads/main/determine-falsifialbe-claim/SKILL.md"
+# NOTE(review): not referenced anywhere in this script; the prompt below asks for 1 article.
+CLAIM_PER_SOURCE = 2
+FREE_MODELS_DOC = [
+    "google/gemma-4-31b-it:free",
+    "nvidia/nemotron-3-nano-omni-30b-a3b-reasoning:free",
+    "openrouter/free"
+]
+
+# Repository root. Repo-relative paths are resolved against it so reading and
+# writing claim documents does not depend on the current working directory.
+REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+CLAIMS_DIR = os.path.join(REPO_ROOT, "claims")
+# Longest file or directory stem generated for a claim document.
+MAX_NAME_LENGTH = 30
+
+
+def fetch_web_text(url: str) -> str:
+    """Return the body of *url* decoded as UTF-8 (10 second timeout)."""
+    with urlopen(url, timeout=10) as r:
+        return r.read().decode("utf-8")
+
+
+def get_sources(base_url: str, api_key: str):
+    """Return the decoded JSON body of the sources endpoint.
+
+    Exits with status 1 when the endpoint does not answer with HTTP 200.
+    """
+    endpoint = f"{base_url}/api/v1/sources"
+    headers = {"X-API-Key": api_key}
+    response = requests.get(endpoint, headers=headers, timeout=90)
+    if response.status_code != 200:
+        print(f"Error: Received status code {response.status_code}", file=sys.stderr)
+        sys.exit(1)
+    return response.json()
+
+
+def post_openrouter(base_url: str, api_key: str, payload: dict) -> Tuple[int, str]:
+    """POST *payload* to the chat completions endpoint.
+
+    Returns ``(status, body)``. ``status`` is the HTTP status code, or ``-1``
+    when no HTTP response was obtained; ``body`` is ``""`` on any failure.
+    """
+    data = json.dumps(payload).encode("utf-8")
+    req = Request(
+        f"{base_url}/chat/completions",
+        data=data,
+        headers={
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {api_key}",
+        },
+    )
+
+    try:
+        with urlopen(req, timeout=180) as r:
+            return r.getcode(), r.read().decode("utf-8")
+    except HTTPError as e:
+        # urlopen raises on 4xx/5xx; keep the real code so it shows up in the logs
+        print(f"Error making request to OpenRouter API: {e}", file=sys.stderr)
+        return e.code, ""
+    except Exception as e:
+        print(f"Error making request to OpenRouter API: {e}", file=sys.stderr)
+        return -1, ""
+
+
+def req_openrouter(base_url: str, api_key: str, payload: dict) -> str:
+    """Return the assistant reply for *payload*, or ``""`` on any failure."""
+    status, body = post_openrouter(base_url, api_key, payload)
+    if status != 200:
+        print(f"Openrouter response status: {status}", file=sys.stderr)
+        return ""
+
+    try:
+        data = json.loads(body)
+    except ValueError as e:
+        print(f"Failed to parse JSON response: {e}", file=sys.stderr)
+        print(body, file=sys.stderr)
+        return ""
+
+    # Extract assistant reply
+    try:
+        reply = data["choices"][0]["message"]["content"]
+    except (KeyError, IndexError, TypeError) as e:
+        print(f"Failed to access key [choices][0][message][content]: {e}", file=sys.stderr)
+        return ""
+    # content can be JSON null; callers expect a string
+    return reply if isinstance(reply, str) else ""
+
+
+def _parse_claims_reply(reply: str) -> list:
+    """Decode a model reply into a list of article dicts.
+
+    Propagates the exception raised by ``demjson3.decode`` when the reply
+    cannot be parsed.
+    """
+    # models often return the json string wrapped in a code block
+    match = re.search(r'```(?:json)?\s*(.*?)\s*```', reply, re.DOTALL)
+    if match:
+        reply = match.group(1)
+
+    # Replace all Python boolean and None values
+    reply = re.sub(r':\s*None\b', ': null', reply)
+    reply = re.sub(r':\s*False\b', ': false', reply)
+    reply = re.sub(r':\s*True\b', ': true', reply)
+
+    # leftover backticks are not valid JSON; treat them as apostrophes
+    reply = reply.replace("`", "'").strip()
+    decoded = demjson3.decode(reply)
+
+    # a single article is requested, so accept a bare object as well as an array
+    if isinstance(decoded, dict):
+        decoded = [decoded]
+    if not isinstance(decoded, list):
+        return []
+    return [item for item in decoded if isinstance(item, dict)]
+
+
+def filter_claims(base_url: str, api_key: str, falsifiable_claim_skill: str, claims):
+    """Ask the models in ``FREE_MODELS_DOC`` to pick the best falsifiable claim.
+
+    ``claims`` is the list of article dicts of one news outlet. The models are
+    tried in order, with a growing pause between attempts, until one returns a
+    parseable, non-empty answer. Returns the selected article dicts; exits with
+    status 1 when every model fails.
+    """
+    # Serialise as JSON: formatting the list itself would put Python reprs
+    # (single quotes, None/True/False) into the prompt.
+    articles_json = json.dumps(claims, ensure_ascii=False)
+    filter_prompt = (
+        f"Following is a list of {len(claims)} articles published by the same news outlet. Each article is represented by a json string type element in the array\n\n"
+        f"{articles_json}"
+        "\n\nUse web search tool to visit the link for each article, access the content and then assess if it is a falsifiable claim."
+        f"\nOut of these {len(claims)} articles, only return 1 article that best fits the falsifiable claim criterion."
+        # each sentence carries its own separator so they are not glued together
+        "\nPrefer claims that have been made by the news source directly."
+        "\nKeep the json structure of the claims the same as the original schema in the input. Do not add, remove, or modify any key or value in the json string."
+        "\nOnly output the plain json array string that I can safely unmarshal."
+        "\nDo not format the string. Do not output anything else."
+    )
+
+    for attempt, model in enumerate(FREE_MODELS_DOC, start=1):
+        if attempt > 1:
+            # delay before sending the request again
+            time.sleep(30 * (attempt - 1))
+
+        payload = {
+            "model": model,
+            "messages": [
+                {"role": "system", "content": falsifiable_claim_skill},
+                {"role": "user", "content": filter_prompt},
+            ],
+            "tools": [
+                {"type": "openrouter:web_search"}
+            ]
+        }
+
+        reply = req_openrouter(base_url, api_key, payload)
+        if not reply or reply == "[]":
+            continue
+
+        try:
+            selected = _parse_claims_reply(reply)
+        except Exception as e:
+            print(f"Error: failed to unmarshal claims json string {reply}: {e}", file=sys.stderr)
+            continue
+
+        if selected:
+            return selected
+
+    print("Error: All models failed.", file=sys.stderr)
+    sys.exit(1)
+
+
+def get_claims(base_url: str, api_key: str, src_domain_url: str):
+    """Return the latest articles for *src_domain_url*, or ``None`` on failure."""
+    endpoint = f"{base_url}/latest"
+    params = {
+        "category": CATEGORY,
+        "language": LANGUAGE,
+        "removeduplicate": REMOVE_DUPLICATE,
+        "size": SIZE,
+        "datatype": DATATYPE,
+        "apikey": api_key,
+        "domainurl": src_domain_url
+    }
+    response = requests.get(endpoint, params=params, timeout=10)
+    if response.status_code != 200:
+        print(f"Error: couldn't fetch claims for {src_domain_url}: {response.status_code}", file=sys.stderr)
+        return None
+    # a 200 reply without "results" is handled like an empty result set
+    return response.json().get("results")
+
+
+def update_claim_fields(srcDigest: str, claim):
+    """Add the claim-document keys to *claim* in place.
+
+    ``summary`` and ``uri`` are copied from the article's ``description`` and
+    ``link``; ``sourceUriDigest`` is set to *srcDigest*.
+    """
+    claim["sourceUriDigest"] = srcDigest
+    claim["summary"] = claim["description"]
+    claim["uri"] = claim["link"]
+
+
+def get_claim_docs():
+    """Return every YAML mapping stored under the ``claims`` directory."""
+    # Find all YAML files in claims directory and subdirectories
+    yaml_files = glob.glob(os.path.join(CLAIMS_DIR, "**", "*.yaml"), recursive=True)
+    yaml_files.extend(glob.glob(os.path.join(CLAIMS_DIR, "**", "*.yml"), recursive=True))
+
+    claims_array = []
+    for yaml_file in yaml_files:
+        try:
+            # claim documents are written as UTF-8, so read them the same way
+            with open(yaml_file, 'r', encoding='utf-8') as f:
+                claim_data = yaml.safe_load(f)
+        except (OSError, UnicodeDecodeError, yaml.YAMLError) as e:
+            print(f"Error loading YAML file {yaml_file}: {e}", file=sys.stderr)
+            continue
+        # skip empty files and documents that are not key/value mappings
+        if claim_data and isinstance(claim_data, dict):
+            claims_array.append(claim_data)
+
+    return claims_array
+
+
+def is_claim_new(claim, claim_docs=None) -> bool:
+    """Return ``True`` when no stored claim shares *claim*'s uri or title.
+
+    ``claim_docs`` lets a caller that checks several claims load the stored
+    documents once; when omitted they are read from disk.
+    """
+    if claim_docs is None:
+        claim_docs = get_claim_docs()
+
+    return not any(
+        claim["uri"] == doc.get("uri") or claim["title"] == doc.get("title")
+        for doc in claim_docs
+    )
+
+
+def clean_filepath(path: str, replace: str = "_") -> str:
+    """Return *path* with every character outside ``[a-zA-Z0-9_-]`` replaced.
+
+    Runs of *replace* are collapsed into one and stripped from both ends.
+    """
+    cleaned = re.sub(r'[^a-zA-Z0-9_-]', replace, path)
+    cleaned = re.sub(f'{re.escape(replace)}+', replace, cleaned)
+    cleaned = cleaned.strip(replace)
+    return cleaned
+
+
+def _slug(text: str, fallback: str) -> str:
+    """Return the path component for *text*, or *fallback* when nothing is left."""
+    # truncate first, then clean, so separators left at the cut are stripped
+    return clean_filepath(text.lower()[:MAX_NAME_LENGTH]) or fallback
+
+
+def _unused_path(directory: str, stem: str) -> str:
+    """Return ``<directory>/<stem>.yaml``, numbering the stem while the file exists."""
+    candidate = os.path.join(directory, f"{stem}.yaml")
+    suffix = 2
+    while os.path.exists(candidate):
+        candidate = os.path.join(directory, f"{stem}_{suffix}.yaml")
+        suffix += 1
+    return candidate
+
+
+class _QuotedDumper(yaml.SafeDumper):
+    """Dumper that writes every string, keys included, in double quotes."""
+
+
+def _quoted_str_representer(dumper, data):
+    return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='"')
+
+
+# registered on the private subclass only, so other yaml.dump() callers are unaffected
+_QuotedDumper.add_representer(str, _quoted_str_representer)
+
+
+def create_claim_docs(claims: list, srcName: str):
+    """Write each claim to ``claims/<source>/<title>.yaml``.
+
+    The keys written are those of the ``ClaimInput`` example in ``oapi.yaml``.
+    An existing file is never overwritten: titles that share their first
+    ``MAX_NAME_LENGTH`` characters get a numbered file name instead.
+    """
+    if not claims:
+        return
+
+    with open(os.path.join(REPO_ROOT, 'oapi.yaml'), 'r', encoding='utf-8') as f:
+        oapi_spec = yaml.safe_load(f)
+
+    claim_example = oapi_spec['components']['schemas']['ClaimInput'].get('example')
+    if not claim_example:
+        raise ValueError("oapi.yaml: components.schemas.ClaimInput has no example listing the claim keys")
+
+    target_dir = os.path.join(CLAIMS_DIR, _slug(srcName, "unknown_source"))
+    os.makedirs(target_dir, exist_ok=True)
+
+    for claim in claims:
+        claim_doc = {key: str(claim[key]) for key in claim_example}
+        file_path = _unused_path(target_dir, _slug(claim_doc["title"], "untitled"))
+
+        # Write claim_doc to YAML file
+        with open(file_path, 'w', encoding='utf-8') as f:
+            yaml.dump(claim_doc, f, Dumper=_QuotedDumper, default_flow_style=False, allow_unicode=True, width=float('inf'))
+
+        print(f"Created claim document: {os.path.relpath(file_path, REPO_ROOT)}")
+
+
+def main():
+    """Fetch, filter and store the new claims of every registered source."""
+    base_url = os.environ["API_BASE_URL"]
+    api_key = os.environ["API_KEY"]
+    news_data_base_url = os.environ["NEWSDATA_API_BASE_URL"]
+    news_data_api_key = os.environ["NEWSDATA_API_KEY"]
+    openrouter_api_key = os.environ["OPENROUTER_API_KEY"]
+    openrouter_base_url = os.environ["OPENROUTER_API_BASE_URL"]
+
+    sources = get_sources(base_url, api_key)
+
+    # Fetch falsifiable claim skill
+    try:
+        falsifiable_claim_skill = fetch_web_text(FALSIFIABLE_CLAIM_SKILL_URL)
+    except Exception as e:
+        print(f"Error: failed to fetch skill from {FALSIFIABLE_CLAIM_SKILL_URL}: {e}", file=sys.stderr)
+        sys.exit(1)
+
+    for source in sources:
+        # use the dedicated news data domain when the source has one
+        domain_url = source.get("domainUrlNewsData") or source["uri"]
+
+        claims = get_claims(news_data_base_url, news_data_api_key, domain_url)
+        if not claims:
+            continue
+
+        # add gaps between openrouter api requests
+        time.sleep(30)
+        # keep only those articles that can be classified as falsifiable claims
+        filtered_claims = filter_claims(openrouter_base_url, openrouter_api_key, falsifiable_claim_skill, claims)
+
+        # load the stored documents once per source, not once per claim
+        known_docs = get_claim_docs()
+        # list of new claims to be ingested
+        new_claims = []
+        for claim in filtered_claims:
+            # the model may drop or empty fields; such a claim is not written
+            if not all(claim.get(key) for key in ("title", "link", "description")):
+                print(f"Error: skipping incomplete claim {claim}", file=sys.stderr)
+                continue
+            # keep only relevant fields in the claims
+            update_claim_fields(source["uriDigest"], claim)
+            if is_claim_new(claim, known_docs):
+                new_claims.append(claim)
+
+        create_claim_docs(new_claims, source["name"])
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/scripts/openrouter.py b/scripts/openrouter.py index 277c1e9..4d0d464 100644 --- a/scripts/openrouter.py +++ b/scripts/openrouter.py @@ -26,9 +26,12 @@ MD_PROCESSING_SKILL_URL = "https://raw.githubusercontent.com/semmet95/agent-skills/refs/heads/main/md-processing/SKILL.md" SOURCE_QUESTION = "What are the top 10 latest most popular news outlets in the world listed in this document? Only output URLs of these news outlets separated by new lines. Do not output anything else." FREE_MODELS_DOC = [ + "openai/gpt-oss-120b:free", "google/gemma-4-31b-it:free", "nvidia/nemotron-3-nano-omni-30b-a3b-reasoning:free", "google/gemma-4-26b-a4b-it:free", + "nvidia/nemotron-nano-12b-v2-vl:free", + "z-ai/glm-4.5-air:free" ] diff --git a/sources/Google-News.yaml b/sources/Google-News.yaml deleted file mode 100644 index 8771194..0000000 --- a/sources/Google-News.yaml +++ /dev/null @@ -1,4 +0,0 @@ -name: "Google News" -summary: "Google News is a news aggregator service that organizes and provides news from various publishers worldwide." -tags: "global" -uri: "https://news.google.com"