SatyaLens · semmet95 · May 18, 2026 · May 18, 2026 · May 18, 2026 · coderabbitai
diff --git a/oapi.yaml b/oapi.yaml
@@ -486,6 +486,17 @@ components:
           x-oapi-codegen-extra-tags:
             validate: httpsurl
             binding: required
+        domainUrlNewsData:
+          type: string
+          description: Domain URL(s) passed as the newsdata.io `domainurl` parameter; separate multiple domains with commas and no spaces
+          example: "timesofindia.indiatimes.com,theguardian.com"
+          x-oapi-codegen-extra-tags:
+            validate: omitnil,nonempty,nospace
+      example:
+        name: "New York Times"
+        summary: "American daily newspaper based in New York City"
+        tags: "news,media,journalism"
+        uri: "https://www.nytimes.com"
 
     SourcePatchInput:
       type: object
@@ -512,6 +523,12 @@ components:
           example: "news,journalism,trusted"
           x-oapi-codegen-extra-tags:
             validate: omitnil,nospace,nonempty
+        domainUrlNewsData:
+          type: string
+          description: Domain URL(s) passed as the newsdata.io `domainurl` parameter; separate multiple domains with commas and no spaces
+          example: "timesofindia.indiatimes.com,theguardian.com"
+          x-oapi-codegen-extra-tags:
+            validate: omitnil,nonempty,nospace
 
     Source:
       type: object
@@ -523,6 +540,7 @@ components:
       - tags
       - uri
       - uriDigest
+      - domainUrlNewsData
       properties:
         name:
           type: string
@@ -562,6 +580,12 @@ components:
           example: "https://www.nytimes.com"
           x-oapi-codegen-extra-tags:
             binding: required
+        domainUrlNewsData:
+          type: string
+          description: Domain URL(s) passed as the newsdata.io `domainurl` parameter, comma-separated when there are several
+          example: "timesofindia.indiatimes.com,theguardian.com"
+          x-oapi-codegen-extra-tags:
+            binding: required
 
     CreateSourceResponse:
       type: object

diff --git a/scripts/helper.py b/scripts/helper.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+"""Helper functions for scripts"""
+
+import json
+import demjson3
+import os
+import re
+import sys
+import requests
+import yaml
+from typing import Tuple
+from urllib.request import Request, urlopen
+
+def get_text_from_url(url: str) -> str:
+    """Fetch ``url`` and return the response body decoded as UTF-8.
+
+    The request is made with a 60 second timeout. Errors raised while opening
+    the URL or decoding the body propagate to the caller.
+    """
+    with urlopen(url, timeout=60) as response:
+        raw_body = response.read()
+    return raw_body.decode("utf-8")
+
+def get_text_from_file(path: str) -> str:
+    """Return the entire contents of the file at ``path``, read as UTF-8 text."""
+    with open(path, mode='r', encoding='utf-8') as source:
+        contents = source.read()
+    return contents
+
+def get_oapi_spec():
+    """Load ``oapi.yaml`` from the directory above this script's directory.
+
+    Returns:
+        The document as parsed by ``yaml.safe_load``.
+
+    Raises:
+        OSError: if the spec file cannot be opened.
+        yaml.YAMLError: if the file is not valid YAML.
+    """
+    # Resolve __file__ first: if it is relative, dirname() of it can collapse
+    # to "" and the spec would then be looked up relative to the current
+    # working directory instead of next to the scripts directory.
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    oapi_path = os.path.join(os.path.dirname(script_dir), "oapi.yaml")
+
+    with open(oapi_path, 'r', encoding='utf-8') as f:
+        return yaml.safe_load(f)
+
+def get_sources(base_url: str, api_key: str):
+    """Fetch all sources from ``{base_url}/api/v1/sources``.
+
+    Args:
+        base_url: Base URL of the API; a trailing slash is tolerated.
+        api_key: Value sent in the ``X-API-Key`` request header.
+
+    Returns:
+        The decoded JSON response body on HTTP 200, or ``None`` when the
+        request fails, the status is not 200, or the body is not valid JSON.
+        Failures are reported on stderr.
+    """
+    endpoint = f"{base_url.rstrip('/')}/api/v1/sources"
+    headers = {"X-API-Key": api_key}
+
+    # Connection errors and timeouts are turned into the same ``None`` result
+    # as a bad status so callers only have one failure signal to check.
+    try:
+        response = requests.get(endpoint, headers=headers, timeout=90)
+    except requests.RequestException as e:
+        print(f"Error: failed to get all sources: {e}", file=sys.stderr)
+        return None
+
+    if response.status_code != 200:
+        print(f"Error: failed to get all sources: {response.status_code}", file=sys.stderr)
+        return None
+
+    try:
+        return response.json()
+    except ValueError as e:
+        # JSON decoding errors raised by response.json() derive from ValueError
+        print(f"Error: failed to decode sources response: {e}", file=sys.stderr)
+        return None
+
+def cleanup_json_str(json_str: str) -> str:
+    """Turn a model-produced, JSON-like string into strict JSON text.
+
+    The input goes through these steps, in order:
+
+    1. If it contains a fenced code block (optionally tagged ``json``), only
+       the content of the first such block is kept.
+    2. ``None`` / ``False`` / ``True`` that directly follow a colon are
+       rewritten to ``null`` / ``false`` / ``true``.
+    3. Backticks are replaced with single quotes and surrounding whitespace is
+       stripped.
+    4. The result is decoded with ``demjson3.decode`` and re-serialized with
+       ``json.dumps``.
+
+    Args:
+        json_str: Raw text returned by the model.
+
+    Returns:
+        A JSON string produced by ``json.dumps``.
+
+    Raises:
+        Whatever ``demjson3.decode`` raises when the cleaned text still cannot
+        be decoded.
+    """
+    # models often return the json string wrapped in a code block or with incompatible values
+    match = re.search(r'```(?:json)?\s*(.*?)\s*```', json_str, re.DOTALL)
+    if match:
+        json_str = match.group(1)
+
+    # Replace all Python boolean and None values
+    # NOTE(review): these patterns are purely textual, so a string value that
+    # itself contains e.g. ": None" is rewritten as well.
+    json_str = re.sub(r':\s*None\b', ': null', json_str)
+    json_str = re.sub(r':\s*False\b', ': false', json_str)
+    json_str = re.sub(r':\s*True\b', ': true', json_str)
+
+    # The former ``.replace("'", "\'")`` was dropped: "\'" is the same
+    # one-character string as "'", so the call never changed anything.
+    json_str = json_str.replace("`", "'").strip()
+
+    return json.dumps(demjson3.decode(json_str))
+
+def clean_filepath(path: str, replace: str = "_") -> str:
+    """Reduce ``path`` to a string that is safe to use as a file or directory name.
+
+    Every character other than ASCII letters, digits, ``_`` and ``-`` is
+    replaced with ``replace``. Consecutive occurrences of ``replace`` are then
+    collapsed into one, and occurrences at the start and end are removed.
+
+    Args:
+        path: Text to sanitize.
+        replace: Replacement token. It may be empty (disallowed characters are
+            simply dropped) or longer than one character (it is then treated
+            as a whole token, not as a set of characters).
+
+    Returns:
+        The sanitized string; may be empty.
+    """
+    # A callable replacement keeps backslashes in ``replace`` literal instead
+    # of letting re.sub interpret them as template escapes.
+    cleaned = re.sub(r'[^a-zA-Z0-9_-]', lambda _match: replace, path)
+    if not replace:
+        # Nothing was inserted, so there is nothing to collapse or trim; an
+        # empty token would also make the "+" quantifier below invalid.
+        return cleaned
+
+    # Group the escaped token so "+" repeats the whole token, not just its
+    # last character.
+    token_run = f'(?:{re.escape(replace)})+'
+    cleaned = re.sub(token_run, lambda _match: replace, cleaned)
+    cleaned = re.sub(f'\\A{token_run}|{token_run}\\Z', '', cleaned)
+    return cleaned
+
+def post_request(endpoint: str, headers: dict, payload: dict, timeout: int) -> Tuple[int, str]:
+    """Send ``payload`` as a JSON body to ``endpoint`` and return the response.
+
+    Because a request body is supplied, ``urllib`` issues the request as a POST.
+
+    Args:
+        endpoint: Full URL to send the request to.
+        headers: Request headers.
+        payload: JSON-serializable request body.
+        timeout: Timeout in seconds passed to ``urlopen``.
+
+    Returns:
+        ``(status_code, body)`` on success. Any failure while building or
+        sending the request is logged to stderr and reported as ``(0, "")``.
+        This includes HTTP error responses, which ``urlopen`` raises as
+        exceptions.
+    """
+    data = json.dumps(payload).encode("utf-8")
+
+    try:
+        # Request() validates the URL and raises ValueError for a malformed
+        # endpoint, so it belongs inside the guarded section too.
+        req = Request(
+            url=endpoint,
+            data=data,
+            headers=headers
+        )
+        with urlopen(req, timeout=timeout) as r:
+            status = r.getcode()
+            body = r.read().decode("utf-8")
+    except Exception as e:
+        print(f"Error making request to {endpoint}: {e}", file=sys.stderr)
+        return 0, ""
+
+    return status, body
diff --git a/scripts/ingest_claims.py b/scripts/ingest_claims.py
@@ -0,0 +1,164 @@
+#!/usr/bin/env python3
+
+import glob
+import json
+import os
+import sys
+import yaml
+
+import helper
+import newsdata_io
+import openrouter
+
+# Base URL and API key passed to helper.get_sources() in main(). Both are
+# read with os.environ[...], so a missing variable raises KeyError as soon as
+# this module is loaded.
+API_BASE_URL = os.environ["API_BASE_URL"]
+API_KEY = os.environ["API_KEY"]
+
+# URL of the skill text that main() downloads and hands to the model request;
+# can be overridden through the environment variable of the same name.
+FALSIFIABLE_CLAIM_SKILL_URL = os.getenv(
+    "FALSIFIABLE_CLAIM_SKILL_URL",
+    "https://raw.githubusercontent.com/semmet95/agent-skills/refs/heads/main/determine-falsifialbe-claim/SKILL.md"
+)
+# Instruction appended to the article list sent to the model; can be overridden
+# through the environment variable of the same name. Adjacent string literals
+# are concatenated as-is, so each sentence carries its own trailing space.
+CLAIM_FILTER_PROMPT = os.getenv(
+    "CLAIM_FILTER_PROMPT",
+    (
+        "Use web search tool to visit the link for each article, access the content and then assess if it is a falsifiable claim. "
+        "Out of these 10 articles, only return 1 article that best fits the falsifiable claim criterion. "
+        "Prefer claims that have been made by the news source directly. "
+        "Keep the json structure of the claims the same as the original schema in the input. Do not add, remove, or modify any key or value in the json string. "
+        "Only output the plain json array string that I can safely unmarshal. "
+        "Do not format the string. Do not output anything else."
+    )
+)
+
+def update_claim_fields(srcDigest: str, claim):
+    """Add the API-facing fields to ``claim`` in place.
+
+    Sets ``sourceUriDigest`` to ``srcDigest`` and copies ``description`` into
+    ``summary`` and ``link`` into ``uri``. A missing ``description`` or
+    ``link`` key raises ``KeyError``.
+    """
+    claim["sourceUriDigest"] = srcDigest
+    # (target key, key it is copied from), applied in this order
+    for target_key, origin_key in (("summary", "description"), ("uri", "link")):
+        claim[target_key] = claim[origin_key]
+
+def get_claim_docs(claims_dir: str):
+    """Load every non-empty YAML document found under ``claims_dir``.
+
+    ``*.yaml`` files are collected first, then ``*.yml`` files, both searched
+    recursively. A file that cannot be read or parsed is reported on stderr
+    and skipped.
+
+    Returns:
+        A list with the parsed content of each file that loaded to a truthy
+        value.
+    """
+    # Find all YAML files in claims directory and subdirectories
+    discovered = []
+    for pattern in ("*.yaml", "*.yml"):
+        discovered += glob.glob(os.path.join(claims_dir, "**", pattern), recursive=True)
+
+    loaded_docs = []
+    for doc_path in discovered:
+        try:
+            with open(doc_path, 'r', encoding='utf-8') as handle:
+                parsed = yaml.safe_load(handle)
+        except Exception as e:
+            print(f"Error loading YAML file {doc_path}: {e}", file=sys.stderr)
+            continue
+        # Only add if file is not empty
+        if parsed:
+            loaded_docs.append(parsed)
+
+    return loaded_docs
+
+def get_new_claims(all_claim_docs, new_claims, srcUriDigest):
+    """Return the claims from ``new_claims`` that are not already known.
+
+    Each claim is first passed through ``update_claim_fields`` (which mutates
+    it in place). It is then dropped when its ``uri`` or its ``title`` equals
+    that of an existing claim document or of a claim already accepted earlier
+    in the same batch.
+
+    Args:
+        all_claim_docs: Previously stored claim documents. Entries that are
+            not mappings, or that lack ``uri``/``title``, never match.
+        new_claims: Candidate claim dicts.
+        srcUriDigest: Digest of the source the candidates belong to.
+
+    Returns:
+        The list of accepted claims, in input order.
+
+    Raises:
+        KeyError: if a candidate lacks a field that ``update_claim_fields``
+            reads, or lacks ``title``.
+    """
+    # Sentinel so that an absent key in a stored document never compares equal
+    # to a candidate value (not even to ``None``).
+    absent = object()
+
+    def is_same_claim(claim, other):
+        # Stored documents come from arbitrary YAML files, so they are not
+        # guaranteed to be mappings with the expected keys.
+        if not isinstance(other, dict):
+            return False
+        return (
+            claim["uri"] == other.get("uri", absent)
+            or claim["title"] == other.get("title", absent)
+        )
+
+    unique_claims = []
+
+    for claim in new_claims:
+        update_claim_fields(srcUriDigest, claim)
+        already_known = any(is_same_claim(claim, doc) for doc in all_claim_docs)
+        duplicate_in_batch = any(is_same_claim(claim, kept) for kept in unique_claims)
+        if not (already_known or duplicate_in_batch):
+            unique_claims.append(claim)
+
+    return unique_claims
+
+def create_claim_docs(claims: list, srcName: str):
+    """Write each claim as a YAML file under ``claims/<source>/<title>.yaml``.
+
+    The ``claims`` directory is resolved relative to this script (its parent
+    directory), which is the same location ``main`` reads existing claim
+    documents from. Only the keys present in the ``ClaimInput`` schema example
+    of the OpenAPI spec are written, each converted with ``str()``. Directory
+    and file names are the lower-cased source name / claim title, truncated to
+    30 characters and sanitized with ``helper.clean_filepath``.
+
+    Existing files are never overwritten. A claim that lacks one of the
+    expected keys is reported on stderr and skipped.
+
+    Args:
+        claims: Claim dicts to persist.
+        srcName: Name of the source the claims belong to.
+    """
+    claim_input_schema = helper.get_oapi_spec()['components']['schemas']['ClaimInput']
+    claim_example = claim_input_schema.get('example')
+    if not claim_example:
+        print("Error: ClaimInput schema has no example to derive claim fields from", file=sys.stderr)
+        return
+
+    # Dedicated dumper so the forced double quoting stays local to this
+    # function instead of changing PyYAML's default dumper for the whole process.
+    class QuotedStrDumper(yaml.Dumper):
+        pass
+
+    # Custom representer to force double quotes around strings
+    def quoted_str_representer(dumper, data):
+        return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='"')
+
+    QuotedStrDumper.add_representer(str, quoted_str_representer)
+
+    claims_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "claims"))
+    dirname = helper.clean_filepath(srcName.lower()[:30])
+
+    for claim in claims:
+        missing_keys = [key for key in claim_example if key not in claim]
+        if missing_keys:
+            print(f"Error: claim is missing fields {missing_keys}, skipping: {claim}", file=sys.stderr)
+            continue
+
+        # keep only relevant fields in the claims
+        claim_doc = {key: str(claim[key]) for key in claim_example}
+
+        filename = helper.clean_filepath(claim_doc["title"].lower()[:30])
+        filename = f"{filename}.yaml"
+
+        # Create file path
+        file_path = os.path.join(claims_root, dirname, filename)
+
+        # avoid overwriting existing files
+        if os.path.exists(file_path):
+            print(f"Warning: claim file with name : {file_path} already exists", file=sys.stderr)
+            continue
+
+        # Write claim_doc to YAML file
+        os.makedirs(os.path.dirname(file_path), exist_ok=True)
+        with open(file_path, 'w', encoding='utf-8') as f:
+            yaml.dump(
+                claim_doc,
+                f,
+                Dumper=QuotedStrDumper,
+                default_flow_style=False,
+                allow_unicode=True,
+                width=float('inf'),
+            )
+
+        print(f"Created claim document: {file_path}")
+
+# TODO: maintain a list of sources for which domainUrlNewsData fields needs to be updated
+def main():
+    """Fetch candidate claims for every source and store the new ones.
+
+    For each source returned by the API: fetch articles via
+    ``newsdata_io.get_claims``, ask the model (with the falsifiable-claim skill
+    and the web search tool) to pick the falsifiable ones, drop claims that
+    are already known, and write the remainder as claim documents.
+
+    Exits with status 1 when the sources or the skill text cannot be fetched.
+    Problems with a single source are logged to stderr and that source is
+    skipped.
+    """
+    claims_dir = os.path.join(os.path.dirname(__file__), "..", "claims")
+    claims_dir = os.path.abspath(claims_dir)
+    all_claim_docs = get_claim_docs(claims_dir)
+
+    sources = helper.get_sources(API_BASE_URL, API_KEY)
+    if sources is None:
+        print("Error: failed to fetch all sources", file=sys.stderr)
+        sys.exit(1)
+
+    # Fetch falsifiable claim skill
+    try:
+        falsifiable_claim_skill = helper.get_text_from_url(FALSIFIABLE_CLAIM_SKILL_URL)
+    except Exception as e:
+        print(f"Error: failed to fetch skill from {FALSIFIABLE_CLAIM_SKILL_URL}: {e}", file=sys.stderr)
+        sys.exit(1)
+
+    for source in sources:
+        # Prefer the newsdata.io specific domain; fall back to the source uri
+        # when the field is missing, null or empty.
+        domain_url = source.get("domainUrlNewsData") or source["uri"]
+
+        claims = newsdata_io.get_claims(domain_url)
+        if not claims:
+            continue
+
+        # keep only those articles that can be classified as falsifiable claims
+        req_content = (
+            "Following is a list of 10 articles published by the same news outlet. Each article is represented by a json string type element in the array"
+            f"\n\n{claims}\n\n"
+            f"{CLAIM_FILTER_PROMPT}"
+        )
+        filtered_claims = openrouter.req_w_addons(req_content, skill=falsifiable_claim_skill, tools=[openrouter.WEB_SEARCH_TOOL])
+        if filtered_claims == "":
+            print(f"Error: failed to filter claims for source {source['name']}", file=sys.stderr)
+            continue
+
+        try:
+            filtered_claims_list = json.loads(helper.cleanup_json_str(filtered_claims))
+        except Exception as e:
+            print(f"Error: failed to cleanup and unmarshal claims json string {filtered_claims}: {e}", file=sys.stderr)
+            continue
+
+        # The prompt asks for a single article, so the model may answer with a
+        # bare object instead of an array; treat that as a one-element list.
+        if isinstance(filtered_claims_list, dict):
+            filtered_claims_list = [filtered_claims_list]
+        if not isinstance(filtered_claims_list, list):
+            filtered_claims_list = []
+        # Anything that is not a JSON object cannot carry claim fields.
+        filtered_claims_list = [c for c in filtered_claims_list if isinstance(c, dict)]
+
+        if not filtered_claims_list:
+            print(f"Error: no claims found for {source['name']} in: {filtered_claims}", file=sys.stderr)
+            continue
+
+        # list of new claims to be ingested
+        try:
+            new_unique_claims = get_new_claims(all_claim_docs, filtered_claims_list, source["uriDigest"])
+        except KeyError as e:
+            # the model dropped a field that the claim mapping relies on
+            print(f"Error: claim for {source['name']} is missing field {e} in: {filtered_claims}", file=sys.stderr)
+            continue
+        create_claim_docs(new_unique_claims, source["name"])
+
+        # Remember the claims from this run so that later sources are
+        # de-duplicated against them as well.
+        all_claim_docs.extend(new_unique_claims)
+
+if __name__ == "__main__":
+    sys.exit(main())