forked from rapmd73/AxiomEngine
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathcrucible.py
More file actions
127 lines (108 loc) · 5.62 KB
/
crucible.py
File metadata and controls
127 lines (108 loc) · 5.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# Axiom - crucible.py
# Copyright (C) 2025 The Axiom Contributors
# This program is licensed under the Peer Production License (PPL).
# See the LICENSE file for full details.
# --- V2.2: ADDED TEXT SANITIZATION PRE-PROCESSOR ---
import spacy
import hashlib
import re
from ledger import (
get_all_facts_for_analysis,
mark_facts_as_disputed,
find_similar_fact_from_different_domain,
update_fact_corroboration,
insert_uncorroborated_fact
)
# Shared spaCy English pipeline. It is loaded once, when this module is
# imported, and reused by every function below that needs to parse text.
NLP_MODEL = spacy.load("en_core_web_sm")
# Lower-case words and phrases treated as markers of opinion, speculation or
# editorializing. extract_facts_from_text() discards any sentence whose
# lower-cased text contains one of these entries as a substring, so multi-word
# entries such as 'war on facts' are matched as literal phrases.
SUBJECTIVITY_INDICATORS = {
'believe', 'think', 'feel', 'seems', 'appears', 'argues', 'suggests',
'contends', 'opines', 'speculates', 'especially', 'notably', 'remarkably',
'surprisingly', 'unfortunately', 'clearly', 'obviously', 'reportedly',
'allegedly', 'routinely', 'likely', 'apparently', 'essentially', 'largely',
'wedded to', 'new heights', 'war on facts', 'playbook', 'art of',
'therefore', 'consequently', 'thus', 'hence', 'conclusion',
'untrue', 'false', 'incorrect', 'correctly', 'rightly', 'wrongly',
'inappropriate', 'disparage', 'sycophants', 'unwelcome', 'flatly'
}
def _sanitize_and_preprocess_text(text):
"""
Cleans up extracted text before NLP analysis. Now includes a metadata filter
to remove common noise from web articles.
"""
# --- THIS IS THE UPGRADE ---
# Rule 1: Remove read times and comment counts (e.g., "42 2 min read ")
# This regex looks for digits and spaces at the start, followed by "min read".
text = re.sub(r'^\d+[\d\s]*min read\s*', '', text)
# Rule 2: Remove "Advertisement" from the start of a sentence (case-insensitive)
if text.lower().startswith('advertisement '):
text = text[14:] # Slice the string to remove "Advertisement "
# Rule 3 (from before): Fix run-on sentences common in topic pages
text = re.sub(r'(\d{4})([A-Z])', r'\1. \2', text)
# Rule 4 (from before): Standardize all whitespace to single spaces
text = re.sub(r'\s+', ' ', text).strip()
return text
def _get_subject_and_object(doc):
"""A helper function to extract the main subject and object from a spaCy doc."""
subject = None
d_object = None
for token in doc:
if "nsubj" in token.dep_:
subject = token.lemma_.lower()
if "dobj" in token.dep_ or "pobj" in token.dep_ or "attr" in token.dep_:
d_object = token.lemma_.lower()
return subject, d_object
def _check_for_contradiction(new_fact_doc, all_existing_facts):
    """
    Look for a stored fact that conflicts with a newly parsed one.

    A stored fact counts as a conflict when it has the same subject as the
    new fact but a different object, unless BOTH sentences contain a
    negation token. Stored facts whose status is already 'disputed' are
    skipped. Each remaining stored fact is re-parsed with NLP_MODEL.

    Args:
        new_fact_doc: Parsed doc of the candidate sentence.
        all_existing_facts: Iterable of fact records providing the
            'status' and 'fact_content' keys.

    Returns:
        The first conflicting fact record, or ``None`` when the new fact has
        no subject or no object, or when nothing conflicts.
    """
    def _has_negation(parsed_doc):
        # True when any token carries the 'neg' dependency label.
        return any(tok.dep_ == 'neg' for tok in parsed_doc)

    candidate_subject, candidate_object = _get_subject_and_object(new_fact_doc)
    if not (candidate_subject and candidate_object):
        return None

    for stored_fact in all_existing_facts:
        if stored_fact['status'] == 'disputed':
            continue

        stored_doc = NLP_MODEL(stored_fact['fact_content'])
        stored_subject, stored_object = _get_subject_and_object(stored_doc)

        # Only "same subject, different object" pairs are candidates.
        if candidate_subject != stored_subject or candidate_object == stored_object:
            continue

        # The pair is a contradiction unless both sides are negated.
        both_negated = _has_negation(new_fact_doc) and _has_negation(stored_doc)
        if not both_negated:
            return stored_fact

    return None
def extract_facts_from_text(source_url, text_content):
    """
    Run the V2.2 Crucible pipeline over the text of one page.

    The text is sanitized, split into sentences, and each sentence that
    passes the filters is checked against the ledger in this order:
    contradiction, corroboration, and finally insertion as a new
    uncorroborated fact.

    Args:
        source_url: URL the text came from. It must contain an http(s) host;
            otherwise nothing is processed and an empty list is returned
            without printing the completion message.
        text_content: Raw text extracted from that page.

    Returns:
        A list of the truthy values returned by insert_uncorroborated_fact()
        for the facts created during this call. Any exception raised while
        processing is printed and swallowed, and the facts collected up to
        that point are still returned.
    """
    print(f"\n--- [The Crucible] Analyzing content from {source_url[:60]}...")
    newly_created_facts = []

    try:
        domain_match = re.search(r'https?://(?:www\.)?([^/]+)', source_url)
        if domain_match is None:
            return newly_created_facts
        source_domain = domain_match.group(1)

        # Step 1: Sanitize the text, then snapshot the ledger once for the
        # whole document.
        cleaned_text = _sanitize_and_preprocess_text(text_content)
        ledger_facts = get_all_facts_for_analysis()

        for sentence in NLP_MODEL(cleaned_text).sents:
            sentence_text = sentence.text

            # Keep only sentences of 8 to 100 words that mention at least
            # one named entity.
            word_count = len(sentence_text.split())
            if not 8 <= word_count <= 100:
                continue
            if not sentence.ents:
                continue

            # NOTE(review): this is a substring test, so e.g. 'art of' also
            # matches "part of" - confirm that is acceptable.
            lowered_sentence = sentence_text.lower()
            if any(marker in lowered_sentence for marker in SUBJECTIVITY_INDICATORS):
                continue

            fact_content = sentence_text.strip()
            parsed_fact = NLP_MODEL(fact_content)

            # Step 2: Check for Contradictions
            conflicting_fact = _check_for_contradiction(parsed_fact, ledger_facts)
            if conflicting_fact:
                disputing_id = hashlib.sha256(fact_content.encode('utf-8')).hexdigest()
                mark_facts_as_disputed(
                    conflicting_fact['fact_id'], disputing_id, fact_content, source_url
                )
                continue

            # Step 3: Check for Corroboration
            corroborated_fact = find_similar_fact_from_different_domain(
                fact_content, source_domain, ledger_facts
            )
            if corroborated_fact:
                update_fact_corroboration(corroborated_fact['fact_id'], source_url)
                continue

            # Step 4: Insert as a new, uncorroborated fact, keyed by the
            # SHA-256 hex digest of its UTF-8 text.
            encoded_fact = fact_content.encode('utf-8')
            fact_id = hashlib.sha256(encoded_fact).hexdigest()
            stored_fact = insert_uncorroborated_fact(fact_id, fact_content, source_url)
            if stored_fact:
                newly_created_facts.append(stored_fact)
    except Exception as e:
        # Best-effort boundary: report the failure and fall through so the
        # caller still receives whatever was created before the error.
        print(f"[The Crucible] ERROR: Failed to process text. {e}")

    print(f"[The Crucible] Analysis complete. Created {len(newly_created_facts)} new facts.")
    return newly_created_facts