Skip to content

Commit 66e9717

Browse files
authored
Merge pull request #38 from JustInternetAI/Chloe
Chloe
2 parents 2840c55 + f92582a commit 66e9717

10 files changed

Lines changed: 150 additions & 0 deletions

File tree

src/ingest/save_to_database.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,13 @@ def save_entry(entry, using_celery):
3131
#check if the entry has already been saved and if it has not then save it
3232
entry_hash = entry["id"]
3333
if collection.count_documents({"id": entry_hash}) == 0:
34+
35+
collection.insert_one(entry)
36+
print(f"I have now saved: {entry['title']}")
37+
38+
def update_article(article_id: str, updates: dict):
39+
collection.update_one({"id": article_id}, {"$set": updates})
40+
3441
result = collection.insert_one(entry)
3542
inserted_id = result.inserted_id
3643

@@ -48,3 +55,4 @@ def save_entry(entry, using_celery):
4855

4956

5057
#print(f"I have now saved: {entry['title']}")
58+

src/justinsight/__init__.py

Whitespace-only changes.

src/nlp/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
To run cli.py: python -m nlp.cli --article-id=abc123

src/nlp/cli.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
import argparse
2+
from nlp.core import process_article # adjust if your actual import path differs
3+
4+
def run_cli(article_id: str) -> None:
    """Run NER extraction for one article and report the outcome on stdout.

    Args:
        article_id: ID of the article to process.
    """
    result = process_article(article_id)
    if result is None:
        print(f"No article found or article was already processed: {article_id}")
    else:
        print(f"Extracted {len(result)} entities from article {article_id}")
11+
def main() -> None:
    """CLI entry point: parse arguments and run NER on the requested article."""
    arg_parser = argparse.ArgumentParser(description="Run NER on a single article")
    arg_parser.add_argument(
        "--article-id",
        required=True,
        help="ID of the article to process",
    )
    parsed = arg_parser.parse_args()
    run_cli(parsed.article_id)


if __name__ == "__main__":
    main()

src/nlp/core.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
from transformers import pipeline
2+
from ingest.save_to_database import collection, update_article
3+
4+
ner = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")
5+
6+
def run_ner_hf(text: str):
    """Apply the module-level HuggingFace NER pipeline to *text*.

    Returns the pipeline's raw output (a list of aggregated entity dicts).
    """
    entities = ner(text)
    return entities
9+
def process_article(article_id: str, extra_updates=None):
    """Run NER over a stored article and persist the entities to Mongo.

    Args:
        article_id: Value of the article's ``id`` field in the collection.
        extra_updates: Optional dict of additional fields to set on the
            article document alongside the NER results (lets callers such
            as the Celery task attach their own flags in one update).

    Returns:
        The list of extracted entities, ``[]`` when the article exists but
        has no usable text, or ``None`` when the article does not exist or
        was already processed.  Returning ``None`` (instead of ``[]``) in
        those cases lets the CLI's ``is not None`` check distinguish
        "nothing found" from "found, zero entities" — previously its
        not-found branch was unreachable.
    """
    # Retrieve article by ID.
    article = collection.find_one({"id": article_id})

    if not article:
        print(f"No article found with ID: {article_id}")
        return None

    if article.get("processed") is True:
        print(f"Article {article_id} already processed.")
        return None

    full_text = article.get("full_text", "")
    if not full_text:
        print(f"Article {article_id} has no full text.")
        return []

    # Run NER.
    entities = run_ner_hf(full_text)

    # Update article in DB, merging in any caller-supplied fields.
    updates = {
        "ner": entities,
        "processed": True,
    }
    if extra_updates:
        updates.update(extra_updates)
    update_article(article_id, updates)

    return entities

src/nlp/requirements.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
transformers
torch
celery
click
pytest
pymongo

src/nlp/tasks.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
from celery import Celery
2+
from ingest.save_to_database import collection
3+
from nlp.core import run_ner_hf, process_article
4+
5+
app = Celery("justinsight") # Use your actual Celery config if not centralized here
6+
7+
@app.task
def ner_task(article_id: str):
    """Celery task: run NER on one stored article and persist the entities.

    Args:
        article_id: Value of the article's ``id`` field in the collection.

    Returns:
        The list of extracted entities, or ``None`` when the article is
        missing or has no usable text.
    """
    # Imported locally so the fix is self-contained; update_article lives
    # alongside `collection` in ingest.save_to_database.
    from ingest.save_to_database import update_article

    # Fetch the article by ID.
    article = collection.find_one({"id": article_id})

    if not article:
        print(f"No article found with ID: {article_id}")
        return

    text = article.get("full_text") or article.get("summary")
    if not text:
        print(f"No usable text found in article {article_id}")
        return

    # Run Named Entity Recognition once and persist the results directly.
    # The previous code passed a dict as a second positional argument to
    # process_article(), which only accepts the article ID (TypeError),
    # and would also have re-run the NER model a second time.
    entities = run_ner_hf(text)
    update_article(article_id, {
        "entities": entities,
        "ner_processed": True,  # flag added here
    })

    print(f"Processed article {article_id} with {len(entities)} entities.")
    return entities

src/nlp/tests/test_cli.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
from unittest.mock import patch
2+
from nlp.cli import run_cli
3+
4+
def test_run_cli_prints_extracted_message(capfd):
    """run_cli reports the entity count when process_article yields entities."""
    fake_entities = [{"entity_group": "PER"}]
    with patch("nlp.cli.process_article", return_value=fake_entities):
        run_cli("dummy-id")
    captured_out, _ = capfd.readouterr()
    assert "Extracted 1 entities" in captured_out

src/nlp/tests/test_core.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from nlp.core import run_ner_hf
2+
import warnings
3+
warnings.filterwarnings("ignore", category=UserWarning)
4+
5+
def test_run_ner_hf():
    """run_ner_hf returns a list of entity dicts and finds a PER entity."""
    sample_text = "Grace Madison hates the Mariners."
    entities = run_ner_hf(sample_text)

    print("Entities:", entities)

    assert isinstance(entities, list)
    assert all(isinstance(ent, dict) for ent in entities)

    for ent in entities:
        print("Entity keys:", ent.keys())

    # Example check that a person entity is found.
    person_found = any(ent.get("entity_group") == "PER" for ent in entities)
    assert person_found

src/nlp/tests/test_tasks.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
from unittest.mock import patch, ANY
2+
3+
# Patch names where they are *used* (nlp.tasks), not where they are defined:
# tasks.py binds collection / process_article / run_ner_hf via
# "from ... import ...", so patching nlp.core or ingest.save_to_database
# leaves the task calling the real objects (and run_ner_hf unpatched would
# load the real HF model).
@patch("nlp.tasks.run_ner_hf")
@patch("nlp.tasks.process_article")
@patch("nlp.tasks.collection")
def test_ner_task_calls_process_article(mock_collection, mock_process_article, mock_run_ner_hf):
    # Mock the DB find_one call.
    mock_collection.find_one.return_value = {
        "id": "dummy_article_id",
        "full_text": "Some article text",
    }
    fake_entities = [{"entity": "PERSON", "word": "Alice"}]
    mock_run_ner_hf.return_value = fake_entities

    from nlp.tasks import ner_task
    result = ner_task("dummy_article_id")

    mock_process_article.assert_called_once_with("dummy_article_id", {
        "entities": ANY,
        "ner_processed": True,
    })

    # ner_task returns the entities produced by run_ner_hf, not [].
    assert result == fake_entities

0 commit comments

Comments
 (0)