Skip to content

Commit 66e9717

Browse files
authored
Merge pull request #38 from JustInternetAI/Chloe
Chloe
2 parents 2840c55 + f92582a commit 66e9717

10 files changed

Lines changed: 150 additions & 0 deletions

File tree

src/ingest/save_to_database.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,13 @@ def save_entry(entry, using_celery):
3131
#check if the entry has already been saved and if it has not then save it
3232
entry_hash = entry["id"]
3333
if collection.count_documents({"id": entry_hash}) == 0:
34+
35+
collection.insert_one(entry)
36+
print(f"I have now saved: {entry['title']}")
37+
38+
def update_article(article_id: str, updates: dict):
39+
collection.update_one({"id": article_id}, {"$set": updates})
40+
3441
result = collection.insert_one(entry)
3542
inserted_id = result.inserted_id
3643

@@ -48,3 +55,4 @@ def save_entry(entry, using_celery):
4855

4956

5057
#print(f"I have now saved: {entry['title']}")
58+

src/justinsight/__init__.py

Whitespace-only changes.

src/nlp/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
To run cli.py: python -m nlp.cli --article-id=abc123

src/nlp/cli.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
import argparse
2+
from nlp.core import process_article # adjust if your actual import path differs
3+
4+
def run_cli(article_id: str) -> None:
    """Run NER extraction for one article and report the outcome on stdout.

    Args:
        article_id: ID of the article to process.
    """
    result = process_article(article_id)
    if result is None:
        print(f"No article found or article was already processed: {article_id}")
    else:
        print(f"Extracted {len(result)} entities from article {article_id}")
11+
def main() -> None:
    """CLI entry point: parse arguments and run NER on the requested article."""
    arg_parser = argparse.ArgumentParser(description="Run NER on a single article")
    arg_parser.add_argument(
        "--article-id",
        required=True,
        help="ID of the article to process",
    )
    parsed = arg_parser.parse_args()
    run_cli(parsed.article_id)


if __name__ == "__main__":
    main()

src/nlp/core.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
from transformers import pipeline
2+
from ingest.save_to_database import collection, update_article
3+
4+
ner = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")
5+
6+
def run_ner_hf(text: str):
    """Apply the module-level HuggingFace NER pipeline to *text*.

    Returns the pipeline's raw output (a list of aggregated entity dicts).
    """
    entities = ner(text)
    return entities
9+
def process_article(article_id: str, extra_updates=None):
    """Run NER over a stored article and persist the entities to Mongo.

    Args:
        article_id: Value of the article's ``id`` field in the collection.
        extra_updates: Optional dict of additional fields to set on the
            article document alongside the NER results (lets callers such
            as the Celery task attach their own flags in one update).

    Returns:
        The list of extracted entities, ``[]`` when the article exists but
        has no usable text, or ``None`` when the article does not exist or
        was already processed.  Returning ``None`` (instead of ``[]``) in
        those cases lets the CLI's ``is not None`` check distinguish
        "nothing found" from "found, zero entities" — previously its
        not-found branch was unreachable.
    """
    # Retrieve article by ID.
    article = collection.find_one({"id": article_id})

    if not article:
        print(f"No article found with ID: {article_id}")
        return None

    if article.get("processed") is True:
        print(f"Article {article_id} already processed.")
        return None

    full_text = article.get("full_text", "")
    if not full_text:
        print(f"Article {article_id} has no full text.")
        return []

    # Run NER.
    entities = run_ner_hf(full_text)

    # Update article in DB, merging in any caller-supplied fields.
    updates = {
        "ner": entities,
        "processed": True,
    }
    if extra_updates:
        updates.update(extra_updates)
    update_article(article_id, updates)

    return entities

src/nlp/requirements.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
transformers
torch
celery
click
pytest
pymongo

src/nlp/tasks.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
from celery import Celery
2+
from ingest.save_to_database import collection
3+
from nlp.core import run_ner_hf, process_article
4+
5+
app = Celery("justinsight") # Use your actual Celery config if not centralized here
6+
7+
@app.task
def ner_task(article_id: str):
    """Celery task: run NER on one stored article and persist the entities.

    Args:
        article_id: Value of the article's ``id`` field in the collection.

    Returns:
        The list of extracted entities, or ``None`` when the article is
        missing or has no usable text.
    """
    # Imported locally so the fix is self-contained; update_article lives
    # alongside `collection` in ingest.save_to_database.
    from ingest.save_to_database import update_article

    # Fetch the article by ID.
    article = collection.find_one({"id": article_id})

    if not article:
        print(f"No article found with ID: {article_id}")
        return

    text = article.get("full_text") or article.get("summary")
    if not text:
        print(f"No usable text found in article {article_id}")
        return

    # Run Named Entity Recognition once and persist the results directly.
    # The previous code passed a dict as a second positional argument to
    # process_article(), which only accepts the article ID (TypeError),
    # and would also have re-run the NER model a second time.
    entities = run_ner_hf(text)
    update_article(article_id, {
        "entities": entities,
        "ner_processed": True,  # flag added here
    })

    print(f"Processed article {article_id} with {len(entities)} entities.")
    return entities

src/nlp/tests/test_cli.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
from unittest.mock import patch
2+
from nlp.cli import run_cli
3+
4+
def test_run_cli_prints_extracted_message(capfd):
    """run_cli reports the entity count when process_article yields entities."""
    fake_entities = [{"entity_group": "PER"}]
    with patch("nlp.cli.process_article", return_value=fake_entities):
        run_cli("dummy-id")
    captured_out, _ = capfd.readouterr()
    assert "Extracted 1 entities" in captured_out

src/nlp/tests/test_core.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from nlp.core import run_ner_hf
2+
import warnings
3+
warnings.filterwarnings("ignore", category=UserWarning)
4+
5+
def test_run_ner_hf():
    """run_ner_hf returns a list of entity dicts and finds a PER entity."""
    sample_text = "Grace Madison hates the Mariners."
    entities = run_ner_hf(sample_text)

    print("Entities:", entities)

    assert isinstance(entities, list)
    assert all(isinstance(ent, dict) for ent in entities)

    for ent in entities:
        print("Entity keys:", ent.keys())

    # Example check that a person entity is found.
    person_found = any(ent.get("entity_group") == "PER" for ent in entities)
    assert person_found

src/nlp/tests/test_tasks.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
from unittest.mock import patch, ANY
2+
3+
# Patch names where they are *used* (nlp.tasks), not where they are defined:
# tasks.py binds collection / process_article / run_ner_hf via
# "from ... import ...", so patching nlp.core or ingest.save_to_database
# leaves the task calling the real objects (and run_ner_hf unpatched would
# load the real HF model).
@patch("nlp.tasks.run_ner_hf")
@patch("nlp.tasks.process_article")
@patch("nlp.tasks.collection")
def test_ner_task_calls_process_article(mock_collection, mock_process_article, mock_run_ner_hf):
    # Mock the DB find_one call.
    mock_collection.find_one.return_value = {
        "id": "dummy_article_id",
        "full_text": "Some article text",
    }
    fake_entities = [{"entity": "PERSON", "word": "Alice"}]
    mock_run_ner_hf.return_value = fake_entities

    from nlp.tasks import ner_task
    result = ner_task("dummy_article_id")

    mock_process_article.assert_called_once_with("dummy_article_id", {
        "entities": ANY,
        "ner_processed": True,
    })

    # ner_task returns the entities produced by run_ner_hf, not [].
    assert result == fake_entities

0 commit comments

Comments
 (0)