File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -31,6 +31,13 @@ def save_entry(entry, using_celery):
3131 #check if the entry has already been saved and if it has not then save it
3232 entry_hash = entry ["id" ]
3333 if collection .count_documents ({"id" : entry_hash }) == 0 :
34+
35+ collection .insert_one (entry )
36+ print (f"I have now saved: { entry ['title' ]} " )
37+
def update_article(article_id: str, updates: dict):
    """Write ``updates`` onto the stored article whose ``id`` is ``article_id``.

    Args:
        article_id: Value of the article's ``id`` field used to select it.
        updates: Mapping of field names to new values, applied with ``$set``.
    """
    selector = {"id": article_id}
    change = {"$set": updates}
    collection.update_one(selector, change)
40+
3441 result = collection .insert_one (entry )
3542 inserted_id = result .inserted_id
3643
@@ -48,3 +55,4 @@ def save_entry(entry, using_celery):
4855
4956
5057 #print(f"I have now saved: {entry['title']}")
58+
Original file line number Diff line number Diff line change 1+ To run cli.py: python -m nlp.cli --article-id=abc123
Original file line number Diff line number Diff line change 1+ import argparse
2+ from nlp .core import process_article # adjust if your actual import path differs
3+
def run_cli(article_id: str):
    """Run NER for a single article and print a one-line summary.

    ``process_article`` signals "nothing was done" (unknown ID, already
    processed, no text) by returning an empty list, not ``None``. The result
    is therefore tested for truthiness so those cases reach the "not found"
    message instead of being reported as "Extracted 0 entities". A ``None``
    result is treated the same way.

    Args:
        article_id: The ``id`` of the article to process.
    """
    entities = process_article(article_id)
    if entities:
        print(f"Extracted {len(entities)} entities from article {article_id}")
    else:
        # Also reached when NER ran but recognised no entities at all.
        print(f"No article found or article was already processed: {article_id}")
10+
def main():
    """Parse ``--article-id`` from the command line and run NER on that article."""
    cli_parser = argparse.ArgumentParser(description="Run NER on a single article")
    cli_parser.add_argument(
        "--article-id",
        help="ID of the article to process",
        required=True,
    )
    parsed = cli_parser.parse_args()
    run_cli(parsed.article_id)


if __name__ == "__main__":
    main()
Original file line number Diff line number Diff line change 1+ from transformers import pipeline
2+ from ingest .save_to_database import collection , update_article
3+
# NER pipeline shared by every call; it is constructed when this module is imported.
ner = pipeline(
    "ner",
    model="dslim/bert-base-NER",
    aggregation_strategy="simple",
)


def run_ner_hf(text: str):
    """Run the module-level ``ner`` pipeline on ``text`` and return its output."""
    found = ner(text)
    return found
8+
def _to_builtin(value):
    """Return ``value`` with NumPy-style scalars/arrays replaced by builtin objects."""
    if isinstance(value, dict):
        return {key: _to_builtin(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_to_builtin(item) for item in value]
    # NumPy scalars and arrays expose ``tolist()``, which yields plain Python
    # floats/ints/lists; builtin values have no such method and pass through.
    to_list = getattr(value, "tolist", None)
    if callable(to_list):
        return to_list()
    return value


def process_article(article_id: str):
    """Run NER on a stored article and persist the result.

    The article is looked up by its ``id`` field. Nothing is written, and an
    empty list is returned, when the article does not exist, is already
    flagged ``processed``, or has no ``full_text``.

    Args:
        article_id: Value of the ``id`` field of the article to process.

    Returns:
        The list of entity dicts stored under the article's ``ner`` field,
        or ``[]`` when nothing was processed.
    """
    # Retrieve article by ID
    article = collection.find_one({"id": article_id})

    if not article:
        print(f"No article found with ID: {article_id}")
        return []

    if article.get("processed") is True:
        print(f"Article {article_id} already processed.")
        return []

    full_text = article.get("full_text", "")
    if not full_text:
        print(f"Article {article_id} has no full text.")
        return []

    # The pipeline reports scores as NumPy scalars, which BSON (and JSON)
    # cannot encode; convert to builtin types before storing or returning.
    entities = _to_builtin(run_ner_hf(full_text))

    # Update article in DB
    update_article(article_id, {
        "ner": entities,
        "processed": True,
    })

    return entities
36+
Original file line number Diff line number Diff line change 1+ pip install transformers
2+ pip install torch
3+ pip install celery
4+ pip install click
5+ pip install pytest
6+ pip install pymongo
Original file line number Diff line number Diff line change 1+ from celery import Celery
2+ from ingest .save_to_database import collection
3+ from nlp .core import run_ner_hf , process_article
4+
app = Celery("justinsight")  # Use your actual Celery config if not centralized here


@app.task
def ner_task(article_id: str):
    """Celery task: run NER on one article and store the entities on it.

    The article is fetched by its ``id`` field; ``full_text`` is used when
    present, otherwise ``summary``. The entities are written back under
    ``entities`` together with an ``ner_processed`` flag.

    Args:
        article_id: Value of the ``id`` field of the article to process.

    Returns:
        The entities produced by ``run_ner_hf``, or ``None`` when the article
        is missing or has no usable text.
    """
    # ``update_article`` takes ``(article_id, updates)``. ``process_article``
    # accepts only the ID, so calling it with an updates dict raised TypeError.
    # Imported locally because this module only imports ``collection`` at the top.
    from ingest.save_to_database import update_article

    # Fetch the article by ID
    article = collection.find_one({"id": article_id})
    if not article:
        print(f"No article found with ID: {article_id}")
        return None

    text = article.get("full_text") or article.get("summary")
    if not text:
        print(f"No usable text found in article {article_id}")
        return None

    # Run Named Entity Recognition
    entities = run_ner_hf(text)

    # NOTE(review): entity scores may be NumPy scalars, which BSON/JSON may
    # refuse to encode - confirm whether they need converting before storage.
    update_article(article_id, {
        "entities": entities,
        "ner_processed": True,  # flag added here
    })

    print(f"Processed article {article_id} with {len(entities)} entities.")
    return entities
Original file line number Diff line number Diff line change 1+ from unittest .mock import patch
2+ from nlp .cli import run_cli
3+
def test_run_cli_prints_extracted_message(capfd):
    """run_cli prints the number of entities returned by process_article."""
    fake_entities = [{"entity_group": "PER"}]
    with patch("nlp.cli.process_article", return_value=fake_entities):
        run_cli("dummy-id")
    captured = capfd.readouterr()
    assert "Extracted 1 entities" in captured.out
9+
Original file line number Diff line number Diff line change 1+ from nlp .core import run_ner_hf
2+ import warnings
3+ warnings .filterwarnings ("ignore" , category = UserWarning )
4+
def test_run_ner_hf():
    """run_ner_hf yields a list of dicts that includes a PER entity for a named person."""
    sentence = "Grace Madison hates the Mariners."
    found = run_ner_hf(sentence)

    print("Entities:", found)

    assert isinstance(found, list)
    for item in found:
        assert isinstance(item, dict)

    for item in found:
        print("Entity keys:", item.keys())

    # Example check that a person entity is found
    groups = [item.get("entity_group") for item in found]
    assert "PER" in groups
Original file line number Diff line number Diff line change 1+ from unittest .mock import patch , ANY
2+
@patch("nlp.core.process_article")
@patch("ingest.save_to_database.collection")
def test_ner_task_calls_process_article(mock_collection, mock_process_article):
    """ner_task looks up the article and forwards the NER result to process_article."""
    # Mock the DB find_one call
    stored_article = {"id": "dummy_article_id", "full_text": "Some article text"}
    mock_collection.find_one.return_value = stored_article

    mock_process_article.return_value = [{"entity": "PERSON", "word": "Alice"}]

    # Imported inside the test, after the patches are active - presumably so
    # nlp.tasks binds the patched objects on first import; confirm.
    from nlp.tasks import ner_task

    result = ner_task("dummy_article_id")

    # NOTE(review): nlp.core.process_article is defined with a single
    # parameter, yet a two-argument call is asserted here - confirm intent.
    expected_updates = {"entities": ANY, "ner_processed": True}
    mock_process_article.assert_called_once_with("dummy_article_id", expected_updates)

    # NOTE(review): run_ner_hf is not patched, so this presumably relies on
    # the real model finding no entities in the sample text - confirm.
    assert result == []
You can’t perform that action at this time.
0 commit comments