From 1aff72966ddb27633200a4d1a90f8da7af1700eb Mon Sep 17 00:00:00 2001 From: shounak-kundu Date: Tue, 26 Apr 2022 19:23:33 +0530 Subject: [PATCH 1/8] commented out NER classes not needed --- bootleg/utils/mention_extractor_utils.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/bootleg/utils/mention_extractor_utils.py b/bootleg/utils/mention_extractor_utils.py index 4cefc8a4..762a8980 100644 --- a/bootleg/utils/mention_extractor_utils.py +++ b/bootleg/utils/mention_extractor_utils.py @@ -43,16 +43,17 @@ PUNC = string.punctuation KEEP_POS = {"PROPN", "NOUN"} # ADJ, VERB, ADV, SYM PLURAL = {"s", "'s"} +## Customizing NER classes for testing NER_CLASSES = { "PERSON", - "NORP", + #"NORP", "ORG", "GPE", "LOC", - "PRODUCT", - "EVENT", - "WORK_OF_ART", - "LANGUAGE", + #"PRODUCT", + #"EVENT", + #"WORK_OF_ART", + #"LANGUAGE", } table = str.maketrans( dict.fromkeys(PUNC) From a833e389a26f473d1286e80d2489e2edb3c0ba58 Mon Sep 17 00:00:00 2001 From: shounak-kundu Date: Fri, 6 May 2022 19:16:06 +0530 Subject: [PATCH 2/8] comment out LOC tag --- bootleg/utils/mention_extractor_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bootleg/utils/mention_extractor_utils.py b/bootleg/utils/mention_extractor_utils.py index 762a8980..bf6fc6b0 100644 --- a/bootleg/utils/mention_extractor_utils.py +++ b/bootleg/utils/mention_extractor_utils.py @@ -49,7 +49,7 @@ #"NORP", "ORG", "GPE", - "LOC", + #"LOC", #"PRODUCT", #"EVENT", #"WORK_OF_ART", From a23f2b97e29cda0b5bc3b11f89254da98fde3716 Mon Sep 17 00:00:00 2001 From: shounak-kundu Date: Tue, 31 May 2022 12:51:46 +0530 Subject: [PATCH 3/8] bootleg framework commit --- bootleg/utils/mention_extractor_utils.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/bootleg/utils/mention_extractor_utils.py b/bootleg/utils/mention_extractor_utils.py index 762a8980..7a7da7e0 100644 --- a/bootleg/utils/mention_extractor_utils.py +++ b/bootleg/utils/mention_extractor_utils.py @@ -310,13 +310,13 @@ def my_mention_extractor( start_pos = sentence.to_dict(tag_type="ner")["entities"][i]["start_pos"] end_pos = sentence.to_dict(tag_type="ner")["entities"][i]["end_pos"] - elif ( - str(sentence.to_dict(tag_type="ner")["entities"][i]["labels"][0]).split()[0] - in "GPE" - ): - str_main = str(sentence.to_dict(tag_type="ner")["entities"][i]["text"]) - start_pos = sentence.to_dict(tag_type="ner")["entities"][i]["start_pos"] - end_pos = sentence.to_dict(tag_type="ner")["entities"][i]["end_pos"] + # elif ( + # str(sentence.to_dict(tag_type="ner")["entities"][i]["labels"][0]).split()[0] + # in "GPE" + # ): + # str_main = str(sentence.to_dict(tag_type="ner")["entities"][i]["text"]) + # start_pos = sentence.to_dict(tag_type="ner")["entities"][i]["start_pos"] + # end_pos = sentence.to_dict(tag_type="ner")["entities"][i]["end_pos"] if str_main is not None and (start_pos != -1 and end_pos != -1): final_gram = None if str_main in all_aliases: From bc8840c83e047848e7188842024d60bec82e7bca Mon Sep 17 00:00:00 2001 From: shounak-kundu Date: Tue, 7 Jun 2022 07:26:50 +0530 Subject: [PATCH 4/8] updated bootleg code --- bootleg/end2end/bootleg_annotator.py | 49 +++++++++++++++++++++++- bootleg/utils/mention_extractor_utils.py | 34 +++++++++++----- 2 files changed, 73 insertions(+), 10 deletions(-) diff --git a/bootleg/end2end/bootleg_annotator.py b/bootleg/end2end/bootleg_annotator.py index 5ea097bf..bdfea95e 100644 --- a/bootleg/end2end/bootleg_annotator.py +++ b/bootleg/end2end/bootleg_annotator.py @@ -1,5 +1,6 @@ """BootlegAnnotator.""" import logging +from operator import is_ import os import tarfile import urllib @@ -18,6 +19,7 @@ from bootleg.end2end.extract_mentions import MENTION_EXTRACTOR_OPTIONS from bootleg.symbols.constants import PAD_ID from bootleg.symbols.entity_symbols import EntitySymbols +from bootleg.symbols.entity_profile import EntityProfile from bootleg.symbols.kg_symbols import KGSymbols from bootleg.symbols.type_symbols import TypeSymbols from bootleg.task_config import NED_TASK @@ -34,6 +36,7 @@ } + def get_default_cache(): """Get default cache directory for saving Bootleg data.""" try: @@ -137,6 +140,7 @@ class BootlegAnnotator(object): cache_dir: cache directory (default None) model_name: model name (default None) entity_emb_file: entity embedding file (default None) + entity_dir_loc: entity directory location (default None) return_embs: whether to return embeddings or not (default False) extract_method: mention extraction method verbose: verbose boolean (default False) @@ -152,6 +156,7 @@ def __init__( cache_dir: str = None, model_name: str = None, entity_emb_file: str = None, + entity_dir_loc: str=None, return_embs: bool = False, extract_method: str = "ngram_spacy", verbose: bool = False, @@ -164,6 +169,7 @@ def __init__( self.return_embs = return_embs self.entity_emb_file = entity_emb_file self.extract_method = extract_method + self.entity_dir_location = entity_dir_loc if self.entity_emb_file is not None: assert Path( @@ -242,6 +248,10 @@ def __init__( alias_cand_map_dir=self.config.data_config.alias_cand_map, alias_idx_dir=self.config.data_config.alias_idx_map, ) + self.entity_profile = EntityProfile.load_from_cache(\ + load_dir=self.entity_dir_location,\ + no_type=True,edit_mode=False,\ + verbose=True) self.all_aliases_trie = self.entity_db.get_all_alias_vocabtrie() add_entity_type = self.config.data_config.entity_type_data.use_entity_types @@ -311,13 +321,22 @@ def extract_mentions(self, text): Returns: JSON object of sentence to be used in eval """ - found_aliases, found_spans, found_char_spans = MENTION_EXTRACTOR_OPTIONS[ + found_aliases, found_spans,\ + found_char_spans , \ + org_entity_list, \ + per_entity_list, \ + loc_list, \ + type_list = MENTION_EXTRACTOR_OPTIONS[ self.extract_method ](text, self.all_aliases_trie, self.min_alias_len, self.max_alias_len) return { "sentence": text, "aliases": found_aliases, "char_spans": found_char_spans, + "org_entity_list" : org_entity_list, + "per_entity_list" : per_entity_list, + "loc_entity_list" : loc_list, + "type_entity_list": type_list, "cands": [self.entity_db.get_qid_cands(al) for al in found_aliases], # we don't know the true QID "qids": ["Q-1" for i in range(len(found_aliases))], @@ -418,6 +437,12 @@ def label_mentions( batch_char_spans_arr = [] batch_example_aliases = [] batch_idx_unq = [] + + batch_ner_org_list=[] + batch_ner_per_list=[] + batch_ner_loc_list=[] + batch_ner_type_list=[] + for idx_unq in tqdm( range(num_exs), desc="Prepping data", @@ -426,6 +451,10 @@ def label_mentions( ): if do_extract_mentions: sample = self.extract_mentions(text_list[idx_unq]) + batch_ner_org_list.append(sample["org_entity_list"]) + batch_ner_per_list.append(sample["per_entity_list"]) + batch_ner_loc_list.append(sample["loc_entity_list"]) + batch_ner_type_list.append(sample["type_entity_list"]) else: sample = extracted_examples[idx_unq] # Add the unk qids and gold values @@ -523,6 +552,7 @@ def label_mentions( batch_example_true_entities = torch.tensor(batch_example_true_entities) final_pred_cands = [[] for _ in range(num_exs)] + final_pred_cands_types = [[] for _ in range(num_exs)] final_all_cands = [[] for _ in range(num_exs)] final_cand_probs = [[] for _ in range(num_exs)] final_pred_probs = [[] for _ in range(num_exs)] @@ -588,9 +618,21 @@ def label_mentions( pred_prob = max_probs[ex_i].item() pred_qid = entity_cands[pred_idx] if pred_prob > self.threshold: + is_org=False final_all_cands[idx_unq].append(entity_cands) final_cand_probs[idx_unq].append(probs_ex) final_pred_cands[idx_unq].append(pred_qid) + entity_relation_dict=self.entity_profile.get_relations_tails_for_qid(pred_qid) + if 'instance of' in entity_relation_dict: + instance_of_list = entity_relation_dict['instance of'] + if 'Q5' not in instance_of_list: ## Q5 means human + is_org=True + else: + is_org=True + if is_org: + final_pred_cands_types[idx_unq].append("ORG") + else: + final_pred_cands_types[idx_unq].append("PER") final_pred_probs[idx_unq].append(pred_prob) if self.return_embs: final_entity_embs[idx_unq].append( @@ -617,10 +659,15 @@ def label_mentions( "qids": final_pred_cands, "probs": final_pred_probs, "titles": final_titles, + "qid_types": final_pred_cands_types, "cands": final_all_cands, "cand_probs": final_cand_probs, "char_spans": final_char_spans, "aliases": final_aliases, + "org_entity_list": batch_ner_org_list, + "per_entity_list": batch_ner_per_list, + "loc_entity_list": batch_ner_loc_list, + "type_entity_list": batch_ner_type_list } if self.return_embs: res_dict["embs"] = final_entity_embs diff --git a/bootleg/utils/mention_extractor_utils.py b/bootleg/utils/mention_extractor_utils.py index 7a7da7e0..d02e1d87 100644 --- a/bootleg/utils/mention_extractor_utils.py +++ b/bootleg/utils/mention_extractor_utils.py @@ -288,8 +288,14 @@ def my_mention_extractor( """ sentence = Sentence(text) - tagger_fast.predict(sentence, mini_batch_size=16) + tagger_fast.predict(sentence, mini_batch_size=32) entities = [] + org_entities=[] + per_entities=[] + loc_entities=[] + type_entities=[] + is_org=False + is_per=False for i in range(len(sentence.to_dict(tag_type="ner")["entities"])): str_main = None start_pos = -1 @@ -301,6 +307,7 @@ def my_mention_extractor( str_main = str(sentence.to_dict(tag_type="ner")["entities"][i]["text"]) start_pos = sentence.to_dict(tag_type="ner")["entities"][i]["start_pos"] end_pos = sentence.to_dict(tag_type="ner")["entities"][i]["end_pos"] + is_org=True elif ( str(sentence.to_dict(tag_type="ner")["entities"][i]["labels"][0]).split()[0] @@ -309,14 +316,21 @@ def my_mention_extractor( str_main = str(sentence.to_dict(tag_type="ner")["entities"][i]["text"]) start_pos = sentence.to_dict(tag_type="ner")["entities"][i]["start_pos"] end_pos = sentence.to_dict(tag_type="ner")["entities"][i]["end_pos"] + is_per=True - # elif ( - # str(sentence.to_dict(tag_type="ner")["entities"][i]["labels"][0]).split()[0] - # in "GPE" - # ): - # str_main = str(sentence.to_dict(tag_type="ner")["entities"][i]["text"]) - # start_pos = sentence.to_dict(tag_type="ner")["entities"][i]["start_pos"] - # end_pos = sentence.to_dict(tag_type="ner")["entities"][i]["end_pos"] + elif ( + str(sentence.to_dict(tag_type="ner")["entities"][i]["labels"][0]).split()[0] + in "GPE"): + loc_value = str(sentence.to_dict(tag_type="ner")["entities"][i]["text"]) + loc_entities.append([loc_value]) + if is_org: + org_text_entity = str_main + org_entities.append([org_text_entity,start_pos,end_pos]) + type_entities.append(["ORG",start_pos,end_pos]) + if is_per: + per_text_entity = str_main + per_entities.append([per_text_entity,start_pos,end_pos]) + type_entities.append(["PER",start_pos,end_pos]) if str_main is not None and (start_pos != -1 and end_pos != -1): final_gram = None if str_main in all_aliases: @@ -331,8 +345,10 @@ def my_mention_extractor( final_gram = joined_gram_merged_noplural if final_gram is not None: entities.append([final_gram, start_pos, end_pos]) + is_org=False + is_per=False used_aliases = [item[0] for item in entities] chars = [[item[1], item[2]] for item in entities] spans = [[len(text[: sp[0]].split()), len(text[: sp[1]].split())] for sp in chars] - return used_aliases, spans, chars + return used_aliases, spans, chars , org_entities, per_entities , loc_entities , type_entities From 24eabc68bd93faa05cfb1397570186bfa6f0964b Mon Sep 17 00:00:00 2001 From: shounak-kundu Date: Tue, 7 Jun 2022 19:15:09 +0530 Subject: [PATCH 5/8] README file added to change the branch --- README.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 README.txt diff --git a/README.txt b/README.txt new file mode 100644 index 00000000..6facd9cb --- /dev/null +++ b/README.txt @@ -0,0 +1 @@ +Please switch to custom_mention_extractor branch after git submodule update --recursive \ No newline at end of file From d9e7f6a024bd3b3533dcf7202385af08d19c500b Mon Sep 17 00:00:00 2001 From: shounak-kundu Date: Fri, 10 Jun 2022 16:33:18 +0530 Subject: [PATCH 6/8] Bootleg changes --- bootleg/end2end/bootleg_annotator.py | 9 ++++++++- bootleg/utils/mention_extractor_utils.py | 16 ++++++++++------ 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/bootleg/end2end/bootleg_annotator.py b/bootleg/end2end/bootleg_annotator.py index bdfea95e..d5538435 100644 --- a/bootleg/end2end/bootleg_annotator.py +++ b/bootleg/end2end/bootleg_annotator.py @@ -1,4 +1,8 @@ -"""BootlegAnnotator.""" +"""BootlegAnnotator. + +author shounak.kundu +""" + import logging from operator import is_ import os @@ -247,6 +251,7 @@ def __init__( ), alias_cand_map_dir=self.config.data_config.alias_cand_map, alias_idx_dir=self.config.data_config.alias_idx_map, + edit_mode=False ) self.entity_profile = EntityProfile.load_from_cache(\ load_dir=self.entity_dir_location,\ @@ -627,6 +632,8 @@ def label_mentions( instance_of_list = entity_relation_dict['instance of'] if 'Q5' not in instance_of_list: ## Q5 means human is_org=True + elif 'place of birth' in entity_relation_dict: + is_org=False else: is_org=True if is_org: diff --git a/bootleg/utils/mention_extractor_utils.py b/bootleg/utils/mention_extractor_utils.py index 1fe371a1..a5454465 100644 --- a/bootleg/utils/mention_extractor_utils.py +++ b/bootleg/utils/mention_extractor_utils.py @@ -2,7 +2,7 @@ import string from collections import namedtuple from typing import List, Tuple, Union - +import torch import nltk import spacy from spacy.cli.download import download as spacy_download @@ -10,6 +10,7 @@ from bootleg.symbols.constants import LANG_CODE from bootleg.utils.utils import get_lnrm + logger = logging.getLogger(__name__) span_tuple = namedtuple("Span", ["text", "start_char_idx", "end_char_idx"]) @@ -25,10 +26,13 @@ except OSError: nlp = None +DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu' + try: + import flair from flair.data import Sentence from flair.models import SequenceTagger - + flair.device = torch.device(DEVICE) tagger_fast = SequenceTagger.load("ner-ontonotes-fast") except ImportError: tagger_fast = None @@ -288,7 +292,7 @@ def my_mention_extractor( """ sentence = Sentence(text) - tagger_fast.predict(sentence, mini_batch_size=32) + tagger_fast.predict(sentence) entities = [] org_entities=[] per_entities=[] @@ -322,14 +326,14 @@ def my_mention_extractor( str(sentence.to_dict(tag_type="ner")["entities"][i]["labels"][0]).split()[0] in "GPE"): loc_value = str(sentence.to_dict(tag_type="ner")["entities"][i]["text"]) - loc_entities.append([loc_value]) + loc_entities.append(loc_value) if is_org: org_text_entity = str_main - org_entities.append([org_text_entity,start_pos,end_pos]) + org_entities.append(org_text_entity) type_entities.append(["ORG",start_pos,end_pos]) if is_per: per_text_entity = str_main - per_entities.append([per_text_entity,start_pos,end_pos]) + per_entities.append(per_text_entity) type_entities.append(["PER",start_pos,end_pos]) if str_main is not None and (start_pos != -1 and end_pos != -1): final_gram = None From 15103902d03033e4ba3473fbc733e12b4f1b3149 Mon Sep 17 00:00:00 2001 From: shounak-kundu Date: Fri, 10 Jun 2022 16:38:55 +0530 Subject: [PATCH 7/8] Bootleg changes --- bootleg/end2end/bootleg_annotator.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/bootleg/end2end/bootleg_annotator.py b/bootleg/end2end/bootleg_annotator.py index d5538435..2bdca0f9 100644 --- a/bootleg/end2end/bootleg_annotator.py +++ b/bootleg/end2end/bootleg_annotator.py @@ -144,7 +144,6 @@ class BootlegAnnotator(object): cache_dir: cache directory (default None) model_name: model name (default None) entity_emb_file: entity embedding file (default None) - entity_dir_loc: entity directory location (default None) return_embs: whether to return embeddings or not (default False) extract_method: mention extraction method verbose: verbose boolean (default False) @@ -160,7 +159,6 @@ def __init__( cache_dir: str = None, model_name: str = None, entity_emb_file: str = None, - entity_dir_loc: str=None, return_embs: bool = False, extract_method: str = "ngram_spacy", verbose: bool = False, @@ -173,7 +171,6 @@ def __init__( self.return_embs = return_embs self.entity_emb_file = entity_emb_file self.extract_method = extract_method - self.entity_dir_location = entity_dir_loc if self.entity_emb_file is not None: assert Path( @@ -254,7 +251,7 @@ def __init__( edit_mode=False ) self.entity_profile = EntityProfile.load_from_cache(\ - load_dir=self.entity_dir_location,\ + load_dir=self.config.data_config.entity_dir,\ no_type=True,edit_mode=False,\ verbose=True) self.all_aliases_trie = self.entity_db.get_all_alias_vocabtrie() From f2eb6bea1e76f74d2034bbf325815a2f51b5aa5a Mon Sep 17 00:00:00 2001 From: shounak-kundu Date: Tue, 14 Jun 2022 08:15:42 +0530 Subject: [PATCH 8/8] Minor typos --- bootleg/end2end/bootleg_annotator.py | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/bootleg/end2end/bootleg_annotator.py b/bootleg/end2end/bootleg_annotator.py index 2bdca0f9..4b979182 100644 --- a/bootleg/end2end/bootleg_annotator.py +++ b/bootleg/end2end/bootleg_annotator.py @@ -17,7 +17,6 @@ from emmental.model import EmmentalModel from tqdm.auto import tqdm from transformers import AutoTokenizer - from bootleg.dataset import extract_context, get_entity_string from bootleg.end2end.annotator_utils import DownloadProgressBar from bootleg.end2end.extract_mentions import MENTION_EXTRACTOR_OPTIONS @@ -40,7 +39,6 @@ } - def get_default_cache(): """Get default cache directory for saving Bootleg data.""" try: @@ -56,7 +54,6 @@ def get_default_cache(): ) return Path(torch_cache_home) / "bootleg" - def create_config(model_path, data_path, model_name): """Create Bootleg config. @@ -90,7 +87,6 @@ def create_config(model_path, data_path, model_name): config_args = parse_boot_and_emm_args(config_args) return config_args - def create_sources(model_path, data_path, model_name): """Download Bootleg data and saves in log dir. @@ -250,10 +246,10 @@ def __init__( alias_idx_dir=self.config.data_config.alias_idx_map, edit_mode=False ) - self.entity_profile = EntityProfile.load_from_cache(\ - load_dir=self.config.data_config.entity_dir,\ - no_type=True,edit_mode=False,\ - verbose=True) + # self.entity_profile = EntityProfile.load_from_cache(\ + # load_dir=self.config.data_config.entity_dir,\ + # no_type=True,edit_mode=False,\ + # verbose=True) self.all_aliases_trie = self.entity_db.get_all_alias_vocabtrie() add_entity_type = self.config.data_config.entity_type_data.use_entity_types @@ -270,14 +266,14 @@ def __init__( add_entity_kg = self.config.data_config.entity_kg_data.use_entity_kg self.kg_symbols = None # If we do not have self.entity_emb_file, then need to generate entity encoder input with metadata - if add_entity_kg and self.entity_emb_file is None: - logger.debug("Reading entity kg database") - self.kg_symbols = KGSymbols.load_from_cache( - os.path.join( - self.config.data_config.entity_dir, - self.config.data_config.entity_kg_data.kg_symbols_dir, - ) + # if add_entity_kg and self.entity_emb_file is None: + logger.debug("Reading entity kg database") + self.kg_symbols = KGSymbols.load_from_cache( + os.path.join( + self.config.data_config.entity_dir, + self.config.data_config.entity_kg_data.kg_symbols_dir, ) + ) logger.debug("Reading word tokenizers") self.tokenizer = AutoTokenizer.from_pretrained( self.config.data_config.word_embedding.bert_model, @@ -624,7 +620,7 @@ def label_mentions( final_all_cands[idx_unq].append(entity_cands) final_cand_probs[idx_unq].append(probs_ex) final_pred_cands[idx_unq].append(pred_qid) - entity_relation_dict=self.entity_profile.get_relations_tails_for_qid(pred_qid) + entity_relation_dict=self.kg_symbols.get_relations_tails_for_qid(pred_qid) if 'instance of' in entity_relation_dict: instance_of_list = entity_relation_dict['instance of'] if 'Q5' not in instance_of_list: ## Q5 means human