vi3k6i5 · Drxan · Nov 10, 2017 · Nov 10, 2017 · Nov 19, 2017 · Nov 19, 2017
diff --git a/.coveragerc b/.coveragerc
@@ -1,2 +1,4 @@
 [run]
-omit = test/*
+omit = 
+    test/*
+    setup.py
diff --git a/README.rst b/README.rst
@@ -67,6 +67,24 @@ Case Sensitive example
     >>> keywords_found
     >>> # ['Bay Area']
 
+Span of keywords extracted
+    >>> from flashtext import KeywordProcessor
+    >>> keyword_processor = KeywordProcessor()
+    >>> keyword_processor.add_keyword('Big Apple', 'New York')
+    >>> keyword_processor.add_keyword('Bay Area')
+    >>> keywords_found = keyword_processor.extract_keywords('I love big Apple and Bay Area.', span_info=True)
+    >>> keywords_found
+    >>> # [('New York', 7, 16), ('Bay Area', 21, 29)]
+
+Get extra information with extracted keywords
+    >>> from flashtext import KeywordProcessor
+    >>> kp = KeywordProcessor()
+    >>> kp.add_keyword('Taj Mahal', ('Monument', 'Taj Mahal'))
+    >>> kp.add_keyword('Delhi', ('Location', 'Delhi'))
+    >>> kp.extract_keywords('Taj Mahal is in Delhi.')
+    >>> # [('Monument', 'Taj Mahal'), ('Location', 'Delhi')]
+    >>> # NOTE: replace_keywords won't work when clean names are not strings (e.g. tuples).
+
 No clean name for Keywords
     >>> from flashtext import KeywordProcessor
     >>> keyword_processor = KeywordProcessor()
@@ -134,9 +152,9 @@ Get all keywords in dictionary
     >>> from flashtext import KeywordProcessor
     >>> keyword_processor = KeywordProcessor()
     >>> keyword_processor.add_keyword('j2ee', 'Java')
-    >>> keyword_processor.add_keyword('onGoing', 'rendom')
+    >>> keyword_processor.add_keyword('colour', 'color')
     >>> keyword_processor.get_all_keywords()
-    >>> # output: {'j2ee': 'Java', 'ongoing': 'rendom'}
+    >>> # output: {'colour': 'color', 'j2ee': 'Java'}
 
 For detecting Word Boundary currently any character other than this `\\w` `[A-Za-z0-9_]` is considered a word boundary.
 
@@ -199,11 +217,27 @@ The idea for this library came from the following `StackOverflow question
 <https://stackoverflow.com/questions/44178449/regex-replace-is-taking-time-for-millions-of-documents-how-to-make-it-faster>`_.
 
 
-References
+Citation
 ----------
 
 The original paper published on `FlashText algorithm <https://arxiv.org/abs/1711.00046>`_.
 
+::
+
+    @ARTICLE{2017arXiv171100046S,
+       author = {{Singh}, V.},
+        title = "{Replace or Retrieve Keywords In Documents at Scale}",
+      journal = {ArXiv e-prints},
+    archivePrefix = "arXiv",
+       eprint = {1711.00046},
+     primaryClass = "cs.DS",
+     keywords = {Computer Science - Data Structures and Algorithms},
+         year = 2017,
+        month = oct,
+       adsurl = {http://adsabs.harvard.edu/abs/2017arXiv171100046S},
+      adsnote = {Provided by the SAO/NASA Astrophysics Data System}
+    }
+
 The article published on `Medium freeCodeCamp <https://medium.freecodecamp.org/regex-was-taking-5-days-flashtext-does-it-in-15-minutes-55f04411025f>`_.
 
 

diff --git a/docs/index.rst b/docs/index.rst
@@ -64,6 +64,24 @@ Case Sensitive example
     >>> keywords_found
     >>> # ['Bay Area']
 
+Span of keywords extracted
+    >>> from flashtext import KeywordProcessor
+    >>> keyword_processor = KeywordProcessor()
+    >>> keyword_processor.add_keyword('Big Apple', 'New York')
+    >>> keyword_processor.add_keyword('Bay Area')
+    >>> keywords_found = keyword_processor.extract_keywords('I love big Apple and Bay Area.', span_info=True)
+    >>> keywords_found
+    >>> # [('New York', 7, 16), ('Bay Area', 21, 29)]
+
+Get extra information with extracted keywords
+    >>> from flashtext import KeywordProcessor
+    >>> kp = KeywordProcessor()
+    >>> kp.add_keyword('Taj Mahal', ('Monument', 'Taj Mahal'))
+    >>> kp.add_keyword('Delhi', ('Location', 'Delhi'))
+    >>> kp.extract_keywords('Taj Mahal is in Delhi.')
+    >>> # [('Monument', 'Taj Mahal'), ('Location', 'Delhi')]
+    >>> # NOTE: replace_keywords won't work when clean names are not strings (e.g. tuples).
+
 No clean name for Keywords
     >>> from flashtext import KeywordProcessor
     >>> keyword_processor = KeywordProcessor()
@@ -131,9 +149,9 @@ Get all keywords in dictionary
     >>> from flashtext import KeywordProcessor
     >>> keyword_processor = KeywordProcessor()
     >>> keyword_processor.add_keyword('j2ee', 'Java')
-    >>> keyword_processor.add_keyword('onGoing', 'rendom')
+    >>> keyword_processor.add_keyword('colour', 'color')
     >>> keyword_processor.get_all_keywords()
-    >>> # output: {'j2ee': 'Java', 'ongoing': 'rendom'}
+    >>> # output: {'colour': 'color', 'j2ee': 'Java'}
 
 For detecting Word Boundary currently any character other than this `\\w` `[A-Za-z0-9_]` is considered a word boundary.
 
@@ -207,11 +225,27 @@ The idea for this library came from the following `StackOverflow question
 <https://stackoverflow.com/questions/44178449/regex-replace-is-taking-time-for-millions-of-documents-how-to-make-it-faster>`_.
 
 
-References
+Citation
 ----------
 
 The original paper published on `FlashText algorithm <https://arxiv.org/abs/1711.00046>`_.
 
+::
+
+    @ARTICLE{2017arXiv171100046S,
+       author = {{Singh}, V.},
+        title = "{Replace or Retrieve Keywords In Documents at Scale}",
+      journal = {ArXiv e-prints},
+    archivePrefix = "arXiv",
+       eprint = {1711.00046},
+     primaryClass = "cs.DS",
+     keywords = {Computer Science - Data Structures and Algorithms},
+         year = 2017,
+        month = oct,
+       adsurl = {http://adsabs.harvard.edu/abs/2017arXiv171100046S},
+      adsnote = {Provided by the SAO/NASA Astrophysics Data System}
+    }
+
 The article published on `Medium freeCodeCamp <https://medium.freecodecamp.org/regex-was-taking-5-days-flashtext-does-it-in-15-minutes-55f04411025f>`_.
 
 

diff --git a/flashtext/keyword.py b/flashtext/keyword.py
@@ -1,5 +1,6 @@
 import os
 import string
+import io
 
 
 class KeywordProcessor(object):
@@ -148,7 +149,12 @@ def __setitem__(self, keyword, clean_name=None):
             if self._keyword not in current_dict:
                 status = True
                 self._terms_in_trie += 1
-            current_dict[self._keyword] = clean_name
+                current_dict[self._keyword] = clean_name
+            else:
+                status = True
+                clean_names = set(current_dict[self._keyword].split('_|_'))
+                clean_names.add(clean_name)
+                current_dict[self._keyword] = '_|_'.join(clean_names)
         return status
 
     def __delitem__(self, keyword):
@@ -173,8 +179,12 @@ def __delitem__(self, keyword):
                 if letter in current_dict:
                     character_trie_list.append((letter, current_dict))
                     current_dict = current_dict[letter]
-            # remove the charactes from trie dict if there are no other keywords with them
-            if self._keyword in current_dict:
+                else:
+                    # if character is not found, break out of the loop
+                    current_dict = None
+                    break
+            # remove the characters from trie dict if there are no other keywords with them
+            if current_dict and self._keyword in current_dict:
                 # we found a complete match for input keyword.
                 character_trie_list.append((self._keyword, current_dict))
                 character_trie_list.reverse()
@@ -283,11 +293,12 @@ def get_keyword(self, word):
         """
         return self.__getitem__(word)
 
-    def add_keyword_from_file(self, keyword_file):
+    def add_keyword_from_file(self, keyword_file, encoding="utf-8"):
         """To add keywords from a file
 
         Args:
             keyword_file : path to keywords file
+            encoding : specify the encoding of the file
 
         Examples:
             keywords file format can be like:
@@ -311,7 +322,7 @@ def add_keyword_from_file(self, keyword_file):
         """
         if not os.path.isfile(keyword_file):
             raise IOError("Invalid file path {}".format(keyword_file))
-        with open(keyword_file)as f:
+        with io.open(keyword_file, encoding=encoding) as f:
             for line in f:
                 if '=>' in line:
                     keyword, clean_name = line.split('=>')
@@ -381,7 +392,7 @@ def add_keywords_from_list(self, keyword_list):
 
         """
         if not isinstance(keyword_list, list):
-                raise AttributeError("keyword_list should be a list")
+            raise AttributeError("keyword_list should be a list")
 
         for keyword in keyword_list:
             self.add_keyword(keyword)
@@ -410,7 +421,7 @@ def get_all_keywords(self, term_so_far='', current_dict=None):
 
         Args:
             term_so_far : string
-                term built so far by adding all previous charactes
+                term built so far by adding all previous characters
             current_dict : dict
                 current recursive position in dictionary
 
@@ -441,7 +452,7 @@ def get_all_keywords(self, term_so_far='', current_dict=None):
                     terms_present[key] = sub_values[key]
         return terms_present
 
-    def extract_keywords(self, sentence):
+    def extract_keywords(self, sentence, span_info=False):
         """Searches in the string for all keywords present in corpus.
         Keywords present are added to a list `keywords_extracted` and returned.
 
@@ -468,7 +479,9 @@ def extract_keywords(self, sentence):
         if not self.case_sensitive:
             sentence = sentence.lower()
         current_dict = self.keyword_trie_dict
+        sequence_start_pos = 0
         sequence_end_pos = 0
+        reset_current_dict = False
         idx = 0
         sentence_len = len(sentence)
         while idx < sentence_len:
@@ -515,17 +528,19 @@ def extract_keywords(self, sentence):
                             idx = sequence_end_pos
                     current_dict = self.keyword_trie_dict
                     if longest_sequence_found:
-                        keywords_extracted.append(longest_sequence_found)
-
+                        keywords_extracted.append((longest_sequence_found, sequence_start_pos, idx))
+                    reset_current_dict = True
                 else:
                     # we reset current_dict
                     current_dict = self.keyword_trie_dict
+                    reset_current_dict = True
             elif char in current_dict:
                 # we can continue from this char
                 current_dict = current_dict[char]
             else:
                 # we reset current_dict
                 current_dict = self.keyword_trie_dict
+                reset_current_dict = True
                 # skip to end of word
                 idy = idx + 1
                 while idy < sentence_len:
@@ -538,9 +553,14 @@ def extract_keywords(self, sentence):
             if idx + 1 >= sentence_len:
                 if self._keyword in current_dict:
                     sequence_found = current_dict[self._keyword]
-                    keywords_extracted.append(sequence_found)
+                    keywords_extracted.append((sequence_found, sequence_start_pos, sentence_len))
             idx += 1
-        return keywords_extracted
+            if reset_current_dict:
+                reset_current_dict = False
+                sequence_start_pos = idx
+        if span_info:
+            return keywords_extracted
+        return [value[0] for value in keywords_extracted]
 
     def replace_keywords(self, sentence):
         """Searches in the string for all keywords present in corpus.
@@ -565,7 +585,7 @@ def replace_keywords(self, sentence):
         if not sentence:
             # if sentence is empty or none just return the same.
             return sentence
-        new_sentence = ''
+        new_sentence = []
         orig_sentence = sentence
         if not self.case_sensitive:
             sentence = sentence.lower()
@@ -624,17 +644,17 @@ def replace_keywords(self, sentence):
                             current_word = current_word_continued
                     current_dict = self.keyword_trie_dict
                     if longest_sequence_found:
-                        new_sentence += longest_sequence_found + current_white_space
+                        new_sentence.append(longest_sequence_found + current_white_space)
                         current_word = ''
                         current_white_space = ''
                     else:
-                        new_sentence += current_word
+                        new_sentence.append(current_word)
                         current_word = ''
                         current_white_space = ''
                 else:
                     # we reset current_dict
                     current_dict = self.keyword_trie_dict
-                    new_sentence += current_word
+                    new_sentence.append(current_word)
                     current_word = ''
                     current_white_space = ''
             elif char in current_dict:
@@ -652,13 +672,15 @@ def replace_keywords(self, sentence):
                         break
                     idy += 1
                 idx = idy
-                new_sentence += current_word
+                new_sentence.append(current_word)
                 current_word = ''
                 current_white_space = ''
             # if we are end of sentence and have a sequence discovered
             if idx + 1 >= sentence_len:
                 if self._keyword in current_dict:
                     sequence_found = current_dict[self._keyword]
-                    new_sentence += sequence_found
+                    new_sentence.append(sequence_found)
+                else:
+                    new_sentence.append(current_word)
             idx += 1
-        return new_sentence
+        return "".join(new_sentence)
diff --git a/setup.py b/setup.py
@@ -16,7 +16,7 @@ def run(self):
         raise SystemExit(errno)
 
 name = 'flashtext'
-version = '2.4'
+version = '2.7'
 
 cmdclass = {'test': PyTest}