Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
3654f79
Merge pull request #12 from vi3k6i5/develop
vi3k6i5 Nov 10, 2017
105b24a
Merge pull request #13 from vi3k6i5/develop
vi3k6i5 Nov 10, 2017
43e972a
Merge pull request #25 from vi3k6i5/develop
vi3k6i5 Nov 19, 2017
86f01ef
Added test case to get all keywords from kp
vi3k6i5 Nov 19, 2017
92ebec9
added test case to check term in kp
vi3k6i5 Nov 19, 2017
b5bf2aa
added test case to check term in kp
vi3k6i5 Nov 19, 2017
f2440b9
added test case coverage
vi3k6i5 Nov 19, 2017
5478cfd
Merge pull request #26 from vi3k6i5/develop
vi3k6i5 Nov 19, 2017
90d3cd1
added citation
vi3k6i5 Nov 21, 2017
06066e0
Merge pull request #27 from vi3k6i5/develop
vi3k6i5 Nov 21, 2017
ae2d85d
corrected citation
vi3k6i5 Nov 21, 2017
9adde82
Merge pull request #28 from vi3k6i5/develop
vi3k6i5 Nov 21, 2017
7090989
added feature to return span_info when using extract_keywords
vi3k6i5 Nov 21, 2017
77d6bc7
added docs for span_info
vi3k6i5 Nov 21, 2017
f18c3c8
updated version to 2.5
vi3k6i5 Nov 21, 2017
2255b50
Merge pull request #29 from vi3k6i5/develop
vi3k6i5 Nov 21, 2017
a966304
improved example of span_info
vi3k6i5 Nov 21, 2017
1c970b5
added example for getting extra info with extract keywords
vi3k6i5 Dec 7, 2017
8adc57d
Fix typos:
delirious-lettuce Dec 15, 2017
c8a01ae
Merge pull request #41 from delirious-lettuce/fix_typos
vi3k6i5 Dec 18, 2017
eb2c6ca
added fix for encoding of file
vi3k6i5 Jan 19, 2018
4d1ed19
Merge branch 'master' of github.com:vi3k6i5/flashtext
vi3k6i5 Jan 19, 2018
599a836
Fix issue with incomplete keyword at the end of the sentence
killfactory Jan 21, 2018
dec72ad
added comment for encoding parameter
vi3k6i5 Jan 26, 2018
33355ce
Merge pull request #45 from killfactory/master
vi3k6i5 Jan 26, 2018
5591859
added bug fix for https://github.com/vi3k6i5/flashtext/issues/47
vi3k6i5 Feb 16, 2018
ac25fc3
Performance improvement for string manipulation
rompom Jun 21, 2018
50c45f1
Merge pull request #55 from rompom/master
vi3k6i5 Nov 9, 2018
f982ba4
set the _keyword=set() for some keyword has multiple different clean_…
Drxan Dec 20, 2018
5b4d8cd
Some key_words have multiple different clean_names
Drxan Dec 20, 2018
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .coveragerc
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
[run]
omit = test/*
omit =
test/*
setup.py
40 changes: 37 additions & 3 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,24 @@ Case Sensitive example
>>> keywords_found
>>> # ['Bay Area']

Span of keywords extracted
>>> from flashtext import KeywordProcessor
>>> keyword_processor = KeywordProcessor()
>>> keyword_processor.add_keyword('Big Apple', 'New York')
>>> keyword_processor.add_keyword('Bay Area')
>>> keywords_found = keyword_processor.extract_keywords('I love big Apple and Bay Area.', span_info=True)
>>> keywords_found
>>> # [('New York', 7, 16), ('Bay Area', 21, 29)]

Get Extra information with keywords extracted
>>> from flashtext import KeywordProcessor
>>> kp = KeywordProcessor()
>>> kp.add_keyword('Taj Mahal', ('Monument', 'Taj Mahal'))
>>> kp.add_keyword('Delhi', ('Location', 'Delhi'))
>>> kp.extract_keywords('Taj Mahal is in Delhi.')
>>> # [('Monument', 'Taj Mahal'), ('Location', 'Delhi')]
>>> # NOTE: replace_keywords feature won't work with this.

No clean name for Keywords
>>> from flashtext import KeywordProcessor
>>> keyword_processor = KeywordProcessor()
Expand Down Expand Up @@ -134,9 +152,9 @@ Get all keywords in dictionary
>>> from flashtext import KeywordProcessor
>>> keyword_processor = KeywordProcessor()
>>> keyword_processor.add_keyword('j2ee', 'Java')
>>> keyword_processor.add_keyword('onGoing', 'rendom')
>>> keyword_processor.add_keyword('colour', 'color')
>>> keyword_processor.get_all_keywords()
>>> # output: {'j2ee': 'Java', 'ongoing': 'rendom'}
>>> # output: {'colour': 'color', 'j2ee': 'Java'}

When detecting word boundaries, any character outside `\\w` (`[A-Za-z0-9_]`) is currently treated as a word boundary.

Expand Down Expand Up @@ -199,11 +217,27 @@ The idea for this library came from the following `StackOverflow question
<https://stackoverflow.com/questions/44178449/regex-replace-is-taking-time-for-millions-of-documents-how-to-make-it-faster>`_.


References
Citation
----------

The original paper on the `FlashText algorithm <https://arxiv.org/abs/1711.00046>`_ is available on arXiv.

::

@ARTICLE{2017arXiv171100046S,
author = {{Singh}, V.},
title = "{Replace or Retrieve Keywords In Documents at Scale}",
journal = {ArXiv e-prints},
archivePrefix = "arXiv",
eprint = {1711.00046},
primaryClass = "cs.DS",
keywords = {Computer Science - Data Structures and Algorithms},
year = 2017,
month = oct,
adsurl = {http://adsabs.harvard.edu/abs/2017arXiv171100046S},
adsnote = {Provided by the SAO/NASA Astrophysics Data System}
}

The article was published on `Medium freeCodeCamp <https://medium.freecodecamp.org/regex-was-taking-5-days-flashtext-does-it-in-15-minutes-55f04411025f>`_.


Expand Down
40 changes: 37 additions & 3 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,24 @@ Case Sensitive example
>>> keywords_found
>>> # ['Bay Area']

Span of keywords extracted
>>> from flashtext import KeywordProcessor
>>> keyword_processor = KeywordProcessor()
>>> keyword_processor.add_keyword('Big Apple', 'New York')
>>> keyword_processor.add_keyword('Bay Area')
>>> keywords_found = keyword_processor.extract_keywords('I love big Apple and Bay Area.', span_info=True)
>>> keywords_found
>>> # [('New York', 7, 16), ('Bay Area', 21, 29)]

Get Extra information with keywords extracted
>>> from flashtext import KeywordProcessor
>>> kp = KeywordProcessor()
>>> kp.add_keyword('Taj Mahal', ('Monument', 'Taj Mahal'))
>>> kp.add_keyword('Delhi', ('Location', 'Delhi'))
>>> kp.extract_keywords('Taj Mahal is in Delhi.')
>>> # [('Monument', 'Taj Mahal'), ('Location', 'Delhi')]
>>> # NOTE: replace_keywords feature won't work with this.

No clean name for Keywords
>>> from flashtext import KeywordProcessor
>>> keyword_processor = KeywordProcessor()
Expand Down Expand Up @@ -131,9 +149,9 @@ Get all keywords in dictionary
>>> from flashtext import KeywordProcessor
>>> keyword_processor = KeywordProcessor()
>>> keyword_processor.add_keyword('j2ee', 'Java')
>>> keyword_processor.add_keyword('onGoing', 'rendom')
>>> keyword_processor.add_keyword('colour', 'color')
>>> keyword_processor.get_all_keywords()
>>> # output: {'j2ee': 'Java', 'ongoing': 'rendom'}
>>> # output: {'colour': 'color', 'j2ee': 'Java'}

When detecting word boundaries, any character outside `\\w` (`[A-Za-z0-9_]`) is currently treated as a word boundary.

Expand Down Expand Up @@ -207,11 +225,27 @@ The idea for this library came from the following `StackOverflow question
<https://stackoverflow.com/questions/44178449/regex-replace-is-taking-time-for-millions-of-documents-how-to-make-it-faster>`_.


References
Citation
----------

The original paper on the `FlashText algorithm <https://arxiv.org/abs/1711.00046>`_ is available on arXiv.

::

@ARTICLE{2017arXiv171100046S,
author = {{Singh}, V.},
title = "{Replace or Retrieve Keywords In Documents at Scale}",
journal = {ArXiv e-prints},
archivePrefix = "arXiv",
eprint = {1711.00046},
primaryClass = "cs.DS",
keywords = {Computer Science - Data Structures and Algorithms},
year = 2017,
month = oct,
adsurl = {http://adsabs.harvard.edu/abs/2017arXiv171100046S},
adsnote = {Provided by the SAO/NASA Astrophysics Data System}
}

The article was published on `Medium freeCodeCamp <https://medium.freecodecamp.org/regex-was-taking-5-days-flashtext-does-it-in-15-minutes-55f04411025f>`_.


Expand Down
60 changes: 41 additions & 19 deletions flashtext/keyword.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import string
import io


class KeywordProcessor(object):
Expand Down Expand Up @@ -148,7 +149,12 @@ def __setitem__(self, keyword, clean_name=None):
if self._keyword not in current_dict:
status = True
self._terms_in_trie += 1
current_dict[self._keyword] = clean_name
current_dict[self._keyword] = clean_name
else:
status = True
clean_names = set(current_dict[self._keyword].split('_|_'))
clean_names.add(clean_name)
current_dict[self._keyword] = '_|_'.join(clean_names)
return status

def __delitem__(self, keyword):
Expand All @@ -173,8 +179,12 @@ def __delitem__(self, keyword):
if letter in current_dict:
character_trie_list.append((letter, current_dict))
current_dict = current_dict[letter]
# remove the charactes from trie dict if there are no other keywords with them
if self._keyword in current_dict:
else:
# if character is not found, break out of the loop
current_dict = None
break
# remove the characters from trie dict if there are no other keywords with them
if current_dict and self._keyword in current_dict:
# we found a complete match for input keyword.
character_trie_list.append((self._keyword, current_dict))
character_trie_list.reverse()
Expand Down Expand Up @@ -283,11 +293,12 @@ def get_keyword(self, word):
"""
return self.__getitem__(word)

def add_keyword_from_file(self, keyword_file):
def add_keyword_from_file(self, keyword_file, encoding="utf-8"):
"""To add keywords from a file

Args:
keyword_file : path to keywords file
encoding : specify the encoding of the file

Examples:
keywords file format can be like:
Expand All @@ -311,7 +322,7 @@ def add_keyword_from_file(self, keyword_file):
"""
if not os.path.isfile(keyword_file):
raise IOError("Invalid file path {}".format(keyword_file))
with open(keyword_file)as f:
with io.open(keyword_file, encoding=encoding) as f:
for line in f:
if '=>' in line:
keyword, clean_name = line.split('=>')
Expand Down Expand Up @@ -381,7 +392,7 @@ def add_keywords_from_list(self, keyword_list):

"""
if not isinstance(keyword_list, list):
raise AttributeError("keyword_list should be a list")
raise AttributeError("keyword_list should be a list")

for keyword in keyword_list:
self.add_keyword(keyword)
Expand Down Expand Up @@ -410,7 +421,7 @@ def get_all_keywords(self, term_so_far='', current_dict=None):

Args:
term_so_far : string
term built so far by adding all previous charactes
term built so far by adding all previous characters
current_dict : dict
current recursive position in dictionary

Expand Down Expand Up @@ -441,7 +452,7 @@ def get_all_keywords(self, term_so_far='', current_dict=None):
terms_present[key] = sub_values[key]
return terms_present

def extract_keywords(self, sentence):
def extract_keywords(self, sentence, span_info=False):
"""Searches in the string for all keywords present in corpus.
Keywords present are added to a list `keywords_extracted` and returned.

Expand All @@ -468,7 +479,9 @@ def extract_keywords(self, sentence):
if not self.case_sensitive:
sentence = sentence.lower()
current_dict = self.keyword_trie_dict
sequence_start_pos = 0
sequence_end_pos = 0
reset_current_dict = False
idx = 0
sentence_len = len(sentence)
while idx < sentence_len:
Expand Down Expand Up @@ -515,17 +528,19 @@ def extract_keywords(self, sentence):
idx = sequence_end_pos
current_dict = self.keyword_trie_dict
if longest_sequence_found:
keywords_extracted.append(longest_sequence_found)

keywords_extracted.append((longest_sequence_found, sequence_start_pos, idx))
reset_current_dict = True
else:
# we reset current_dict
current_dict = self.keyword_trie_dict
reset_current_dict = True
elif char in current_dict:
# we can continue from this char
current_dict = current_dict[char]
else:
# we reset current_dict
current_dict = self.keyword_trie_dict
reset_current_dict = True
# skip to end of word
idy = idx + 1
while idy < sentence_len:
Expand All @@ -538,9 +553,14 @@ def extract_keywords(self, sentence):
if idx + 1 >= sentence_len:
if self._keyword in current_dict:
sequence_found = current_dict[self._keyword]
keywords_extracted.append(sequence_found)
keywords_extracted.append((sequence_found, sequence_start_pos, sentence_len))
idx += 1
return keywords_extracted
if reset_current_dict:
reset_current_dict = False
sequence_start_pos = idx
if span_info:
return keywords_extracted
return [value[0] for value in keywords_extracted]

def replace_keywords(self, sentence):
"""Searches in the string for all keywords present in corpus.
Expand All @@ -565,7 +585,7 @@ def replace_keywords(self, sentence):
if not sentence:
# if sentence is empty or none just return the same.
return sentence
new_sentence = ''
new_sentence = []
orig_sentence = sentence
if not self.case_sensitive:
sentence = sentence.lower()
Expand Down Expand Up @@ -624,17 +644,17 @@ def replace_keywords(self, sentence):
current_word = current_word_continued
current_dict = self.keyword_trie_dict
if longest_sequence_found:
new_sentence += longest_sequence_found + current_white_space
new_sentence.append(longest_sequence_found + current_white_space)
current_word = ''
current_white_space = ''
else:
new_sentence += current_word
new_sentence.append(current_word)
current_word = ''
current_white_space = ''
else:
# we reset current_dict
current_dict = self.keyword_trie_dict
new_sentence += current_word
new_sentence.append(current_word)
current_word = ''
current_white_space = ''
elif char in current_dict:
Expand All @@ -652,13 +672,15 @@ def replace_keywords(self, sentence):
break
idy += 1
idx = idy
new_sentence += current_word
new_sentence.append(current_word)
current_word = ''
current_white_space = ''
# if we are end of sentence and have a sequence discovered
if idx + 1 >= sentence_len:
if self._keyword in current_dict:
sequence_found = current_dict[self._keyword]
new_sentence += sequence_found
new_sentence.append(sequence_found)
else:
new_sentence.append(current_word)
idx += 1
return new_sentence
return "".join(new_sentence)
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def run(self):
raise SystemExit(errno)

name = 'flashtext'
version = '2.4'
version = '2.7'

cmdclass = {'test': PyTest}

Expand Down
Loading