#!/usr/bin/env python3
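# RuleRobo: a simple retrieval-based chatbot. Each user query is vectorized
# with TF-IDF alongside the corpus sentences, and the most similar corpus
# sentence (by cosine similarity) is returned as the answer.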
import nltk
import numpy as np
import random
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# `text` provides ENGLISH_STOP_WORDS, extended with custom stop words below
from sklearn.feature_extraction import text
# Workaround for SSL certificate errors when downloading nltk data
# (disables certificate verification, which is not recommended):
# import ssl
# try:
#     _create_unverified_https_context = ssl._create_unverified_context
# except AttributeError:
#     pass
# else:
#     ssl._create_default_https_context = _create_unverified_https_context
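# Note: the tokenizers and lemmatizer below need the nltk data packages
# 'punkt' and 'wordnet' (download once via nltk.download).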
from corpusreader import read_corpus, corpus_keyword_detector

raw_doc = read_corpus(filename='tech_corpus.txt')
corpus_keywords = corpus_keyword_detector(filename='tech_corpus.txt')
# Alternative: read the corpus from an image instead of a text file
# raw_doc = read_corpus(image=True)
# corpus_keywords = corpus_keyword_detector(image=True)

article_sentences = nltk.sent_tokenize(raw_doc)
article_words = nltk.word_tokenize(raw_doc)
# Remove punctuation from the user input and lemmatize it
wnlemmatizer = nltk.stem.WordNetLemmatizer()

def perform_lemmatization(tokens):
    """Take a list of tokens and return the corresponding list of lemmas."""
    return [wnlemmatizer.lemmatize(token) for token in tokens]

punctuation_removal = dict((ord(punctuation), None) for punctuation in string.punctuation)

def get_processed_text(document):
    """Lower-case the text, strip its punctuation, then tokenize and lemmatize it."""
    return perform_lemmatization(nltk.word_tokenize(document.lower().translate(punctuation_removal)))
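# Greetings are handled with a hard-coded keyword list rather than the
# TF-IDF retrieval used for corpus questions.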
greeting_inputs = ("hey", "good morning", "good evening", "morning", "evening", "hi", "whatsup")
greeting_responses = ["hey", "hey hows you?", "*nods*", "hello, how you doing"]

def generate_greeting_response(greeting):
    for token in greeting.split():
        if token.lower() in greeting_inputs:
            return random.choice(greeting_responses)
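# Core retrieval step: the user input is appended to the corpus sentences,
# everything is vectorized with TF-IDF, and the corpus sentence whose vector
# is closest to the input (by cosine similarity) is returned.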
def generate_response(user_input):
    RuleRobo_response = ''
    article_sentences.append(user_input)
    my_additional_stop_words = ['-', ';', '.', ':', '!', '?', 'ha', 'le', 'u', 'wa']
    word_vectorizer = TfidfVectorizer(tokenizer=get_processed_text,
                                      stop_words=list(text.ENGLISH_STOP_WORDS.union(my_additional_stop_words)))
    all_word_vectors = word_vectorizer.fit_transform(article_sentences)
    # Similarity of the user input (the last row) against every sentence, itself included
    similar_vector_values = cosine_similarity(all_word_vectors[-1], all_word_vectors)
    # The best match is the input against itself, so take the second-highest score
    similar_sentence_number = similar_vector_values.argsort()[0][-2]
    matched_vector = similar_vector_values.flatten()
    matched_vector.sort()
    vector_matched = matched_vector[-2]
    if vector_matched == 0:
        # No corpus sentence shares any (non-stop) word with the input
        return f'{RuleRobo_response} could you please rephrase it...my domain knowledge is limited to {",".join(corpus_keywords)}'
    else:
        RuleRobo_response = RuleRobo_response + article_sentences[similar_sentence_number]
        return RuleRobo_response
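# Interactive loop: 'bye' ends the session, thanks gets a farewell, greetings
# get a canned reply, and everything else goes through TF-IDF retrieval.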
continue_dialogue = True
print(f'\033[1;32;40m RuleRobo. Ask me any corpus related question...my domain knowledge is limited to {",".join(corpus_keywords)} \033[0m')
while continue_dialogue:
    human_text = input().lower()
    if human_text != 'bye':
        if human_text in ('thanks', 'thank you very much', 'thank you'):
            continue_dialogue = False
            print("\033[1;32;40m RuleRobo: Most welcome")
        else:
            greeting = generate_greeting_response(human_text)
            if greeting is not None:
                print("\033[1;32;40m RuleRobo: " + greeting)
            else:
                print("\033[1;32;40m RuleRobo: ", end="")
                print(generate_response(human_text))
                # generate_response appended the input to the corpus; remove it again
                article_sentences.remove(human_text)
    else:
        continue_dialogue = False
        print("\033[1;32;40m RuleRobo: Good bye...")