# functions.py
#%% LIBRARIES
import pandas as pd
import numpy as np
from collections import defaultdict, OrderedDict
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from heapq import heappush, nlargest
import pickle
pd.options.mode.chained_assignment = None #silence pandas' SettingWithCopyWarning for the imputation steps below
#%% DATA CLEANING
def clean(airbnb_data):
"""
    Method that imputes missing (NaN) values and removes duplicate listings
Input: dataframe
Output: cleaned dataframe
"""
#replace NAN values with 0 for average_rate_per_night column
airbnb_data.average_rate_per_night.replace(np.nan, '$0',inplace=True)
#convert to int average_rate_per_night and remove $
    airbnb_data.average_rate_per_night = airbnb_data.average_rate_per_night.replace(r'\$', '', regex=True).astype(int)
#replace NAN values with'unknown' for description, title and latitude and longitude
airbnb_data.description.replace(np.nan,'unknown',inplace=True)
airbnb_data.title.replace(np.nan,'unknown',inplace=True)
airbnb_data.latitude.replace(np.nan,'unknown',inplace=True)
airbnb_data.longitude.replace(np.nan,'unknown',inplace=True)
#check where bedrooms_count doesn't have a value and save indexes of those records to a list
null_value_idx=airbnb_data[airbnb_data.bedrooms_count.isnull()].index
    #if the word 'studio' is mentioned in the description then it is a studio, otherwise 'unknown'
    for idx in null_value_idx:
        if 'studio' in str(airbnb_data.loc[idx, 'description']).lower().split():
            airbnb_data.loc[idx, 'bedrooms_count'] = 'Studio'
        else:
            airbnb_data.loc[idx, 'bedrooms_count'] = 'unknown'
#remove duplicate houses based on the url
airbnb_data.url=airbnb_data.url.apply(lambda x:x.split('?')[0])
airbnb_data.drop_duplicates(subset='url',inplace=True)
return airbnb_data
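#Illustrative usage sketch (not executed here); the CSV file name below is an
#assumption -- substitute the actual Airbnb dataset you are working with:
#   airbnb_data = pd.read_csv('airbnb_listings.csv')   # hypothetical file name
#   airbnb_data = clean(airbnb_data)
#   airbnb_data.average_rate_per_night.dtype            # integer dtype after cleaning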
#%% Making of .tsv files
def create_tsv_documents(airbnb_data):
"""
Method that creates different .tsv files for each record in the airbnb_data
Input: dataframe
"""
#clean data
airbnb_data=clean(airbnb_data)
#for each index make a dataframe of airbnb_data and store it into new tsv file
for i in airbnb_data.index:
pd.DataFrame(airbnb_data.loc[i]).transpose().to_csv('data/doc_'+str(i)+'.tsv',sep='\t')
#%% Making of vocabulary
def preprocessing_text(df):
"""
Method that returns filtered words from the text input
Input: string(text)
Output: list(bag of words)
"""
#remove upper cases
df=df.lower()
    #replace the literal two-character sequence '\n' appearing in the text with a whitespace ' '
df=df.replace('\\n',' ')
#for removing stop words
stop_words = set(stopwords.words('english'))
#for removing punctuations
tokenizer = RegexpTokenizer(r'\w+')
#to tokenize the string
word_tokens = tokenizer.tokenize(df)
#stemming
ps = PorterStemmer()
filtered_words = [ps.stem(w) for w in word_tokens if not w in stop_words]
return filtered_words
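#Illustrative example of the preprocessing (hedged: exact stems depend on the
#installed NLTK data, but the output should look roughly like this):
#   preprocessing_text('Two cozy bedrooms near the river! Great views')
#   -> ['two', 'cozi', 'bedroom', 'near', 'river', 'great', 'view']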
def build_vocabulary(airbnb_data):
"""
    Method that creates the vocabulary
    Input: dataframe (used to iterate over the per-document .tsv files created from it)
    Output: vocabulary (dictionary, key=term, value=term_id) and doc_vocabs (dictionary, key=doc_id, value=list of unique words belonging to that document)
"""
#list for vocabulary
vocabulary_lst=[]
#building a dictionary which will be used for making an inverted index
doc_vocabs=defaultdict(list)
for i in airbnb_data.index:
#take one file
df=pd.read_csv('data/doc_'+str(i)+'.tsv',sep='\t',usecols=['description','title'],encoding='ISO-8859-1')
#preprocessing description and title
df=df.description[0]+' '+df.title[0]
filtered_words=preprocessing_text(df)
#temporary variable set used for making vocabulary with unique words
temp_vocabulary_set=set()
for word in filtered_words:
temp_vocabulary_set.add(word)
vocabulary_lst.append(temp_vocabulary_set)
doc_vocabs[i]=list(temp_vocabulary_set)
#union of content of vocabulary_lst
vocabulary_set=set.union(*vocabulary_lst)
#mapping words into integers
vocabulary={}
for k,v in enumerate(vocabulary_set):
vocabulary[v]= k
return vocabulary,doc_vocabs
def save_vocabulary(vocabulary,file_name):
"""
method that converts vocabulary into a dataframe and saves it into a csv file
input: vocabulary(dictionary, key='term',value='term_id')
"""
vocabulary_dataframe=pd.DataFrame()
vocabulary_dataframe['word']=vocabulary.keys()
vocabulary_dataframe.to_csv(str(file_name)+'.csv')
#%% First search engine with a conjunctive query
def finalize_output(result_set):
"""
method that is used for creating the result dataframe with the columns 'title','description','city','url'
Input: result_set - list of document indices
Output: df - result dataframe
"""
    #initialization of the result df
df=pd.DataFrame()
#iterate through result document indices
for i,val in enumerate(result_set):
        pd.set_option('display.max_colwidth', None)
df=df.append(pd.read_csv('data/doc_'+str(val)+'.tsv',sep='\t',usecols=['description','title','city','url']
,encoding='ISO-8859-1'))
#reset index
df.reset_index(inplace=True)
    #drop the "index" column that reset_index adds (the previous index
    #becomes a new column named "index")
df.drop('index',axis=1,inplace=True)
#return columns in this order
df=df[['title','description','city','url']]
return df
def search_engine(vocabulary,inverted_idx):
"""
method that prints the result dataframe with the columns 'title','description','city','url'
based on user query
Input: vocabulary-dictionary of all words,(key='term',value='term_id')
    inverted_idx-dictionary(key='term_id',value=list of document ids containing that term)
Output: doc_id_lst-list of document id's which are result documents of the query
result_set-dataframe which is result of the query presented to the user
"""
user_query=str(input())
    #preprocess the text the user entered (same steps as when building the vocabulary)
    user_query=preprocessing_text(user_query)
    list_term_idx=[]#one set of doc_ids per query word
result_set=[]
for word in user_query:
        #if the word exists in the vocabulary
if word in vocabulary.keys():
list_term_idx.append(set(inverted_idx[vocabulary[word]]))
else:
list_term_idx.append({'x'})
break
#intersection of sets containing doc_ids
result_set=list(set.intersection(*list_term_idx))
doc_id_lst=result_set
#if intersection is empty set end the method
if 'x' in result_set or len(result_set) == 0:
result_set='No results! Try again!'
print(result_set)
return doc_id_lst,result_set
result_set=finalize_output(result_set)
return doc_id_lst,result_set
def compute_inverted_idx(doc_vocabs,vocabulary):
"""
method that computes an inverted index
input: doc_vocabs(dictionary), vocabulary(dictionary of all unique words, key=term, value=term_id)
output: inverted_idx(dictionary, key=term_id, value=list of document_ids)
"""
#initialize defaultdict for making an inverted index
inverted_idx = defaultdict(list)
    #for every document, append its id to the posting list (term_id entry) of every word it contains
for idx in doc_vocabs.keys():
for word in doc_vocabs[idx]:
inverted_idx[vocabulary[word]].append(idx)
return inverted_idx
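#Shape of the three structures, with illustrative toy values (not real ids):
#   vocabulary   = {'houston': 0, 'studio': 1, ...}    # term    -> term_id
#   doc_vocabs   = {4: ['houston', 'studio'], ...}     # doc_id  -> unique terms in that doc
#   inverted_idx = {0: [4, 17, 256], 1: [4], ...}      # term_id -> doc_ids containing the term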
def save_inverted_idx(inverted_idx):
#save it into a file named inverted_idx.p
pickle.dump(inverted_idx, open("inverted_idx.p", "wb"))
def load_inverted_idx():
#load file named inverted_idx.p
return pickle.load(open("inverted_idx.p", "rb"))
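#End-to-end sketch of the first (conjunctive) engine, assuming the data/doc_*.tsv
#files were already written by create_tsv_documents:
#   vocabulary, doc_vocabs = build_vocabulary(airbnb_data)
#   inverted_idx = compute_inverted_idx(doc_vocabs, vocabulary)
#   save_inverted_idx(inverted_idx)
#   doc_ids, results = search_engine(vocabulary, inverted_idx)   # prompts for a query via input()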
#%% Second search engine with a conjunctive query
# First way
#TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document)
#IDF(t) = log_e(Total number of documents / Number of documents with term t in it)
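#Worked example of the two formulas above (illustrative numbers): a term occurring
#3 times in a 100-token document has TF = 3/100 = 0.03; if it appears in 10 of
#1000 documents, IDF = ln(1000/10) ~ 4.605, so TF-IDF ~ 0.03 * 4.605 ~ 0.138.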
def calculate_tf_idf(airbnb_data,inverted_idx,vocabulary):
"""
    method that computes tf-idf values for every (term, document) pair
    input: airbnb_data - used only to iterate over the per-document .tsv files
inverted_idx(dictionary, key=term_id, value=list of document_ids)
vocabulary(dictionary of all unique words, key=term, value=term_id)
output: tf_idf_dic(dictionary of tf_idf_values for all docs, key=tuple(term,doc_id ), value=tf_idf value)
"""
tf_idf_dic=dict()
#number of .tsv files which were made
total_num_docs=airbnb_data.shape[0]
result_df=pd.DataFrame()
for i in airbnb_data.index:
#take one file
df=pd.read_csv('data/doc_'+str(i)+'.tsv',sep='\t',usecols=['description','title'],encoding='ISO-8859-1')
#preprocessing
df=df.description[0]+' '+df.title[0]
filtered_words=preprocessing_text(df)
tf_series=pd.Series(filtered_words)
#series of tf values
tf_series=((tf_series.value_counts())/len(tf_series)).sort_index()
idf_series=pd.Series(list(set(filtered_words))).sort_values()
#idf calculation
idf_calc=idf_series.apply(lambda x: np.log(total_num_docs/len(inverted_idx[vocabulary[x]])))
#combine tf and idf in one result_df dataframe
result_df=pd.concat([pd.Series(idf_series.values),pd.Series(tf_series.values),pd.Series(idf_calc.values)],axis=1)#.reset_index()
#multiply tf and idf and create tf_idf column
result_df['tf_idf']=result_df[1]*result_df[2]
#key=tuple(term,doc_id), value=tf_idf value
for idx in range(result_df.shape[0]):
tf_idf_dic[result_df[0][idx],i]=result_df['tf_idf'][idx]
return tf_idf_dic
# Second way -- used to double-check that it produces the same results as the first one
def calculate_tf_idf2(airbnb_data,inverted_idx,vocabulary):
"""
    method that computes the same tf-idf values as calculate_tf_idf (stored differently, for cross-checking the results)
    input: airbnb_data - used only to iterate over the per-document .tsv files
inverted_idx(dictionary, key=term_id, value=list of document_ids)
vocabulary(dictionary of all unique words, key=term, value=term_id)
output: proba(dictionary of tf_idf_values for all docs)
"""
#store separately tf and idf values into dictionaries
idf_dic2={}
tf_dic2={}
#dictionary for tf_idf values
proba={}
total_num_docs=airbnb_data.shape[0]
for i in airbnb_data.index:
#take one file
df=pd.read_csv('data/doc_'+str(i)+'.tsv',sep='\t',usecols=['description','title'],encoding='ISO-8859-1')
#preprocessing
df=df.description[0]+' '+df.title[0]
#preprocessed words
filtered_words=preprocessing_text(df)
#tf values calculations
tf_series=pd.Series(filtered_words)
tf_series=((tf_series.value_counts())/len(tf_series)).sort_index()
#idf values calculations
idf_series=pd.Series(list(set(filtered_words))).sort_values()
idf_calc=idf_series.apply(lambda x: np.log(total_num_docs/len(inverted_idx[vocabulary[x]])))
#store idf values into dict
for idx in range(len(tf_series)):
idf_dic2[idf_series[idx],i]=idf_calc[idx]
#store tf values into dict
        for index,value in tf_series.items():
tf_dic2[index,i]=value
    #combine tf and idf into a new dictionary by multiplying the values that share the same key
for k in tf_dic2.keys():
proba[k]=tf_dic2[k]*idf_dic2[k]
return proba
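#Sanity-check sketch (hedged): both functions read the same doc_*.tsv files and are
#keyed by (term, doc_id), so their values should agree up to floating-point noise:
#   tf_idf_dic = calculate_tf_idf(airbnb_data, inverted_idx, vocabulary)
#   proba = calculate_tf_idf2(airbnb_data, inverted_idx, vocabulary)
#   assert all(abs(tf_idf_dic[k] - proba[k]) < 1e-9 for k in tf_idf_dic)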
def compute_inverted_idx2(inverted_idx,vocabulary,tf_idf_dic):
"""
method that computes the second inverted index
input: inverted_idx(dictionary, key=term_id, value=list of document_ids)
vocabulary(dictionary of all unique words, key=term, value=term_id)
tf_idf_dic(dictionary of tf_idf_values for all docs, key=tuple(term,doc_id ), value=tf_idf value)
output: inverted_idx2(dictionary, key=term_id, value=list of tuples (document_id,tf_idf value))
"""
inverted_idx2=defaultdict(list)
    #build a reverse lookup (term_id -> term) once, instead of scanning the vocabulary for every term_id
    id_to_term = {term_id: term for term, term_id in vocabulary.items()}
    #for every term_id from the first inverted index
    for term_id in inverted_idx.keys():
        term = id_to_term[term_id]
        #for every document AKA doc_id posted under that term_id
        for doc_id in inverted_idx[term_id]:
            inverted_idx2[term_id].append((doc_id, tf_idf_dic[term, doc_id]))
    return inverted_idx2
def search_engine2(k,vocabulary,inverted_idx,inverted_idx2):
"""
method that prints the result dataframe with the columns 'title','description','city','url'
based on user query
input: k - number of top documents that should be returned
inverted_idx(dictionary, key=term_id, value=list of document_ids)
vocabulary(dictionary of all unique words, key=term, value=term_id)
inverted_idx2(dictionary, key=term_id, value=list of tuples (document_id,tf_idf value))
output: result_set dataframe based on the user query
"""
#text user searches and preprocessing of that text input
user_query=str(input())
user_query=preprocessing_text(user_query)
    #each query term gets weight 1, so the query tf-idf vector is an array of ones
user_query_tfidf=np.ones(len(user_query))
#list of document indices
list_term_idx=[]
#list of dataframes
list_tf_idf=[]
result_set=[]
#for every word in user query
for word in user_query:
        #if the word exists in the vocabulary
if word in vocabulary.keys():
#append a list of document indices
list_term_idx.append(set(inverted_idx[vocabulary[word]]))
#append a list of tuples from inverted_index2
list_tf_idf.append((inverted_idx2[vocabulary[word]]))
else:
list_term_idx.append({'x'})
break
    #the result is the intersection of all sets of document ids
result_set=list(set.intersection(*list_term_idx))
if 'x' in result_set or not result_set:
result_set='No results! Try again!'
return result_set
tf_idf_dic=defaultdict(list)
    #build tf_idf_dic, where key=document_id, value=list of tf_idf values (one per query term)
for tf_idf_1doc in list_tf_idf:
for tuple_pair in tf_idf_1doc:
if tuple_pair[0] in result_set:
tf_idf_dic[tuple_pair[0]].append(tuple_pair[1])
print(result_set)
result_set=finalize_output2(result_set,user_query_tfidf,tf_idf_dic,k)
return result_set
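#Usage sketch (hedged: assumes vocabulary, inverted_idx and inverted_idx2 were
#already built as above); returns the top 10 matches ranked by cosine similarity:
#   top10 = search_engine2(10, vocabulary, inverted_idx, inverted_idx2)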
def cosine_sim_tuples(user_query_tfidf,tf_idf_dic):
"""
method that calculates cosine similarity between user query and every document of
the result set of the query
    Input: user_query_tfidf - query vector of ones (weight 1 per query term)
           tf_idf_dic - dictionary(key=document_id, value=list of tf_idf values, one per query term)
Output: cosine_sim_lst_tuples - list of tuples with calculated cosine similarities tuple(cosine similarity,document_id)
"""
cosine_sim_lst_tuples=[]
for key,value in tf_idf_dic.items():
tf_idf_val=cosine_similarity([user_query_tfidf],[value])[0][0]
cosine_sim_lst_tuples.append((tf_idf_val,key))#tuple(cosine similarity,document_id)
return cosine_sim_lst_tuples
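#Worked example (illustrative): for a two-word query the query vector is [1, 1];
#a document whose two matching terms have tf-idf weights [0.2, 0.4] scores
#   cos = (1*0.2 + 1*0.4) / (sqrt(2) * sqrt(0.2**2 + 0.4**2)) ~ 0.6 / 0.632 ~ 0.95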
def heapify_tuples(cosine_sim_lst_tuples,k):
"""
method that makes heap from list of tuples and returns K largest values based on
the cosine similarity value in heap
Input: cosine_sim_lst_tuples - list of tuples with calculated cosine similarities tuple(cosine similarity,document_id)
k - number of top documents that should be returned
    Output: wanted_doc - dictionary of the K documents with the largest cosine similarity values (key=doc_id, value=rounded similarity)
"""
heap = []
for item in cosine_sim_lst_tuples:
heappush(heap, item)
return wanted_doc(nlargest(k,heap))
def wanted_doc(heap_k_docs):
"""
    method that maps the top K documents to their cosine similarity values
    Input: heap_k_docs - list of tuples (cosine similarity, doc_id) for the K largest similarities
    Output: wanted_doc_ids - dictionary(key=doc_id, value=cosine similarity rounded to 2 decimals)
"""
wanted_doc_ids={}
for tup in (heap_k_docs):
wanted_doc_ids[tup[1]]=round(tup[0],2)
return wanted_doc_ids
def finalize_output2(result_set,user_query_tfidf,tf_idf_dic,k):
"""
method that is used for creating the result dataframe with the columns 'title','description','city','url','similarity'
Input: result_set - list of document indices
           user_query_tfidf - query vector of ones (weight 1 per query term)
           tf_idf_dic - dictionary(key=document_id, value=list of tf_idf values, one per query term)
k - number of top documents that should be returned
Output: df - result dataframe
"""
    #list of tuples of cosine similarity between the user_query and every document of the result set
cosine_sim_lst_tuples=cosine_sim_tuples(user_query_tfidf,tf_idf_dic)
#check if result set is smaller than top K values that should be returned
#if it is smaller return whole result set
if len(result_set)<k:
k=len(result_set)
#'HEAPIFY' list of tuples and return top K document id's from the heap
wanted_doc_ids=heapify_tuples(cosine_sim_lst_tuples,k)
result_set=wanted_doc_ids.keys()
df=pd.DataFrame()
for i,val in enumerate(result_set):
#display whole text in the columns
        pd.set_option('display.max_colwidth', None)
df=df.append(pd.read_csv('data/doc_'+str(val)+'.tsv',sep='\t',usecols=['description','title','city','url']
,encoding='ISO-8859-1'))
df.reset_index(inplace=True)
df.drop('index',axis=1,inplace=True)
#add column with similarity values
    df['similarity']=list(wanted_doc_ids.values())
df=df[['title','description','city','url','similarity']]
return df
def calculate_room_nums(doc_id_rs):
"""
    method that collects the distinct bedroom counts appearing in the results of the first query,
    so they can be offered to the user as a follow-up question before re-ranking
    input: list of document id's of the results from the first query
    output: list of the possible numbers of rooms the user can choose from
"""
result_df=pd.DataFrame()
l=[]
for i in doc_id_rs:
#take one file
df=pd.read_csv('data/doc_'+str(i)+'.tsv',sep='\t',usecols=['bedrooms_count'],encoding='ISO-8859-1')
result_df=result_df.append(df)
temp=np.unique(result_df.bedrooms_count.values)
l=[*temp]
del result_df
return l
def example_score():
"""
a simple method to show an example of the new score calculation for sorting result documents
output:example of user input and dataframe of BR score calculation
"""
user_input=pd.DataFrame(columns=['chosen_avg_price','chosen_no_rooms'])
user_input.chosen_no_rooms=[3]
user_input.chosen_avg_price=[150]
explanatory_df=pd.DataFrame(columns=['bedrooms_count','B score','average_rate_per_night','R score','BR score'])
explanatory_df.average_rate_per_night=[50,100,250,550]
explanatory_df.bedrooms_count=[2,3,3,4]
explanatory_df['B score']=(0,0.25,0.25,0)
explanatory_df['R score']=(0.45,0.25,0.05,0)
explanatory_df['BR score']=explanatory_df['B score']+explanatory_df['R score']
return user_input,explanatory_df
def calculate_score(row,chosen_avg_price,chosen_no_rooms):
"""
a method that calculates new score("BR SCORE") which will be used
for sorting result documents
input: row - one row(document) of result set
chosen_avg_price - price user chose in the additional question
chosen_no_rooms - number of rooms user chose in the additional question
output: calculated score for one row
"""
temp_rate=0
temp_beds=0
#rules of R score calculation
if row['average_rate_per_night']in range(0,int((chosen_avg_price/2)+1)):
temp_rate=0.45
if row['average_rate_per_night']in range(int(chosen_avg_price/2)+1,int(chosen_avg_price)+1):
temp_rate=0.25
if row['average_rate_per_night'] in range(int(chosen_avg_price),int(chosen_avg_price+101)):
temp_rate=0.05
#rules of B score calculation
if row['bedrooms_count'] == chosen_no_rooms:
temp_beds=0.25
#BR score=B score+ R score
return temp_rate+temp_beds
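#Worked example (illustrative): with chosen_avg_price=150 and chosen_no_rooms=3,
#a listing at $60 per night falls in range(0, 76) so R = 0.45, and a matching
#bedrooms_count of 3 gives B = 0.25, for a BR score of 0.70.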
def heapify_tuples_BR(BR_score_tuples):
"""
method that makes heap from list of tuples
Input: BR_score_tuples - list of tuples with calculated BR score->tuple(BR_score,document_id)
Output: list converted to heap structure
"""
heap = []
for item in BR_score_tuples:
heappush(heap, item)
return heap
def new_score(doc_id_rs,chosen_avg_price,chosen_no_rooms):
"""
method that is used for creating list of BR scores for each result row and
heapified list of tuples
Input: doc_id_rs - list of document id's of the results from the first query
chosen_avg_price - price user chose in the additional question
chosen_no_rooms - number of rooms user chose in the additional question
Output: score_lst - list of scores with calculated BR score for each row in the result set
heapified_tuples - heapified list of tuples where ->tuple(BR_score,document_id)
"""
#make subdataframe with columns 'average_rate_per_night','bedrooms_count' which will be used
#for BR score calculation
calc_result_df=pd.DataFrame()
for i in doc_id_rs:
#take one file
df=pd.read_csv('data/doc_'+str(i)+'.tsv',sep='\t',usecols=['average_rate_per_night','bedrooms_count'],encoding='ISO-8859-1')
calc_result_df=calc_result_df.append(df)
#initialize list for BR score for each result set row
score_lst=[]
for idx in range(len(calc_result_df)):
score_lst.append(calculate_score(calc_result_df.iloc[idx],chosen_avg_price,chosen_no_rooms))
    #initialize list of (BR score, doc_id) tuples
BR_score_tuples=[]
for idx,val in enumerate(score_lst):
#BR_score_tuples (BR score,doc_id)
BR_score_tuples.append((round(val,2),doc_id_rs[idx]))#idx index of row
heapified_tuples=heapify_tuples_BR(BR_score_tuples)
return score_lst,heapified_tuples
def ranking_BR_score(heapified_tuples):
"""
method that is used for sorting BR scores and creating ranks for result dataframe
Input: heapified_tuples - heapified list of tuples where ->tuple(BR_score,document_id)
Output: ranking_dict - dictionary(key=doc_id,value=rank)
"""
    #take the n largest items with n = len(heapified_tuples), i.e. return every item sorted in descending order
sorted_scores=nlargest(len(heapified_tuples),heapified_tuples)
sorted_docs_dic=defaultdict(list)
for tup in sorted_scores:
sorted_docs_dic[tup[0]].append(tup[1])
    sorted_docs_rank_dic=defaultdict(list)
    #assign ranks so that documents sharing a score share a rank (rank 1 = best score)
    for counter_rank, doc_ids in enumerate(sorted_docs_dic.values(), start=1):
        sorted_docs_rank_dic[counter_rank] = doc_ids
#put it in the OrderedDict so the order doesn't change
ranking_dict=OrderedDict()
for k,v in (sorted_docs_rank_dic.items()):
for list_val in v:
ranking_dict[list_val]=k
return ranking_dict
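#End-to-end sketch of the BR-score ranking flow (hedged: doc_id_rs comes from the
#conjunctive engine and the price/room choices would come from follow-up prompts):
#   doc_id_rs, _ = search_engine(vocabulary, inverted_idx)
#   room_options = calculate_room_nums(doc_id_rs)
#   score_lst, heapified = new_score(doc_id_rs, chosen_avg_price=150, chosen_no_rooms=3)
#   ranks = ranking_BR_score(heapified)   # dictionary: doc_id -> rank (1 = best)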