-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDataHandler.py
More file actions
119 lines (90 loc) · 3.79 KB
/
Copy pathDataHandler.py
File metadata and controls
119 lines (90 loc) · 3.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#import self as self
from surprise.model_selection import train_test_split
from surprise.model_selection import LeaveOneOut
from surprise import KNNBaseline
from surprise import Trainset
import csv
from collections import defaultdict
from surprise import Dataset
from surprise import Reader
class DataHandler:
    """Load the ratings file and prepare every data set used downstream.

    On construction the handler reads the ratings CSV and derives:

    * the full Surprise dataset and the trainset built from all of it,
    * the anti-test set and the test set of that full trainset,
    * an 80% / 20% train/test split,
    * a leave-one-out train/test pair and the anti-test set of its train part,
    * a popularity ranking of movies (rank 1 = most rated),
    * an item-based cosine ``KNNBaseline`` model fitted on the full trainset.

    All of these are exposed through the getter methods below.
    """

    # Default input locations, relative to the current working directory.
    # Pass other paths to the constructor (e.g. './test-data/ratings.csv')
    # to run against a different data set.
    rating = './ml-latest-small/ratings.csv'
    movies = './ml-latest-small/movies.csv'

    def __init__(self, rating=None, movies=None):
        """Build all data sets.

        Args:
            rating: Optional path of the ratings CSV. When omitted, the
                class-level default ``DataHandler.rating`` is used.
            movies: Optional path of the movies CSV. When omitted, the
                class-level default ``DataHandler.movies`` is used.
        """
        # Instance attributes shadow the class-level defaults only when the
        # caller explicitly supplies a path.
        if rating is not None:
            self.rating = rating
        if movies is not None:
            self.movies = movies

        # Full data and the popularity ranking derived from the same file.
        self.fulldata = self.LoadRating()
        self.popularitydata = self.loadPopularityData()
        self.fullTrainData = self.fulldata.build_full_trainset()

        # Anti-test set and test set of the full trainset.
        self.fullAntiTestData = self.fullTrainData.build_anti_testset()
        self.fullTestData = self.fullTrainData.build_testset()

        # 80% train data and 20% test data.
        self.traindata, self.testdata = train_test_split(
            self.fulldata, test_size=0.2
        )

        # Leave-one-out cross validation. Every fold yielded by split()
        # overwrites the previous one, so the last fold is the one kept.
        self.LOO_Data = LeaveOneOut()
        for train, test in self.LOO_Data.split(self.fulldata):
            self.LOO_Train = train
            self.LOO_Test = test
        # Built once, from the fold that was kept.
        self.LOOAntiTest = self.LOO_Train.build_anti_testset()

        # Expose the popularity data under the name used by the getter.
        self.rank = self.popularitydata

        # Item-item similarity model, used for diversity.
        sim_options = {'name': 'cosine', 'user_based': False}
        self.sim_matrix = KNNBaseline(sim_options=sim_options)
        self.sim_matrix.fit(self.fullTrainData)

    def LoadRating(self):
        """Load ``self.rating`` as a Surprise dataset.

        The file is read as comma-separated ``user item rating timestamp``
        lines, skipping the first (header) line.
        """
        reader = Reader(
            line_format='user item rating timestamp', sep=',', skip_lines=1
        )
        return Dataset.load_from_file(self.rating, reader=reader)

    def loadPopularityData(self):
        """Rank movies by how many ratings they received.

        Reads ``self.rating`` directly with the ``csv`` module, skipping the
        header line and any blank rows, and counts the rows per movie id
        (second column).

        Returns:
            A ``defaultdict(int)`` mapping movie id (``int``) to its rank,
            where rank 1 is the most-rated movie. Movies with equal counts
            keep the order in which they first appear in the file. Looking up
            a movie id that never appears yields 0.
        """
        rating_counts = defaultdict(int)
        with open(self.rating, newline='', encoding='utf-8') as csvfile:
            rows = csv.reader(csvfile)
            next(rows, None)  # skip the header; tolerate an empty file
            for row in rows:
                if not row:  # csv.reader yields [] for blank lines
                    continue
                rating_counts[int(row[1])] += 1

        # sorted() is stable, so ties keep first-seen order.
        by_count = sorted(
            rating_counts.items(), key=lambda item: item[1], reverse=True
        )
        rankings = defaultdict(int)
        for rank, (movie_id, _count) in enumerate(by_count, start=1):
            rankings[movie_id] = rank
        return rankings

    def getEvaluation(self):
        """Return the full Surprise dataset."""
        return self.fulldata

    def getRank(self):
        """Return the popularity ranking (movie id -> rank)."""
        return self.popularitydata

    # Getters

    def GetFullTrainData(self):
        """Return the trainset built from the full dataset."""
        return self.fullTrainData

    def GetAntiTestData(self):
        """Return the anti-test set of the full trainset."""
        return self.fullAntiTestData

    def GetAntiUserTestData(self, userId):
        """Build the anti-test set of the full trainset for a single user.

        Same idea as the trainset's ``build_anti_testset`` but restricted to
        one user: every item of the full trainset that the user has no rating
        for is paired with the trainset's global mean.

        Args:
            userId: Raw user id; it is converted with ``str()`` before being
                looked up in the trainset.

        Returns:
            A list of ``(raw user id, raw item id, global mean)`` tuples.
            Raw ids are recorded because later lookups are done on raw ids.
        """
        trainset = self.fullTrainData
        fill = trainset.global_mean
        # The trainset works with inner ids, so translate the user first.
        inner_uid = trainset.to_inner_uid(str(userId))
        raw_uid = trainset.to_raw_uid(inner_uid)
        # trainset.ur holds (inner item id, rating) pairs per inner user id.
        rated_items = {item for (item, _rating) in trainset.ur[inner_uid]}
        return [
            (raw_uid, trainset.to_raw_iid(item), fill)
            for item in trainset.all_items()
            if item not in rated_items
        ]

    def GetFullTestData(self):
        """Return the test set built from the full trainset."""
        return self.fullTestData

    def GetTrainData(self):
        """Return the train part of the 80/20 split."""
        return self.traindata

    def GetTestData(self):
        """Return the test part of the 80/20 split."""
        return self.testdata

    def GetLOOTrain(self):
        """Return the leave-one-out trainset."""
        return self.LOO_Train

    def GetLOOTest(self):
        """Return the leave-one-out test set."""
        return self.LOO_Test

    def GetLOOAntiTestSet(self):
        """Return the anti-test set of the leave-one-out trainset."""
        return self.LOOAntiTest

    def GetPopularRankings(self):
        """Return the popularity ranking (movie id -> rank)."""
        return self.rank

    def GetSimilarities(self):
        """Return the fitted item-based ``KNNBaseline`` similarity model."""
        return self.sim_matrix