-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathvisualization.py
More file actions
442 lines (375 loc) · 17.5 KB
/
visualization.py
File metadata and controls
442 lines (375 loc) · 17.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
import pylab as pl
import codecs
import operator
from collections import Counter
import offTheShelf
import hw5method
# NOTE: Since show is a blocking function, it won't show the next graph until
# you've exited out of your current one
def bar_plot(rating_count, title):
'''
This function plots the rating count given in a bar graph
Input:
rating_count: a list containing the number of each rating
title: what the title of the plot should be
Output:
shows the plot
saves the plot under the title given
'''
fig = plt.figure(1)
ax = fig.add_subplot(111)
width = 0.35
ind = np.arange(5) # x locations for the ratings
rectangles = ax.bar(ind, rating_count, width, color='black')
ax.set_xlim(-width,len(ind)-width)
ax.set_ylim(0,max(rating_count))
ax.set_xlabel('Rating')
ax.set_ylabel('Frequency')
ax.set_title(title)
xTickMarks = [str(i) for i in range(1,6)]
ax.set_xticks(ind)
xtickNames = ax.set_xticklabels(xTickMarks)
plt.setp(xtickNames, fontsize=10)
plt.savefig(title)
plt.show()
def categorize(genre_dict, movieIDs):
'''
This function gives coordinates for each movie given the data we read in
and some iterable item containing all the ID's of the desired movies
Input:
genre_dict: dictionary of the lists of genre specifcations
movieIDs: the IDs of the movies we are considering
Output:
a list of coordinates categorizing how serious(+)/escapist(-) and
aldrenaline-rush(+)/light-hearted(-) the movie is
'''
###
# 0-Unknown, 1-Action, 2-Adventure, 3-Animation, 4-Childrens, 5-Comedy,
# 6-Crime, 7-Documentary, 8-Drama, 9-Fantasy, 10-Film-Noir, 11-Horror,
# 12-Musical, 13-Mystery, 14-Romance, 15-Sci-Fi, 16-Thriller, 17-War, 18-Western
###
coordinates = []
for ID in movieIDs:
fields = genre_dict[ID]
# Serious = [Crime, Documentary, Drama, Film-noir, Mystery, War]
# Escapist = [Fantasy, Horror, Romance, Sci-Fi, Thriller, Western]
# Adrenaline-Rush = [Action, Adventure, Crime, Horror, Mystery, Thriller]
# Light-Hearted = [Animation, Childrens, Comedy, Documentary, Musical, Romance]
get_serious = operator.itemgetter(6,7,8,10,13,17)
get_escapist = operator.itemgetter(9,11,14,15,16,18)
get_adrenaline = operator.itemgetter(1,2,6,11,13,16)
get_light = operator.itemgetter(3,4,5,7,12,14)
serious_count = sum(get_serious(fields))
escapist_count = sum(get_escapist(fields))
adrenaline_count = sum(get_adrenaline(fields))
light_count = sum(get_light(fields))
coordinates.append((adrenaline_count-light_count, serious_count-escapist_count))
return coordinates
def fancy_plot(genre_dict, movieIDs, movieNames, title):
'''
This function plots the rating count given in a bar graph
Input:
genre_dict: dictionary of the lists of genre specifications
movieIDs: the IDs of the movies we are considering
movieNames: the names of the movies we are considering
title: what the title of the plot should be
Output:
shows the plot
saves the plot under the title given
'''
# Fancy visualization
coordinates = categorize(genre_dict, movieIDs)
X = [x[0] for x in coordinates]
Y = [x[1] for x in coordinates]
fig = pl.figure()
ax = fig.add_subplot(111)
ax.plot(X, Y,'ko',markersize=8)
lim = max([max(X), max(Y), abs(min(X)), abs(min(Y))])
xmin, xmax = (-lim-0.5,lim+0.5)
ymin, ymax = (-lim-0.5,lim+0.5)
ax.set_xlim([-lim-0.5,lim+0.5])
ax.set_ylim([-lim-0.5,lim+0.5])
# plot movie names
# for label, xpt, ypt in zip(movieNames, X, Y):
# ax.text(xpt-0.5, ypt-0.3, label)
ax.text(X[0]-0.7, Y[0]-0.3, movieNames[0])
ax.text(X[1]+0.1, Y[1]+0.2, movieNames[1])
ax.text(X[2]-0.5, Y[2]-0.3, movieNames[2])
ax.text(X[3]-0.9, Y[3]-0.6, movieNames[3])
ax.text(X[4]-0.5, Y[4]-0.3, movieNames[4])
ax.text(X[5]-1.4, Y[5]+0.3, movieNames[5])
ax.text(X[6]-0.5, Y[6]-0.3, movieNames[6])
ax.text(X[7]-0.5, Y[7]-0.4, movieNames[7])
ax.text(X[8]-0.5, Y[8]+0.2, movieNames[8])
ax.text(X[9]-0.5, Y[9]-0.3, movieNames[9])
# plot classifications
ax.text(0.2, ymax-0.1, 'Serious', fontweight='bold')
ax.text(0.2, ymin, 'Escapist', fontweight='bold')
ax.text(xmax-1.5, 0.3, 'Adrenaline-Rush', fontweight='bold')
ax.text(xmin, 0.3, 'Light-Hearted', fontweight='bold')
# removing the default axis on all sides:
for side in ['bottom','right','top','left']:
ax.spines[side].set_visible(False)
# removing the axis ticks
pl.xticks([]) # labels
pl.yticks([])
ax.xaxis.set_ticks_position('none') # tick markers
ax.yaxis.set_ticks_position('none')
# get width and height of axes object to compute
# matching arrowhead length and width
dps = fig.dpi_scale_trans.inverted()
bbox = ax.get_window_extent().transformed(dps)
width, height = bbox.width, bbox.height
# manual arrowhead width and length
hw = 1./20.*(ymax-ymin)
hl = 1./20.*(xmax-xmin)
lw = 1. # axis line width
ohg = 0.3 # arrow overhang
# compute matching arrowhead length and width
yhw = hw/(ymax-ymin)*(xmax-xmin)* height/width
yhl = hl/(xmax-xmin)*(ymax-ymin)* width/height
# draw x and y axis
ax.arrow(0, 0, xmax, 0, fc='k', ec='k', lw = lw,
head_width=hw, head_length=hl, overhang = ohg,
length_includes_head= True, clip_on = False)
ax.arrow(0, 0, 0, ymax, fc='k', ec='k', lw = lw,
head_width=yhw, head_length=yhl, overhang = ohg,
length_includes_head= True, clip_on = False)
ax.arrow(0, 0, xmin, 0, fc='k', ec='k', lw = lw,
head_width=yhw, head_length=yhl, overhang = ohg,
length_includes_head= True, clip_on = False)
ax.arrow(0, 0, 0, ymin, fc='k', ec='k', lw = lw,
head_width=yhw, head_length=yhl, overhang = ohg,
length_includes_head= True, clip_on = False)
pl.tight_layout()
pl.savefig(title)
pl.show()
def get_rating_freq(data, movieIDs):
'''
This function gets relative frequency of each rating given the data
we read in and somee iterable item containing all the ID's of the
movies we want to tally
Input:
data: the rating data that we read in
movieIDs: the IDs of the movies we are considering for the tally
Output:
a list of how the frequency of each rating, from 1 to 5
'''
r1, r2, r3, r4, r5 = 0, 0, 0, 0, 0
for rating in data:
if rating[1] in movieIDs:
if rating[2] == 1: r1 += 1
elif rating[2] == 2: r2 += 1
elif rating[2] == 3: r3 += 1
elif rating[2] == 4: r4 += 1
elif rating[2] == 5: r5 += 1
f1 = r1/(r1+r2+r3+r4+r5)
f2 = r2/(r1+r2+r3+r4+r5)
f3 = r3/(r1+r2+r3+r4+r5)
f4 = r4/(r1+r2+r3+r4+r5)
f5 = r5/(r1+r2+r3+r4+r5)
return [f1, f2, f3, f4, f5]
def matrix_factorization_visualization(V, movieIDs, movie_names, title):
print("Starting matrix factorization visualization")
x_coords = []
y_coords = []
for m in movieIDs:
x_coords.append(V[0][m-1])
y_coords.append(V[1][m-1])
fig, ax = plt.subplots()
ax.scatter(x_coords, y_coords)
if movie_names != None:
for i, txt in enumerate(movie_names):
ax.annotate(txt, (x_coords[i], y_coords[i]))
plt.xticks([])
plt.yticks([])
plt.title(title)
# plt.tight_layout()
plt.savefig(title, bbox_inches="tight")
plt.show()
def matrix_factorization_visualization2(V, movieIDs1, movieIDs2, title, label1, label2):
print("Starting matrix factorization visualization for 2 types of movies")
x_coords = []
y_coords = []
for m in movieIDs1:
x_coords.append(V[0][m-1])
y_coords.append(V[1][m-1])
x_coords2 = []
y_coords2 = []
for m in movieIDs2:
x_coords2.append(V[0][m-1])
y_coords2.append(V[1][m-1])
fig, ax = plt.subplots()
ax2 = ax.scatter(x_coords2, y_coords2)
ax1 = ax.scatter(x_coords, y_coords, c='r')
plt.legend((ax1, ax2), (label1, label2), scatterpoints = 1, loc='best')
plt.xticks([])
plt.yticks([])
plt.title(title)
plt.tight_layout()
plt.savefig(title)
plt.show()
def matrix_factorization_visualization19(V, movieIDs, title):
print("Starting matrix factorization visualization for all movies")
length = len(movieIDs)
x_coords = [[] for _ in range(length)]
y_coords = [[] for _ in range(length)]
for m in range(length):
for i in movieIDs[m]:
x_coords[m].append(V[0][i-1])
y_coords[m].append(V[1][i-1])
fig, ax = plt.subplots()
# colors = np.arange(length)
# c = [i / length for i in colors]
# c = cm.jet(colors)
c = cm.rainbow(np.linspace(0, 1, length))
handles = []
for i in range(length):
handle = ax.scatter(x_coords[i], y_coords[i], c=c[i])
handles.append(handle)
# 0-Unknown, 1-Action, 2-Adventure, 3-Animation, 4-Childrens, 5-Comedy,
# 6-Crime, 7-Documentary, 8-Drama, 9-Fantasy, 10-Film-Noir, 11-Horror,
# 12-Musical, 13-Mystery, 14-Romance, 15-Sci-Fi, 16-Thriller, 17-War, 18-Western
labels = ["Unknown", "Action", "Adventure", "Animation", "Childrens", "Comedy", "Crime",
"Documentary", "Drama", "Fantasy", "Film Noir", "Horror", "Musical", "Mystery", "Romance",
"SciFi", "Thriller", "War", "Western"]
plt.legend(handles, labels, scatterpoints=1, loc='upper right', bbox_to_anchor=(1,1))
plt.xticks([])
plt.yticks([])
plt.title(title)
plt.tight_layout()
plt.savefig(title)
plt.show()
def main():
# Y_train = np.loadtxt('./data/train.txt').astype(int)
# Y_test = np.loadtxt('./data/test.txt').astype(int)
data = np.loadtxt('./data/data.txt').astype(int)
movie_file = codecs.open('./data/movies.txt', mode='r', encoding='windows-1252')
movie_names = {}
genres = {}
for line in movie_file:
movie_info = line.split()
movie_names[int(movie_info[0])] = " ".join(movie_info[1:-19])
genres[int(movie_info[0])] = list(map(int, movie_info[-19:]))
M = max(data[:,0]).astype(int) # users
N = max(data[:,1]).astype(int) # movies
# 1. All movies
frequencies = Counter(data[:,1]) # how often the movies are reviewed
avg_ratings = {}
for data_tuple in data:
key = data_tuple[1]
avg_ratings[key] = avg_ratings.get(key, 0) + data_tuple[2]/frequencies[key]
rating_count = get_rating_freq(data, [ID for ID in movie_names])
# bar_plot(rating_count, 'Ratings of All Movies')
# 2. Ten most popular movies
most_reviewed = frequencies.most_common(10)
pop_movie_IDs = [x[0] for x in most_reviewed]
pop_movie_names = [movie_names[ID] for ID in pop_movie_IDs]
pop_movie_genres = [genres[ID] for ID in pop_movie_IDs]
print("Most Reviewed Movies: ", pop_movie_names)
# Fancy plot
# fancy_plot(genres, pop_movie_IDs, pop_movie_names, 'Visualization of Ten Most Popular Movies')
# Histogram
pop_rating_count = get_rating_freq(data, pop_movie_IDs)
# bar_plot(pop_rating_count, 'Ratings of Ten Most Popular Movies')
# 3. Top ten best movies
best_reviewed = dict(Counter(avg_ratings).most_common(10))
best_reviewed_names = [movie_names[ID] for ID in best_reviewed]
print("BEST REVIEWED: ", list(best_reviewed.keys()))
# best_reviewed_genres = [genres[ID] for ID in best_reviewed]
# print("Best Movies: ", best_reviewed_names)
best_rating_count = get_rating_freq(data, best_reviewed)
# bar_plot(best_rating_count, 'Ratings of Ten Best Movies')
# 4. Three genres of your choice - 2:Adventure, 7:Documentary, 17:War
adventure_movies = [ID for ID in genres if genres[ID][2] == 1]
adventure_movie_names = [movie_names[ID] for ID in adventure_movies]
# action movie plotting
adventure_rating_count = get_rating_freq(data, adventure_movies)
# bar_plot(adventure_rating_count, 'Ratings of Adventure Movies')
documentary_movies = [ID for ID in genres if genres[ID][7] == 1]
documentary_movie_names = [movie_names[ID] for ID in documentary_movies]
# plotting documentaries
documentary_count = get_rating_freq(data, documentary_movies)
# bar_plot(documentary_count, 'Ratings of Documentaries')
war_movies = [ID for ID in genres if genres[ID][17] == 1]
war_movie_names = [movie_names[ID] for ID in war_movies]
# plotting war movies
war_rating_count = get_rating_freq(data, war_movies)
# bar_plot(war_rating_count, 'Ratings of War Movies')
comedy_movies = [ID for ID in genres if genres[ID][5] == 1]
comedy_movie_names = [movie_names[ID] for ID in comedy_movies]
# 0-Unknown, 1-Action, 2-Adventure, 3-Animation, 4-Childrens, 5-Comedy,
# 6-Crime, 7-Documentary, 8-Drama, 9-Fantasy, 10-Film-Noir, 11-Horror,
# 12-Musical, 13-Mystery, 14-Romance, 15-Sci-Fi, 16-Thriller, 17-War, 18-Western
action_movies = [ID for ID in genres if genres[ID][1] == 1]
childrens_movies = [ID for ID in genres if genres[ID][4] == 1]
fantasy_movies = [ID for ID in genres if genres[ID][9] == 1]
film_noir_movies = [ID for ID in genres if genres[ID][10] == 1]
horror_movies = [ID for ID in genres if genres[ID][11] == 1]
scifi_movies = [ID for ID in genres if genres[ID][15] == 1]
all_movies = [[] for _ in range(19)]
for i in range(19):
all_movies[i] = [ID for ID in genres if genres[ID][i] == 1]
'''
Number of movies in each category
0 2
1 251
2 135
3 42
4 122
5 505
6 109
7 50
8 725
9 22
10 24
11 92
12 56
13 61
14 247
15 101
16 251
17 71
18 27
'''
U, V = hw5method.main()
matrix_factorization_visualization(V, pop_movie_IDs, pop_movie_names, "Advanced Predictions of Popular Movies")
matrix_factorization_visualization(V, best_reviewed, best_reviewed_names, "Advanced Predictions of Best Movies")
matrix_factorization_visualization(V, adventure_movies, None, "Advanced Predictions of Adventure Movies")
matrix_factorization_visualization(V, documentary_movies, None, "Advanced Predictions of Documentaries")
matrix_factorization_visualization(V, war_movies, None, "Advanced Predictions of War Movies")
matrix_factorization_visualization(V, comedy_movies, None, "Advanced Predictions of Comedy Movies")
matrix_factorization_visualization2(V, action_movies, adventure_movies, "Advanced Predictions of Action vs Adventure Movies", "Action", "Adventure")
matrix_factorization_visualization2(V, film_noir_movies, comedy_movies, "Advanced Predictions of Film Noir vs Comedy Movies", "Film Noir", "Comedy")
matrix_factorization_visualization2(V, documentary_movies, action_movies, "Advanced Predictions of Documentaries vs Action Movies", "Documentary", "Action")
matrix_factorization_visualization2(V, documentary_movies, comedy_movies, "Advanced Predictions of Documentaries vs Comedy Movies", "Documentary", "Comedy")
matrix_factorization_visualization2(V, film_noir_movies, documentary_movies, "Advanced Predictions of Documentaries vs Film Noir Movies", "Film Noir", "Documentary")
matrix_factorization_visualization2(V, fantasy_movies, documentary_movies, "Advanced Predictions of Documentaries vs Fantasy Movies", "Fantasy", "Documentary")
matrix_factorization_visualization2(V, documentary_movies, scifi_movies, "Advanced Predictions of Documentaries vs SciFi Movies", "Documentary", "Scifi")
matrix_factorization_visualization2(V, documentary_movies, horror_movies, "Advanced Predictions of Documentaries vs Horror Movies", "Documentary", "Horror")
matrix_factorization_visualization2(V, war_movies, childrens_movies, "Advanced Predictions of War vs Childrens Movies", "War", "Childrens")
matrix_factorization_visualization19(V, all_movies, "Advanced Predictions of All Movies")
# rand_movie_ids = np.random.randint(1, 1682, 10)
rand_movie_ids = [530, 1630, 423, 399, 1307, 1491, 1144, 230, 625, 455]
rand_movie_names = []
for movieID in rand_movie_ids:
rand_movie_names.append(movie_names[movieID])
matrix_factorization_visualization(V, rand_movie_ids, rand_movie_names, "Advanced Predictions of Random Movies")
print("Finished visualizations using hw5 methods")
# Fancy plots for 5.2
U, V = offTheShelf.main()
matrix_factorization_visualization(V, pop_movie_IDs, pop_movie_names, "2D Visualization of Ten Most Popular Movies")
matrix_factorization_visualization(V, best_reviewed, best_reviewed_names, "2D Visualization of Ten Best Movies")
matrix_factorization_visualization(V, adventure_movies, None, "2D Visualization of Adventure Movies")
matrix_factorization_visualization(V, documentary_movies, None, "2D Visualization of Documentaries")
matrix_factorization_visualization(V, war_movies, None, "2D Visualization of War Movies")
matrix_factorization_visualization(V, comedy_movies, None, "2D Visualization of Comedy Movies")
matrix_factorization_visualization2(V, action_movies, adventure_movies, "2D Visualization of Action vs Adventure Movies", "Action", "Adventure")
matrix_factorization_visualization2(V, film_noir_movies, comedy_movies, "2D Visualization of Film Noir vs Comedy Movies", "Film Noir", "Comedy")
matrix_factorization_visualization19(V, all_movies, "2D Visualization of All Movies")
matrix_factorization_visualization(V, rand_movie_ids, rand_movie_names, "2D Visualization of Random Movies")
if __name__ == "__main__":
main()