-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathscrapy_top10.py
More file actions
104 lines (85 loc) · 2.85 KB
/
scrapy_top10.py
File metadata and controls
104 lines (85 loc) · 2.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# Script to scrape data from IMDB database
# need to install from:
# pip install cinemagoer
from imdb import Cinemagoer
import os
# create an instance of the Cinemagoer class
ia = Cinemagoer()
def insert_person(type, person):
person_id = person.personID
person_name = person["name"]
names = person_name.split(" ")
first_name = ""
surname = ""
index = 0
for part in names:
if index == 0:
first_name = part
elif index == 1:
surname += part
else:
surname += " " + part
index += 1
return f"INSERT IGNORE INTO {type} VALUES ({person_id}, \"{first_name}\", \"{surname}\");\n"
top = ia.get_top250_movies()[:10]
insert_movies = "BEGIN;\n"
insert_actors = "BEGIN;\n"
insert_genres = "BEGIN;\n"
insert_directors = "BEGIN;\n"
insert_movie_genres = "BEGIN;\n"
insert_directions = "BEGIN;\n"
insert_casts = "BEGIN;\n"
count = 0
genre_id = 1
genre_ids = {}
for movie in top:
id = movie.movieID
mv = ia.get_movie(id)
name = mv["title"]
year = mv["year"]
runtime = mv["runtimes"]
rate = mv["rating"]
insert_movie = f"INSERT INTO Movie VALUES ({id}, \"{name}\", {year}, {int(runtime[0])}, {rate}, 0);\n"
insert_movies += insert_movie
genres = mv["genres"]
for genre in genres:
if genre not in genre_ids:
genre_ids[genre] = genre_id
insert_genre = f"INSERT INTO Genre VALUES ({genre_id}, \"{genre}\");\n"
insert_genres += insert_genre
genre_id += 1
insert_movie_genre = f"INSERT INTO Movie_Genre VALUES ({genre_ids[genre]}, {mv.movieID});\n"
insert_movie_genres += insert_movie_genre
for cast in mv["cast"]:
insert_actor = insert_person("Actor", cast)
insert_actors += insert_actor
insert_cast = f"INSERT INTO Cast VALUES ({cast.personID}, {mv.movieID});\n"
insert_casts += insert_cast
for director in mv["directors"]:
insert_director = insert_person("Director", director)
insert_directors += insert_director
insert_direction = f"INSERT INTO Direction VALUES ({director.personID}, {mv.movieID});\n"
insert_directions += insert_direction
count += 1
print("processing movie ", count)
print('done!')
insert_movies += "COMMIT;\n"
insert_actors += "COMMIT;\n"
insert_genres += "COMMIT;\n"
insert_directors += "COMMIT;\n"
insert_movie_genres += "COMMIT;\n"
insert_directions += "COMMIT;\n"
insert_casts += "COMMIT;\n"
cur_path = os.path.dirname(__file__)
new_path = os.path.relpath('../sql/sample-data.sql', cur_path)
file = open(new_path , "w", encoding="utf-8")
file.write(insert_movies)
file.close()
file = open(new_path , "a", encoding="utf-8")
file.write(insert_actors)
file.write(insert_genres)
file.write(insert_directors)
file.write(insert_movie_genres)
file.write(insert_directions)
file.write(insert_casts)
file.close()