-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
84 lines (55 loc) · 1.92 KB
/
main.py
File metadata and controls
84 lines (55 loc) · 1.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import sqlite3
from urllib.parse import urlparse
from termcolor import colored
from spider import spider_website
def create_tables(cursor=None):
    """Create the spider's tables if they do not exist yet.

    Three tables are created:

    * ``Websites`` -- one row per ``website_url`` (unique).
    * ``Pages`` -- one row per ``url`` (unique) with an integer primary key
      ``id`` plus ``html``, ``error``, ``old_rank`` and ``new_rank`` columns.
    * ``Links`` -- ``(from_page_id, to_page_id)`` pairs, unique as a pair.

    Every statement uses ``IF NOT EXISTS``, so calling this repeatedly is
    harmless.

    Args:
        cursor: Cursor to run the schema script on. When omitted, the
            module-level ``cur`` that ``main`` sets up is used, so existing
            ``create_tables()`` calls keep working.
    """
    if cursor is None:
        # Fall back to the global cursor published by main().
        cursor = cur
    cursor.executescript('''
CREATE TABLE IF NOT EXISTS Websites (
website_url TEXT UNIQUE
);
CREATE TABLE IF NOT EXISTS Pages (
id INTEGER NOT NULL PRIMARY KEY,
url TEXT UNIQUE,
html TEXT,
error INTEGER DEFAULT NULL,
old_rank REAL,
new_rank REAL
);
CREATE TABLE IF NOT EXISTS Links (
from_page_id INTEGER,
to_page_id INTEGER,
UNIQUE (from_page_id, to_page_id)
)
''')
def main():
    """Prompt for a website URL, record it in the database, and start the spider.

    Opens ``spider_database.sqlite``, makes sure the schema exists, then asks
    for a URL. The URL (minus one trailing ``/``) is inserted into
    ``Websites`` and seeded into ``Pages`` with ``new_rank`` 1.0; both inserts
    are ``INSERT OR IGNORE`` so re-entering a known URL is a no-op. Finally
    every ``website_url`` stored in ``Websites`` is handed to
    ``spider_website``.

    Entering an empty (or whitespace-only) line exits without registering
    anything.

    The connection and cursor are published as the module globals
    ``connection`` and ``cur`` because ``create_tables`` reads ``cur``.
    """
    global connection, cur

    connection = sqlite3.connect('spider_database.sqlite')
    try:
        cur = connection.cursor()
        create_tables()

        print("Please enter url of the form 'https://www.website-name.domain-name/'")
        # Strip so a stray space or newline is not stored as a URL.
        website_url = input("Enter the website url to spider: ").strip()
        if not website_url:
            return

        if website_url.endswith("/"):
            website_url = website_url[:-1]
        website_domain = website_url
        print(colored(f'website_domain : {website_domain}', 'yellow'))

        cur.execute("INSERT OR IGNORE INTO Websites (website_url) VALUES ( ? ) ", (website_domain,))
        cur.execute("""
INSERT OR IGNORE INTO Pages
(url, html, error, old_rank, new_rank)
VALUES (?, NULL, NULL, NULL, 1.0)
""", (website_domain,))
        connection.commit()

        cur.execute("SELECT website_url FROM Websites")
        all_allowed_websites = [row[0] for row in cur.fetchall()]
    finally:
        # Release the database handle on every path (empty input, errors,
        # and before the crawl starts); closing it also closes the cursor.
        connection.close()

    print("Currently spidering " , colored(website_url, 'green'))
    spider_website(all_allowed_websites)
# Run the interactive prompt only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()