-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgeneral.py
More file actions
68 lines (51 loc) · 1.79 KB
/
general.py
File metadata and controls
68 lines (51 loc) · 1.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import os
# Each website you crawl is a separate project (folder)
def create_project_dir(directory):
    """Create the project folder for a crawl if it does not exist yet.

    Args:
        directory: Path of the folder to create. Missing parent folders
            are created as well.
    """
    # Anything already present at this path means there is nothing to do.
    if os.path.exists(directory):
        return
    print("Creating project " + directory)
    # makedirs (rather than mkdir) supports nested paths, and exist_ok
    # tolerates the folder being created by someone else between the
    # existence check above and this call.
    os.makedirs(directory, exist_ok=True)
# create queue and crawled files (if not created)
def create_data_files(project_name, base_url):
    """Make sure the project's queue and crawled files exist.

    Args:
        project_name: Folder that holds the project's data files.
        base_url: URL written into a newly created queue file.
    """
    # Each entry pairs a data file with the content it starts out with:
    # the queue begins with the base URL, the crawled list begins empty.
    initial_contents = (
        (project_name + "/queue.txt", base_url),
        (project_name + "/crawled.txt", ''),
    )
    for data_path, content in initial_contents:
        # Existing files are left untouched.
        if not os.path.isfile(data_path):
            write_file(data_path, content)
# create a new file
def write_file(path, data):
    """Create the file at *path*, or overwrite it, with *data* as content.

    Args:
        path: Location of the file to write.
        data: Text to store in the file.
    """
    # The context manager closes the file even when write() raises.
    with open(path, 'w') as f:
        f.write(data)
# Example call of create_data_files (argument 1: project name; argument 2: URL of that website's home page):
# create_data_files('thenewboston', 'https://thenewboston.com/')
# add data onto an existing file
def append_to_file(path, data):
    """Add *data* as a new line at the end of the file at *path*.

    Args:
        path: Location of the file to extend.
        data: Text to append; a newline is added after it.
    """
    line = data + '\n'
    # Mode 'a' positions every write at the end of the file.
    with open(path, 'a') as handle:
        handle.write(line)
# delete the contents of a file
def delete_fiile_contents(path):
    """Empty the file at *path*, creating it if it does not exist.

    Args:
        path: Location of the file to truncate.
    """
    # Opening in 'w' mode truncates the file; nothing needs to be written.
    open(path, 'w').close()
# read a file and convert each line to set items so that it will speed up the process
def file_to_set(file_name):
    """Read a text file and return its lines as a set.

    Args:
        file_name: Location of the text file to read.

    Returns:
        A set with one item per distinct line, newline characters removed.
    """
    # 'rt' opens the file for reading in text mode.
    with open(file_name, 'rt') as source:
        return {entry.replace('\n', '') for entry in source}
# iterate through a set, each item will be a new line in the file
def set_to_file(links, file):
    """Overwrite *file* with the items of *links*, one per line, sorted.

    Args:
        links: Iterable of strings to store.
        file: Location of the file to overwrite; its old content is
            discarded because *links* holds the newer data.
    """
    # Build the lines before touching the file, so a failure while sorting
    # or joining leaves the existing content intact.
    lines = [link + '\n' for link in sorted(links)]
    # A single open in 'w' mode both clears the old data and writes the new
    # lines, instead of reopening the file once per link.
    with open(file, 'w') as f:
        f.writelines(lines)