-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathForm Type
More file actions
94 lines (86 loc) · 4.73 KB
/
Form Type
File metadata and controls
94 lines (86 loc) · 4.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import requests
import time
# Interactive script: reads a list of links from a file, downloads each one,
# extracts the form type from the response, and appends "link,form type" to
# an output file.  Links that cannot be processed are appended to one of two
# error files (access denied vs. every other reason).

REQUEST_TIMEOUT_SECONDS = 60   # avoid hanging forever on a stalled server
RETRY_DELAY_SECONDS = 600      # the "10 minutes" announced before a retry
FORM4_TYPES = ("4", "Form 4")  # raw values that identify a Form 4


def classify_link(link):
    """Return the kind of *link*: "index", "txt", "sgml", "archive" or "unknown".

    The checks run in a fixed order, so a link matching several patterns is
    classified by the first one that matches.
    """
    if "index.htm" in link:
        return "index"
    if ".txt" in link:
        return "txt"
    if ".sgml" in link:
        return "sgml"
    if ".zip" in link or ".paper" in link:
        return "archive"
    return "unknown"


def extract_form_type(kind, html):
    """Return the form type found in *html*, with newlines removed.

    kind is one of "index", "txt" or "sgml" (see classify_link).

    Raises:
        ValueError: the expected tag is missing, or kind is not supported.
        IndexError: the document is shorter than the expected layout.
    """
    if kind == "index":
        # The form type is expected in the 17th ";"-separated piece once the
        # <strong> tags are turned into separators (position-based lookup).
        pieces = html.replace("<strong>", ";").replace("</strong>", ";").split(";")
        raw = pieces[16]
    elif kind == "txt":
        pieces = html.replace("<", ";").replace(">", ";").split(";")
        raw = pieces[pieces.index("TYPE") + 1]
    elif kind == "sgml":
        flattened = html.replace("<", ";").replace(">", ";").replace("\n", "")
        pieces = flattened.split(";")
        raw = pieces[pieces.index("FORM-TYPE") + 1]
    else:
        raise ValueError("unsupported link kind: " + str(kind))
    return raw.replace("\n", "")


def format_form_label(kind, form_type):
    """Return the text written to the output file for *form_type*.

    Index pages are written as found; .txt and .sgml types get a "Form " prefix.
    """
    if kind == "index":
        return form_type
    return "Form " + form_type


def is_form4(form_type):
    """Return True when *form_type* denotes a Form 4 ("4" or "Form 4")."""
    return form_type.strip() in FORM4_TYPES


def is_connection_reset(exc):
    """Return True if *exc* is, wraps, or was caused by a ConnectionResetError.

    The exception's __cause__, __context__ and any exceptions stored in its
    args are searched, because HTTP libraries usually re-raise the low-level
    socket error wrapped inside their own exception types.
    """
    pending = [exc]
    visited = set()
    while pending:
        current = pending.pop()
        if id(current) in visited:
            continue  # guard against cyclic exception chains
        visited.add(id(current))
        if isinstance(current, ConnectionResetError):
            return True
        linked = list(current.args) + [current.__cause__, current.__context__]
        pending.extend(item for item in linked if isinstance(item, BaseException))
    return False


def skip_message(link, progress, reason):
    """Return the console message announcing that *link* was skipped."""
    return link + " " + progress + " is skipped (" + reason + ") and added to the errors file"


def fetch_html(link):
    """Download *link* and return the response text.

    A connection reset is retried indefinitely after a 10 minute pause; any
    other failure is propagated to the caller.
    """
    while True:
        try:
            return requests.get(link, timeout=REQUEST_TIMEOUT_SECONDS).text
        except Exception as exc:
            if not is_connection_reset(exc):
                raise
            print("Connection reset by remote host. Waiting 10 minutes to try again")
            time.sleep(RETRY_DELAY_SECONDS)


def main():
    """Prompt for file names and a link range, then process that range."""
    folder = input("Input the folder the files are in: ")
    input_file_name = input("Input the name of the links file: ")
    output_file_name = input("Input the name of the output file: ")
    errors2_file_name = input("Input the name of the file for links that don't have permission: ")
    errors_file_name = input("Input the name of the other errors file(links that don't work for another reason): ")
    start = int(input("Input where you left off (type: 0 if you just started): "))

    with open(folder + "/" + input_file_name, "r") as input_file:
        # splitlines() avoids the phantom empty last entry that split("\n")
        # yields for a file ending in a newline, and tolerates "\r\n" endings.
        links = input_file.read().splitlines()
    number_of_links = len(links)
    end = int(input("There are "+str(number_of_links)+" links. Input number of file that you want to end at: "))

    # Keep the range inside the list so an oversized answer cannot crash the run.
    start = max(start, 0)
    end = min(end, number_of_links)

    form4s = 0
    total_forms = 0
    # The context manager closes all three files even if the run is interrupted.
    with open(folder + "/" + output_file_name, "a+") as output_file, \
            open(folder + "/" + errors_file_name, "a+") as error_file, \
            open(folder + "/" + errors2_file_name, "a+") as error2_file:
        for i in range(start, end):
            link = links[i]
            progress = "(" + str(i + 1) + "/" + str(number_of_links) + ")"
            kind = classify_link(link)

            if kind == "archive":
                # zip/paper links are never downloaded.
                print(skip_message(link, progress, "zip/paper file"))
                error_file.write(link + ", zip/paper" + "\n")
                continue

            print("accessing: " + link + progress)
            try:
                html = fetch_html(link)
            except Exception as exc:
                # Per-link boundary: record the failure and move on instead
                # of aborting the whole run.
                print(skip_message(link, progress, str(exc)))
                error_file.write(link + ", " + str(exc) + "\n")
                continue

            if kind == "unknown":
                failure = "unknown format"
            else:
                try:
                    form_type = extract_form_type(kind, html)
                except (IndexError, ValueError) as exc:
                    failure = str(exc)
                else:
                    label = format_form_label(kind, form_type)
                    output_file.write(link + "," + label + "\n")
                    print(link + "," + label + " added to " + output_file_name)
                    # Only successfully parsed forms count as accessible.
                    total_forms += 1
                    if is_form4(form_type):
                        form4s += 1
                    continue

            # The page could not be used: decide which error file it goes to
            # using the response already downloaded (no second request).
            if "AccessDenied" in html:
                print(skip_message(link, progress, "access denied"))
                error2_file.write(link + ", access denied" + "\n")
            else:
                print(skip_message(link, progress, failure))
                error_file.write(link + ", " + failure + "\n")

    print("done")
    print("ended at: " + str(end))
    print(str(form4s)+"/"+str(total_forms) + " accessible forms are Form 4 (about)")


if __name__ == "__main__":
    main()