# eyewitness_parser.py
"""
This script regroups targets from an EyeWitness report based on their technology.
It uses the following information contained in the HTTP responses:
- Page Title (same technology)
- Content-Length header (i.e. likely the same page)
- Server header
"""
import argparse
import csv
import json
import os
import re

from bs4 import BeautifulSoup

identified_technology = {
    "page_title": {},
    "content_length": {},
    "server": {},
}
def write_csv(csv_file):
    # Convert dictionary to CSV
    with open(csv_file, "w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(["Category", "Subcategory", "URL"])
        for category, subcategories in identified_technology.items():
            for subcategory, urls in subcategories.items():
                for url in urls:
                    writer.writerow([category, subcategory, url])
    print(f"CSV file '{csv_file}' has been created.")
def check_and_add_url(dictionary, key, value, url):
    if value in dictionary[key]:
        dictionary[key][value].append(url)
        print(f"Appended {url} to {value} in {key}.")
    else:
        dictionary[key][value] = [url]
        print(f"Added {value} with URL {url} to {key}.")
def my_parser(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    # Find all <a> tags whose href attribute starts with "http://"
    http_links = soup.find_all('a', href=re.compile(r'^http://'))
    # Iterate through each <a> tag
    for link in http_links:
        url = link.get('href')  # Extract the URL
        div_tag = link.find_parent('div')  # Find the parent <div> tag
        if div_tag:  # If the parent <div> tag exists
            # Locate the Page Title, Server, and Content-Length labels
            # (string= replaces the deprecated text= keyword in BeautifulSoup)
            page_title = div_tag.find('b', string=re.compile(r'Page Title:'))
            server = div_tag.find('b', string=re.compile(r'Server:'))
            content_length = div_tag.find('b', string=re.compile(r'Content-Length:'))
            # Extract the values if found; guard against a missing sibling
            if page_title and page_title.next_sibling:
                page_title = page_title.next_sibling.strip()
                check_and_add_url(identified_technology, "page_title", page_title, url)
            if server and server.next_sibling:
                server = server.next_sibling.strip()
                check_and_add_url(identified_technology, "server", server, url)
            if content_length and content_length.next_sibling:
                content_length = content_length.next_sibling.strip()
                check_and_add_url(identified_technology, "content_length", content_length, url)
    print("DONE")
    # Print the dictionary in JSON format
    print(json.dumps(identified_technology, indent=4))
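# my_parser assumes each target in the report is rendered roughly like this
# (a simplified sketch; the actual EyeWitness markup may differ):
#
# <div>
#   <a href="http://10.0.0.5">http://10.0.0.5</a>
#   <b>Page Title:</b> Welcome to nginx!
#   <b>Server:</b> nginx/1.18.0
#   <b>Content-Length:</b> 612
# </div>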
def parse_html_files(directory):
    # Check if the directory exists
    if not os.path.isdir(directory):
        print("Error: Directory does not exist.")
        return
    # Iterate through all files in the directory
    for filename in os.listdir(directory):
        if filename.endswith(".html"):
            filepath = os.path.join(directory, filename)
            # Open one HTML file
            with open(filepath, "r") as file:
                html_content = file.read()
            # Parse the HTML content
            my_parser(html_content)
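# Note: parse_html_files only scans top-level *.html files in the given
# directory; pointing -d at the EyeWitness output folder (which typically
# contains the report pages, e.g. report.html) is assumed to be the
# intended usage.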
def read_file(filename):
    try:
        with open(filename, 'r') as file:
            content = file.read()
        print("File contents:")
        print(content)
    except FileNotFoundError:
        print(f"Error: File '{filename}' not found.")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="List files in a directory.")
parser.add_argument("-d", "--directory", required=True, help="Path to the directory to list files from")
parser.add_argument("-c", "--csv", required=True, help="Filename of the csv output")
args = parser.parse_args()
directory_path = args.directory
csv_file = args.csv
parse_html_files(directory_path)
if csv_file:
write_csv(csv_file)
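# Example invocation (paths are illustrative):
#   python3 eyewitness_parser.py -d ./eyewitness_report -c grouped_targets.csv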