-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathFileRecovery.py
More file actions
273 lines (244 loc) · 11.9 KB
/
FileRecovery.py
File metadata and controls
273 lines (244 loc) · 11.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
#!/usr/bin/env python3
#
# FileRecovery.py
#
# Version 1.0 1/4/21
#
# Required input: disk image
# Developed using Python 3.8.6
#
# Usage:
# ./FileRecovery.py diskimage.dd
# OR
# python3 FileRecovery.py diskimage.dd
#
# Current Supported Recovered File Types:
# MPG, PDF, GIF, JPG, DOCX, AVI, PNG
#
# Description:
# Recovers files from a disk image using regex searches for
# header and footer patterns.
# Outputs file name, start offset, end offset, and sha-256 hash.
import re # used for regex queries
import struct # used to convert hex bytes to long integer
import binascii # used to convert hex strings to bytes
import argparse # parses command line arguments
import hashlib # used to calculate hashes
import sys # used to exit upon error
# List of file signatures
#
# Each item in the list uses the following format:
# [file extension, header (in hex bytes), footer (in hex bytes)]
#
# If None is in the place of the footer, this indicates that file type
# does not have a footer and other means (such as a file size)
# must be used to find the end of the file.
#
# If there are multiple types of headers/footers for a file type,
# multiple entries can be made in the signatures list.
signatures = [
    # Each entry is [extension, header pattern, footer pattern or None].
    # Headers and footers are compiled as regular expressions over bytes,
    # so a literal '.' in a pattern acts as a one-byte wildcard.

    # MPG
    ['.mpg', b'\x00\x00\x01\xB3.\x00', b'\x00\x00\x00\x01\xB7'],
    ['.mpg', b'\x00\x00\x01\xBA.\x00', b'\x00\x00\x00\x01\xB9'],

    # PDF: header "%PDF"; footer "%%EOF" with the different line endings.
    # (The list used to contain the LF...LF variant twice, which only
    # caused a redundant extra pass over the image.)
    ['.pdf', b'\x25\x50\x44\x46', b'\x0A\x25\x25\x45\x4F\x46\x0A'],
    ['.pdf', b'\x25\x50\x44\x46', b'\x0D\x0A\x25\x25\x45\x4F\x46\x0D\x0A'],
    ['.pdf', b'\x25\x50\x44\x46', b'\x0A\x25\x25\x45\x4F\x46'],
    ['.pdf', b'\x25\x50\x44\x46', b'\x0D\x25\x25\x45\x4F\x46\x0D'],

    # BMP: no footer, the end is derived from the size field in the header
    ['.bmp', b'\x42\x4D....\x00\x00\x00\x00', None],

    # GIF ("GIF87a" and "GIF89a")
    ['.gif', b'\x47\x49\x46\x38\x37\x61', b'\x00\x00\x3B'],
    ['.gif', b'\x47\x49\x46\x38\x39\x61', b'\x00\x00\x3B'],

    # JPG: one entry per supported fourth header byte
    ['.jpg', b'\xFF\xD8\xFF\xE0', b'\xFF\xD9'],
    ['.jpg', b'\xFF\xD8\xFF\xE1', b'\xFF\xD9'],
    ['.jpg', b'\xFF\xD8\xFF\xE2', b'\xFF\xD9'],
    ['.jpg', b'\xFF\xD8\xFF\xE8', b'\xFF\xD9'],
    ['.jpg', b'\xFF\xD8\xFF\xDB', b'\xFF\xD9'],

    # DOCX
    ['.docx', b'\x50\x4B\x03\x04\x14\x00\x06\x00', b'\x50\x4B\x05\x06'],

    # AVI: no footer, the end is derived from the size field in the header
    ['.avi', b'\x52\x49\x46\x46....\x41\x56\x49\x20\x4C\x49\x53\x54', None],

    # PNG
    ['.png', b'\x89\x50\x4E\x47\x0D\x0A\x1A\x0A', b'\x49\x45\x4E\x44\xAE\x42\x60\x82'],
]
# Number of bytes read per chunk while hashing a file (64 KiB)
BUF_SIZE = 64 * 1024
# Layout of the size field for file types that have no footer:
#   extension -> (distance of the 4-byte little-endian size field from the
#                 header start, number of bytes the size field does not count)
# For avi, 8 bytes are added to account for the header and file size bytes.
_SIZE_FIELDS = {
    '.bmp': (2, 0),
    '.avi': (4, 8),
}

# Extra bytes that follow the docx footer pattern and still belong to the file
_DOCX_TRAILER_LEN = 18


def _next_match_start(pattern, image, pos):
    """Return the absolute offset of the first match of pattern at or after pos, or None."""
    found = pattern.search(image, pos)
    return None if found is None else found.start()


def _pdf_end(image, footer_re, offset, next_header):
    """Return the end offset of the PDF starting at offset, or None if it has no valid EOF.

    A PDF can contain several EOF markers. The valid one is the last footer
    that does not go past the start of the next PDF header (next_header).
    When next_header is None there is no later PDF header, and the last
    footer in the image is used.
    """
    end = None
    for footer in footer_re.finditer(image, offset):
        end = footer.end()
        if next_header is None:
            # No later header: keep walking so we finish on the last footer.
            continue
        if end > next_header:
            # The footer lies beyond the next PDF header, so it cannot
            # belong to the header at `offset`.
            return None
        following = _next_match_start(footer_re, image, end)
        if following is not None and following > next_header:
            # The next footer belongs to a later PDF; the current one is the EOF.
            break
    return end


def _footer_end(image, ext, header_re, footer_re, offset):
    """Return the end offset of a footer-terminated file starting at offset, or None."""
    if ext == '.pdf':
        # Search from offset + 1 so the current header itself is not found again.
        next_header = _next_match_start(header_re, image, offset + 1)
        return _pdf_end(image, footer_re, offset, next_header)

    # Every other type ends at the first footer after the header.
    footer = footer_re.search(image, offset)
    if footer is None:
        return None
    end = footer.end()
    if ext == '.docx':
        end += _DOCX_TRAILER_LEN
    return end


def _sized_end(image, ext, offset):
    """Return the end offset of a footer-less file using the size field in its header, or None."""
    layout = _SIZE_FIELDS.get(ext)
    if layout is None:
        # No footer and no known size field: the end cannot be determined.
        return None
    field_offset, uncounted = layout
    field_start = offset + field_offset
    if field_start + 4 > len(image):
        return None
    # '<I' = little-endian unsigned 32-bit integer, read in place.
    (size,) = struct.unpack_from('<I', image, field_start)
    return offset + size + uncounted


def main():
    """Carve files out of the disk image named on the command line.

    The whole image is read into memory. For every entry in `signatures`
    the image is searched for the header pattern; for each header the end
    of the file is located either through the footer pattern or, for types
    without a footer, through the size field stored in the header. Each
    carved file is written to the current directory as file<N><extension>,
    and its name, start offset, end offset and SHA-256 hash are printed.

    A header offset or an end offset that has already been used for a
    carved file is never used again.
    """
    # Command line parser
    parser = argparse.ArgumentParser()
    parser.add_argument("disk_image")
    args = parser.parse_args()

    # Read the image in binary; the context manager closes it even on error
    with open(args.disk_image, "rb") as image_file:
        image = image_file.read()

    # Header / end offsets already used by a carved file
    used_headers = set()
    used_footers = set()
    # File counter
    count = 1

    # Iterate through each of the file types in the signatures list
    for sig in signatures:
        ext, header, footer = sig[0], sig[1], sig[2]
        # DOTALL lets the '.' wildcards in the patterns match every byte
        # value, including 0x0A.
        header_re = re.compile(header, re.DOTALL)
        footer_re = None if footer is None else re.compile(footer, re.DOTALL)

        # Searching the image in place (instead of slicing it) yields
        # absolute offsets and avoids copying the remainder of the image
        # for every match.
        for header_match in header_re.finditer(image):
            offset = header_match.start()
            if offset in used_headers:
                continue

            # All per-candidate state is local to this iteration, so a
            # result from a previous header can never leak into this one.
            if footer_re is None:
                end = _sized_end(image, ext, offset)
            else:
                end = _footer_end(image, ext, header_re, footer_re, offset)

            # Skip headers without a usable end, empty carves, and end
            # offsets that already belong to a carved file.
            if end is None or end <= offset or end in used_footers:
                continue

            used_headers.add(offset)
            used_footers.add(end)

            # File name is controlled by the file counter and the
            # file extension specified in the signature
            name = 'file' + str(count) + ext
            with open(name, "wb") as file_out:
                file_out.write(image[offset:end])

            # Get sha256 hash of file
            file_hash = sha256_hash(name)
            # Increment file counter
            count += 1

            # Print file info
            print("\nFile Name: " + name)
            print("Starting Offset: " + hex(offset))
            print("End Offset: " + hex(end))
            print("SHA-256 Hash: " + file_hash)
# Scans through the file and generates the SHA-256 hash
# in chunks (64K by default) using the hashlib function
# Source:
# https://stackoverflow.com/questions/22058048/hashing-a-file-in-python
def sha256_hash(file, buf_size=None):
    """Return the SHA-256 hex digest of the file at path `file`.

    The file is read in chunks so that large files never have to be held
    in memory at once.

    Args:
        file: Path of the file to hash.
        buf_size: Chunk size in bytes. Defaults to the module-level
            BUF_SIZE when omitted.

    Returns:
        The digest as a lowercase hexadecimal string.

    Raises:
        ValueError: If buf_size is smaller than 1.
        OSError: If the file cannot be opened or read.
    """
    if buf_size is None:
        buf_size = BUF_SIZE
    if buf_size < 1:
        raise ValueError("buf_size must be at least 1")

    hasher = hashlib.sha256()
    with open(file, "rb") as hashfile:
        # iter() with a sentinel stops at the first empty read (end of file)
        for chunk in iter(lambda: hashfile.read(buf_size), b""):
            hasher.update(chunk)
    return hasher.hexdigest()
# Start carving only when this file is executed as a script,
# not when it is imported as a module
if __name__ == "__main__":
    main()