-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathexport.py
More file actions
118 lines (73 loc) · 3.31 KB
/
export.py
File metadata and controls
118 lines (73 loc) · 3.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import sqlite3
import sys
import os
import urllib.parse
# Export the urls of a given bucket (stored within the saved db) to several txt files
class awsExport:
    """Interactively export the stored key urls of one bucket to text files.

    Instantiating the class runs the whole workflow: it opens the SQLite
    database, asks for the bucket, the output folder and an optional filter
    on stdin, and then writes the matching urls in pages of at most
    ``URLS_PER_FILE`` lines per file.
    """

    # Maximum number of urls written to a single output file.
    URLS_PER_FILE = 50000

    def __init__(self):
        """Prompt for the export settings and run the export immediately."""
        self.bucketID = None
        self.bucketUrl = None
        self.writeDir = None
        self.expFilter = None
        self.initDatabase()
        self.getBucketToExport()
        self.initWriteDirectory()
        self.initFilter()
        self.exportBucket()

    def initDatabase(self):
        """Open the SQLite database file and keep a cursor on it."""
        self.db = "test.db"
        self.con = sqlite3.connect(self.db)
        self.cur = self.con.cursor()

    def getBucketToExport(self):
        """Ask for a bucket and load its row from the ``Buckets`` table.

        Exits the program with an error message when no row matches.
        """
        bucketInput = input("Bucket to export:")
        self.cur.execute("SELECT * FROM Buckets WHERE bucket=?", [bucketInput])
        bucketRow = self.cur.fetchone()
        # Column 0 is used as the bucket id and column 1 as the bucket url;
        # NOTE(review): this relies on the column order of Buckets - confirm.
        if bucketRow is not None and bucketRow[1] == bucketInput:
            self.bucketID = bucketRow[0]
            self.bucketUrl = bucketRow[1]
            print("found bucket!")
        else:
            sys.exit(f"The bucket '{bucketInput}' was not found in the db")

    def initWriteDirectory(self):
        """Ask for the folder that will receive the exported files."""
        self.writeDir = input("Input the folder to write the urls:")

    def initFilter(self):
        """Ask for an optional substring filter; empty input means no filter."""
        filterInput = input("Type in any filter for the export. If you don't want to filter, press enter:")
        if filterInput != "":
            self.expFilter = filterInput

    def writeUrls(self, urlArray, page):
        """Append one page of urls to ``<writeDir>/<folder name>_<page>.txt``.

        Each key is percent-encoded and prefixed with the bucket url. The
        folder is created when it does not exist yet.
        """
        # An empty answer falls back to the current directory instead of "/".
        directory = self.writeDir or os.curdir
        # Only the last path component is used as file prefix, so nested,
        # absolute or trailing-slash folders still give a valid file name.
        filePrefix = os.path.basename(os.path.abspath(directory))
        writeFile = os.path.join(directory, f"{filePrefix}_{page}.txt")
        os.makedirs(directory, exist_ok=True)
        # Append mode is kept on purpose: re-running an export adds to
        # existing files rather than truncating them.
        with open(writeFile, "a", encoding="utf-8") as outFile:
            outFile.writelines(
                self.bucketUrl + urllib.parse.quote(key) + "\n" for key in urlArray
            )
        print("Wrote page: " + str(page))

    def exportBucket(self):
        """Page through the bucket's keys and write every non-empty page.

        Exits the program when no bucket has been selected.
        """
        if self.bucketID is None:
            sys.exit("A bucket url is needed to be exported")
        print("preparing to export")

        # The filter is bound as a parameter so quotes or SQL typed by the
        # user cannot alter the statement; % and _ keep their LIKE meaning.
        if self.expFilter is not None:
            query = 'SELECT keyUrl FROM Keys WHERE bucketID=? AND keyUrl LIKE ? LIMIT ? OFFSET ?'
            baseParams = [self.bucketID, f"%{self.expFilter}%"]
        else:
            query = 'SELECT keyUrl FROM Keys WHERE bucketID=? LIMIT ? OFFSET ?'
            baseParams = [self.bucketID]

        page = 0
        while True:
            offset = page * self.URLS_PER_FILE
            self.cur.execute(query, [*baseParams, self.URLS_PER_FILE, offset])
            urlArray = [row[0] for row in self.cur.fetchall()]
            if not urlArray:
                # Nothing (more) to export: do not create an empty file.
                if page == 0:
                    print("No urls matched, nothing was written")
                break
            self.writeUrls(urlArray, page)
            # A page that is not completely filled is the last one.
            if len(urlArray) != self.URLS_PER_FILE:
                break
            page += 1
        print("Finished writing urls!")
# Module-level entry point: constructing the exporter runs the whole
# interactive export, so this executes whenever the file is run or imported.
awsExport()