-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathdb_initial.py
More file actions
197 lines (160 loc) · 8.92 KB
/
db_initial.py
File metadata and controls
197 lines (160 loc) · 8.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
#Initializes the database by downloading/converting files from Pub Safe and then populating the database with the extracted information. It then dumps the JSON from the database for use by incident_generator_class.py.
#Written for RPI Incident Map
import urllib2, sys, subprocess, os, datetime, re, geocoder, json, pytest, urlparse, lxml.html
from pymongo import MongoClient
from mapbox import Geocoder
from bson import Binary, Code, json_util
from bson.json_util import dumps
#Since Pub Safe uses the first three letters of the month (capitalized) followed by an _ and then the last two digits of the year for the .pdf naming convention. I use __dateFormat to hold that string for the current month
#Example value: "NOV_16" for November 2016
__dateFormat = datetime.datetime.now().strftime("%b").upper() + "_" + str(datetime.datetime.now().year % 100)
#Connect to the proper set of posts in the incident database's collection.
#NOTE(review): MongoClient() with no arguments connects to a local MongoDB on the default port -- confirm deployment matches
__connection = MongoClient()
__db = __connection["incident-db"]
__collection = __db["incident-collection"]
#All incident documents created by createDatabase() are written to this posts collection
__posts = __db.posts
#Download the .pdf for the current month from Pub Safe by opening the URL and dumping it into a PDF. Then use Ghost Script (gs) to convert it to a text file
def downloadAndConvertFile(download_url):
    """Download a blotter PDF and convert it to text with Ghostscript.

    download_url -- full URL of a Pub Safe blotter PDF, e.g.
        http://www.rpi.edu/dept/public_safety/blotter/NOV_16.pdf
    Returns 0 on completion (return-code convention used throughout this module).
    """
    #Derive the bare "MON_YY" name by stripping the base URL and the .pdf extension
    filePath = download_url.replace("http://www.rpi.edu/dept/public_safety/blotter/", "").replace(".pdf", "")
    response = urllib2.urlopen(download_url)
    #Context manager guarantees the PDF file is closed even if the write fails
    with open("./pdfs/{fp}.pdf".format(fp = filePath), "wb") as pdfFile:
        pdfFile.write(response.read())
    #Run Ghostscript through subprocess (already imported) with an argument list instead
    #of os.system, so the filename is never interpolated into a shell command string;
    #stdout is discarded just like the original's "1> /dev/null" redirection
    with open(os.devnull, "w") as devnull:
        subprocess.call(["gs", "-sDEVICE=txtwrite",
                         "-o", "./pdfs/{fp}.txt".format(fp = filePath),
                         "./pdfs/{fp}.pdf".format(fp = filePath)],
                        stdout = devnull)
    return 0
def aquireBacklog():
    """Scrape the Pub Safe blotter index and download/convert every PDF not yet in ./pdfs.

    Returns 0 on completion.
    """
    base_url = "http://www.rpi.edu/dept/public_safety/blotter/"
    #Fetch the index page and parse the HTML into an element tree
    page = lxml.html.fromstring(urllib2.urlopen(base_url).read())
    #EXSLT regular-expression namespace lets the XPath match hrefs ending in ".pdf"
    #case-insensitively
    exslt_ns = {"re": "http://exslt.org/regular-expressions"}
    for link in page.xpath('//a[re:test(@href, "\.pdf$", "i")]', namespaces=exslt_ns):
        href = link.attrib["href"]
        #Skip anything we have already downloaded; otherwise fetch and convert it
        if not os.path.exists("./pdfs/{fp}".format(fp = href)):
            downloadAndConvertFile(urlparse.urljoin(base_url, href))
    return 0
#Allows the creation of one month's JSON file for Kit's code. It takes in a string of the format "XXX_YY" with no extension. It must also be a string. Example: DEC_16
def createBacklogMonth(aDate):
    """Build and dump the JSON for a single month if its file does not already exist.

    aDate -- month string in "MON_YY" form with no extension, e.g. "DEC_16".
    Returns 0.
    """
    if not os.path.exists("./pdfs/{fp}.json".format(fp = aDate)):
        #Clear any leftover posts first so the dump contains only this month's
        #incidents -- matches the behavior of createBacklogYear()
        __posts.delete_many({})
        createDatabase(aDate)
        dumpJSON(aDate)
    return 0
#Allows the creation of an entire year's worth of JSONs for Kit's code. Take in an integer in the format XXXX. Example: 2016
def createBacklogYear(aYear):
    """Build and dump one JSON file per month for the given four-digit year.

    aYear -- integer year, e.g. 2016. Months whose JSON already exists are skipped.
    Returns 0.
    """
    for monthNum in range(1, 13):
        #Build the "MON_YY" name Pub Safe uses for its monthly blotter files
        date = datetime.date(aYear, monthNum, 1).strftime("%b").upper() + "_" + str(aYear % 100)
        if not os.path.exists("./pdfs/{fp}.json".format(fp = date)):
            #Empty the collection so each month's dump contains only that month's
            #incidents (the delete result, unused in the original, is discarded)
            __posts.delete_many({})
            createDatabase(date)
            dumpJSON(date)
    return 0
def createDatabase(fileName):
    """Parse ./pdfs/<fileName>.txt (a Ghostscript text dump of a Pub Safe blotter
    PDF) and insert one MongoDB post per incident into __posts.

    fileName -- month string in "MON_YY" form, e.g. "DEC_16".
    Each labelled field ("date reported:", "location :", ...) is extracted with a
    regular expression. "disposition:" is the last field of every incident record,
    so encountering it completes a record and triggers the insert.
    Returns 0.
    """
    #Field accumulators: they start as empty lists/strings and keep their previous
    #value when a record is missing a field line
    reportNum = []; dateReported = []; location = []; eventNum = []; dateTimeFromTo = []; incident = []; disposition = []; coords = []; month = "";
    seenDisposition = False
    #Build the Mapbox geocoder once, outside the loop -- the access token never
    #changes. Named mapboxGeocoder so it no longer shadows the imported `geocoder`
    #module as the original local did.
    mapboxGeocoder = Geocoder(access_token="pk.eyJ1Ijoic3NjaGF0dHMiLCJhIjoiY2l1NDdib3N1MGl2MTJwbGhycnNqNGYxciJ9.uCMQ9n7xQCjRvRMnmFrLrw")
    #Context manager guarantees the text file is closed (the original never closed it)
    with open("./pdfs/{fn}.txt".format(fn = fileName), "r") as file:
        for line in file: #For every line in the text file
            #Each regex captures the text after a field label up to the next label
            #(or to the end of the line when there is no following label)
            if re.findall(r"date reported:.* (?=location)", line):
                dateReported = "".join(re.findall(r"date reported:.* (?=location)", line))
                dateReported = dateReported.replace("date reported:","").strip().rstrip()
                #First two characters of the reported date are the numeric month ("MM")
                month = dateReported[:2]
            if re.findall(r"location :.* (?=event)", line):
                location = ''.join(re.findall(r"location :.* (?=event)", line))
                location = location.replace("location :","").strip()
                #Using the location's name, ask Mapbox for its latitude and longitude
                response = mapboxGeocoder.forward("{loc}, Troy, New York 12180, United States".format(loc = location))
                first = response.geojson()["features"][0]
                #GeoJSON coordinates are [lon, lat]; store them as [lat, lon]
                coords = [first["geometry"]["coordinates"][1], first["geometry"]["coordinates"][0]]
            if re.findall(r"event #:.*", line):
                eventNum = "".join(re.findall(r"event #:.*", line))
                eventNum = eventNum.replace("event #:","").strip().rstrip()
            if re.findall(r"date and time occurred from - occurred to:.*", line):
                dateTimeFromTo = "".join(re.findall(r"date and time occurred from - occurred to:.*", line))
                dateTimeFromTo = dateTimeFromTo.replace("date and time occurred from - occurred to:","").strip().rstrip()
            if re.findall(r"incident :.* (?=report #:)", line):
                incident = "".join(re.findall(r"incident :.* (?=report #:)", line))
                incident = incident.replace("incident :","").strip()
            if re.findall(r"report #:.*", line):
                reportNum = "".join(re.findall(r"report #:.*", line))
                reportNum = reportNum.replace("report #:","").strip()
            if re.findall(r"disposition:.*", line):
                disposition = "".join(re.findall(r"disposition:.*", line))
                #BUG FIX: the original stripped the literal "disposition: :", which never
                #matches the captured text, so the "disposition:" label was left in the
                #stored value; strip the actual label instead
                disposition = disposition.replace("disposition:","").strip().rstrip()
                seenDisposition = True
            if seenDisposition:
                #Write the formatted information into a properly formatted post for the MongoDB
                post = {"date reported": dateReported,
                        "month": month,
                        "location": location,
                        "event #": eventNum,
                        "date and time occurred from to occurred to": dateTimeFromTo,
                        "incident": incident,
                        "report #": reportNum,
                        "coordinates": coords,
                        "disposition": disposition}
                #Insert the post into the database's collection's posts
                #(pymongo's insert() is deprecated in favor of insert_one(); kept for
                #compatibility with the pymongo version this script was written against)
                __posts.insert(post)
                seenDisposition = False
    return 0
#Dumps the posts from the database collection into a json file named with the month and year i.e. NOV_16
def dumpJSON(fileName):
    """Serialize every post currently in __posts to ./pdfs/<fileName>.json.

    fileName -- month string in "MON_YY" form, e.g. "NOV_16".
    Returns 0.
    """
    #Gets a list of all of the posts that are in the database. Each post is an incident.
    docsList = list(__posts.find())
    #json_util.default teaches json.dumps how to serialize BSON-specific types
    #(ObjectId, datetimes) that a plain dump would reject
    jsonDocs = json.dumps(docsList, default=json_util.default, indent=4, separators=(",", ": "))
    #Context manager guarantees the file is closed even if the write raises
    with open("./pdfs/{fn}.json".format(fn = fileName), "w+") as jsonFile:
        jsonFile.write(jsonDocs)
    return 0
#Returns the path of the JSON created for the current month. Needed for part of Kit's code.
def filename():
    """Return the relative path of the current month's JSON dump, e.g. "./pdfs/NOV_16.json".

    The original computed unused year/month locals; they are removed here.
    """
    return "./pdfs/{fn}.json".format(fn = __dateFormat)
#Assert statements to test that all functions run and return the proper result
def testDownloadAndConvertFile():
    """Download/convert the current month's PDF and check the 0 return code."""
    pdf_dir = r"./pdfs"
    if not os.path.exists(pdf_dir):
        os.makedirs(pdf_dir)
    current_url = "http://www.rpi.edu/dept/public_safety/blotter/{fn}.pdf".format(fn = __dateFormat)
    assert downloadAndConvertFile(current_url) == 0
def testCreateDatabase():
    """Parsing DEC_16's text dump should succeed and return 0."""
    result = createDatabase("DEC_16")
    assert result == 0
def testDumpJSON():
    """Dumping the posts collection for DEC_16 should succeed and return 0."""
    result = dumpJSON("DEC_16")
    assert result == 0
def testFilename():
    """filename() should yield a string path."""
    json_path = filename()
    assert isinstance(json_path, str)
def testAquireBacklog():
    """Scraping and backfilling the PDF backlog should succeed and return 0."""
    pdf_dir = r"./pdfs"
    if not os.path.exists(pdf_dir):
        os.makedirs(pdf_dir)
    assert aquireBacklog() == 0
def testCreateBacklogYear():
    """Building all of 2016's monthly JSON files should succeed and return 0."""
    result = createBacklogYear(2016)
    assert result == 0
def testCreateBacklogMonth():
    """Building a single month's JSON file should succeed and return 0."""
    result = createBacklogMonth("DEC_16")
    assert result == 0
#How another file can populate the database and dump the JSON
def runDB():
    """Full refresh: backfill all historical months, then rebuild the current month.

    Returns 0 on completion.
    """
    pdf_dir = r"./pdfs"
    if not os.path.exists(pdf_dir):
        os.makedirs(pdf_dir)
    #Grab any historical PDFs we are missing, then build 2016's monthly JSON files
    aquireBacklog()
    createBacklogYear(2016)
    #Re-download, re-parse, and re-dump the current month from scratch
    current_url = "http://www.rpi.edu/dept/public_safety/blotter/{fn}.pdf".format(fn = __dateFormat)
    downloadAndConvertFile(current_url)
    createDatabase(__dateFormat)
    dumpJSON(__dateFormat)
    return 0
#Running this file directly performs the full download/parse/dump cycle
if __name__ == "__main__":
    runDB()