-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprototype.py
More file actions
125 lines (100 loc) · 4.06 KB
/
prototype.py
File metadata and controls
125 lines (100 loc) · 4.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import codecs
import csv
import io
import re
import urllib
import urllib.request

import numpy as np
import pandas as pd
import requests
import soupsieve as sv
from bs4 import BeautifulSoup
# Batch scraper: for every URL in the first column of out5.csv, download the
# page, locate the 'Inmate Mailing Address:' label among the page's text
# strings, and pull the facility name, street address and 5-digit zip code
# from the strings that follow it. Facilities whose name is not already
# listed in customers4.csv are appended to customers5.csv.
#
# Pages that cannot be parsed are reported on stdout and skipped; network
# errors raised by urlopen are not caught and still abort the run.

MAILING_ADDRESS_MARKER = 'Inmate Mailing Address:'
OUTPUT_FIELDNAMES = ['name', 'address', 'zip']
# The greedy leading ".*" makes this capture the rightmost run of five
# consecutive digits in the string.
ZIP_PATTERN = re.compile(r'^.*(?P<zipcode>\d{5}).*$')


def _to_ascii(text):
    """Return *text* with every non-ASCII character dropped."""
    return text.encode("ascii", "ignore").decode()


# customers4.csv is never modified by this script, so it is read once up
# front instead of once per URL; a set gives O(1) membership checks.
known_names = set(
    pd.read_csv("customers4.csv", names=OUTPUT_FIELDNAMES)["name"].to_list()
)

with open('out5.csv', 'rt', newline='') as f1:
    csv_reader = csv.reader(f1, escapechar='\\')
    for row in csv_reader:
        if not row:
            # csv.reader yields [] for blank lines; there is no URL to fetch.
            continue
        url = row[0]

        # Close the HTTP response as soon as the body has been read.
        with urllib.request.urlopen(url) as response:
            html_doc = response.read()
        soup = BeautifulSoup(html_doc, 'html.parser')
        holding = list(soup.strings)

        try:
            index1 = holding.index(MAILING_ADDRESS_MARKER)
        except ValueError:
            print(url+" Can't find inmate mailing address. Check me!")
            # Skip only this page (same policy as the zip-code failure
            # below) rather than abandoning every remaining URL.
            continue

        index2 = index1 + 2  # this should refer to facility name
        index3 = index2 + 1  # this should refer to street address
        index4 = index3 + 1  # this should refer to city state and zip
        if index4 >= len(holding):
            # The label sits too close to the end of the page for the three
            # expected address strings to exist.
            print(url+" Address lines missing after mailing address label. Check me!")
            continue

        zip_match = ZIP_PATTERN.match(_to_ascii(holding[index4]))
        if zip_match is None:
            print(url+" Zipcode issue, check me.")
            continue
        actual_zip = zip_match.group('zipcode')

        decoded_name = _to_ascii(holding[index2])
        decoded_address = _to_ascii(holding[index3])

        # NOTE(review): the duplicate check uses the raw page string while
        # the ASCII-stripped name is what gets written; names containing
        # non-ASCII characters may therefore never match -- confirm intent.
        if str(holding[index2]) in known_names:
            continue

        with open("customers5.csv", "a", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=OUTPUT_FIELDNAMES)
            writer.writerow({'name': decoded_name, 'address': decoded_address, 'zip': actual_zip})
# ---------------------------------------------------------------------------
# Earlier single-URL version of the scraper, kept commented out for
# reference. It fetches one hard-coded page and walks through the same steps
# (collect page strings, find the mailing-address label, extract name /
# street / zip, append to a CSV) with debug prints along the way.
# ---------------------------------------------------------------------------
#html_doc = urllib.request.urlopen('https://www.prisonpro.com/content/eastern-correctional-institution-annex').read()
#soup = BeautifulSoup(html_doc, 'html.parser')
#print(soup.prettify())
#head_tag = soup.head
#print(head_tag.contents)
#title_tag = head_tag.contents[0]
#print(title_tag)
#for child in head_tag.descendants:
#print(child)
#holding = []
#f = codecs.open('temptext.txt', encoding='utf-8',mode='w+')
#for string in soup.strings:
#holding.append(string)
#print(holding)
#print(len(holding))
#index1 = holding.index('Inmate Mailing Address:')
#print(index1)
#index2 = index1 + 2 #this should refer to facility name
#index3 = index2 + 1 #this should refer to street address
#index4 = index3 + 1 #this should refer to city state and zip
#print(holding[57]) #this is index above of reference point in each website
#print(holding[58])
#print(holding[59]) #this is the facility name
#print(index2)
#print(holding[60]) #this is street address
#print(index3)
#print(holding[61]) #this is city state and zip
#print(index4)
# Progress marker; the only live statement in this section.
print("ok nearly")
#below is so that zip code can be extracted from city state zip entry
#address = holding[index4]
#address = holding[61]
#actual_zip = re.match('^.*(?P<zipcode>\d{5}).*$', address).groupdict()['zipcode']
#print(actual_zip) #now we have zip code extracted from address
#actual_name = holding[59]
#actual_name = holding[index2]
#below is in case string from fac name needs to be stripped of non-ASCII
#encoded_name = actual_name.encode("ascii", "ignore")
#decoded_name = encoded_name.decode()
#print(decoded_name)
#THE BELOW APPENDS TO CSV WOOOOOOT, now also writes to every line with no blank in between
#file = open('customers3.csv', 'a', newline='')
#fieldnames = ['name', 'address', 'zip']
#writer = csv.DictWriter(file, fieldnames=fieldnames)
#writer.writeheader()
#writer.writerow({'name': decoded_name, 'address': holding[index3], 'zip': actual_zip})
#with open("out4.csv", "a", newline="") as f: