This repository was archived by the owner on Feb 22, 2025. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrape_epadata.php
More file actions
executable file
·94 lines (76 loc) · 2.72 KB
/
scrape_epadata.php
File metadata and controls
executable file
·94 lines (76 loc) · 2.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/php -q
<?php
//taken from: http://us3.php.net/manual/en/function.preg-replace.php#87058
/**
 * Turn an arbitrary string into a lowercase, filename-safe token.
 *
 * Non-word characters are stripped from both ends, every run of inner
 * whitespace collapses to a single underscore, and any remaining non-word
 * character is dropped. Note that "-" is a non-word character for \W, so
 * hyphens are removed as well; only word characters (letters, digits and
 * underscores) survive.
 *
 * @param string $word Text to convert.
 * @return string The lowercased, cleaned-up text.
 */
function string_to_filename($word) {
    // With array arguments preg_replace() runs the pattern/replacement pairs
    // in order, each one on the result of the previous pair.
    $patterns = array(
        '/^\W+|\W+$/', // trim non-word characters at both ends
        '/\s+/',       // inner whitespace run -> "_"
        '/\W/',        // drop whatever non-word characters remain
    );
    $replacements = array('', '_', '');
    $cleaned = preg_replace($patterns, $replacements, $word);
    return strtolower($cleaned);
}
/**
 * Download a URL straight into a local file using cURL.
 *
 * The target file is created (or truncated) and the response body is streamed
 * into it. If anything goes wrong the partially written file is removed so a
 * later file_exists() check does not mistake it for a finished download.
 *
 * @param string $fileurl URL to fetch.
 * @param string $file    Path of the local file to write.
 * @return bool true when the transfer succeeded and the file was written,
 *              false otherwise.
 */
function getFile($fileurl, $file)
{
    // Bail out early (without a PHP warning) when the target directory is missing.
    if (!is_dir(dirname($file))) {
        return false;
    }

    // "b": the payload is binary (zip archives), never translate line endings.
    $fp = fopen($file, "wb");
    if ($fp === false) {
        return false;
    }

    $ch = curl_init();
    if ($ch === false) {
        fclose($fp);
        unlink($file);
        return false;
    }

    curl_setopt($ch, CURLOPT_FILE, $fp);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_URL, $fileurl);
    // Without these two options the body of a redirect or of an HTTP error
    // page is written to the file in place of the requested content.
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_FAILONERROR, true);

    // With CURLOPT_FILE set, curl_exec() returns true/false rather than the body.
    $transferred = curl_exec($ch);
    curl_close($ch);
    $closed = fclose($fp);

    if ($transferred === false || !$closed) {
        if (file_exists($file)) {
            unlink($file);
        }
        return false;
    }

    return true;
}
// EPA.gov Toxics Release Inventory (TRI) data.
// Pick the index page for the requested year and collect the names of the
// archive files it links to. The download step below reads $siteurl (base URL
// of the archives) and $urls[1] (archive names relative to that base).
$years = array('2005', '2006', '2007');

// $argv[1] is missing when the script is started without arguments; treat
// that as an invalid year instead of raising an undefined-offset notice.
$year = isset($argv[1]) ? $argv[1] : '';
if (!in_array($year, $years, true)) {
    die('ERROR: Invalid Year - we only support 2005 - 2007'."\n");
}

// Per year: the index page to scrape, the base URL the archives are fetched
// from, and the href prefix that marks an archive link on the index page.
$sources = array(
    '2005' => array(
        'index' => 'http://www.epa.gov/tri/tridata/tri05/data/',
        'base'  => 'http://www.epa.gov/tri/tridata/tri05/data/',
        'href'  => '../../tri05/data/',
    ),
    '2006' => array(
        'index' => 'http://www.epa.gov/tri/tridata/tri06/data/',
        'base'  => 'http://www.epa.gov/tri/tridata/tri06/data/',
        'href'  => '../../tri06/data/',
    ),
    // NOTE(review): the original scraped the tri05 index for 2007 while
    // looking for "Statedata07/" links - presumably a copy-paste slip, so the
    // tri07 index is used here. Confirm against the live site. The original
    // also ran a "../../tri07/data/" match whose result was immediately
    // overwritten by the "Statedata07/" match; only the latter is kept.
    '2007' => array(
        'index' => 'http://www.epa.gov/tri/tridata/tri07/data/',
        'base'  => 'http://www.epa.gov/tri/tridata/tri07/data/Statedata07/',
        'href'  => 'Statedata07/',
    ),
);

$source = $sources[$year];
$siteurl = $source['base'];

$response = file_get_contents($source['index']);
if ($response === false) {
    die('ERROR: Could not fetch index page '.$source['index']."\n");
}

// preg_quote() keeps the dots of "../../" literal, and [^"]+ stops the capture
// at the closing quote of the href instead of running to the last "> on the line.
$link_pattern = '#<td width="25%"><a href="'.preg_quote($source['href'], '#').'([^"]+)">#';
$urls = array(array(), array());
preg_match_all($link_pattern, $response, $urls);
if (empty($urls[1])) {
    echo 'WARNING: No archive links found on '.$source['index']."\n";
}
// Download and unpack every archive that was found on the index page.
$archive_names = isset($urls[1]) ? $urls[1] : array();
foreach ($archive_names as $url_str) {
    // The name was scraped from remote HTML, so it is untrusted: only its last
    // path component is used as the local file name (wget, by default, stores
    // the download under that name in the current directory).
    $local_file = basename($url_str);
    if ($local_file === '' || $local_file === '.' || $local_file === '..') {
        continue;
    }

    // if we already have the file don't download it again
    if (file_exists($local_file)) {
        continue;
    }

    // TODO: getFile() saved the file but the resulting zip was broken, so wget is used for now.
    // escapeshellarg() keeps scraped text from being interpreted by the shell.
    $output = array();
    $status = 0;
    exec('wget '.escapeshellarg($siteurl.$url_str), $output, $status);
    if ($status !== 0 || !file_exists($local_file)) {
        echo 'Error downloading file: '.$url_str."\n";
        // Remove a partial download so that the next run retries it.
        if (file_exists($local_file)) {
            unlink($local_file);
        }
        continue;
    }

    // exec()'s return value is only the last line of output, so success is
    // judged by the exit status instead. unzip exits with 1 when it merely
    // emitted warnings but still completed; anything above that is a failure.
    $output = array();
    exec('unzip '.escapeshellarg('./'.$local_file), $output, $status);
    if ($status > 1) {
        echo 'Error unzipping file: '.$local_file."\n";
    }
}
// we got the files and have unzipped them; convert every tab-separated
// .txt file in the current directory into a per-state CSV file.

/**
 * Convert one tab-separated line into a fully quoted CSV record.
 *
 * Every field is wrapped in double quotes and the record ends with CRLF.
 * Double quotes inside a field are doubled so the record stays parseable.
 * The line is trim()med first, which removes the line ending but also any
 * leading/trailing tabs - i.e. empty first/last columns are dropped (kept
 * as the original script behaved).
 *
 * @param string $line One raw line of the input file, line ending included.
 * @return string The CSV record, terminated by "\r\n".
 */
function tsv_line_to_csv_record($line)
{
    $fields = explode("\t", trim($line));
    $fields = str_replace('"', '""', $fields);
    return "\"" . implode('","', $fields) . "\"\r\n";
}

$files = glob("*.txt");
if ($files === false) {
    $files = array();
}

// NOTE(review): $file_type was never assigned in the original script, so it
// always expanded to an empty string (with a notice). It is set explicitly
// here to keep the output file names unchanged - confirm whether a per-file
// type suffix was intended.
$file_type = '';

foreach ($files as $file) {
    // The text before the first "_" of the file name is used as the state code.
    $name_parts = explode("_", $file);
    $state_str = $name_parts[0];

    $in = fopen($file, "r");
    if ($in === false) {
        echo 'Error opening file: '.$file."\n";
        continue;
    }

    // Append mode: every input file with the same state prefix ends up in the
    // same CSV. Re-running the script therefore appends the rows again.
    $csv_name = $state_str."_".$year.$file_type.".csv";
    $out = fopen($csv_name, "a");
    if ($out === false) {
        echo 'Error opening file: '.$csv_name."\n";
        fclose($in);
        continue;
    }

    // Stream line by line instead of loading the whole file into memory.
    while (($line = fgets($in)) !== false) {
        fwrite($out, tsv_line_to_csv_record($line));
    }

    fclose($in);
    fclose($out);
}