Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144,907 changes: 56,540 additions & 88,367 deletions data/raw/Harbor_Water_Quality.csv → data/raw/raw_data_2021_04.csv

Large diffs are not rendered by default.

842 changes: 842 additions & 0 deletions data/raw/raw_data_2021_11.csv

Large diffs are not rendered by default.

1,641 changes: 1,072 additions & 569 deletions notebooks/1.0-dra-data-wrangling.ipynb

Large diffs are not rendered by default.

1,047 changes: 939 additions & 108 deletions notebooks/2.0-dra-data-exploration.ipynb

Large diffs are not rendered by default.

475 changes: 281 additions & 194 deletions notebooks/3.1-dra-indepth-analysis.ipynb

Large diffs are not rendered by default.

11,549 changes: 9,369 additions & 2,180 deletions notebooks/3.2-dra-indepth-analysis.ipynb

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,7 @@ Sphinx
coverage
awscli
flake8
numpy
pandas
python-dotenv>=0.5.1
requests
30 changes: 0 additions & 30 deletions src/data/make_dataset.py

This file was deleted.

File renamed without changes.
File renamed without changes.
68 changes: 68 additions & 0 deletions src/data_wrangling/autopull_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import csv
from datetime import datetime
from glob import glob
import os
import pandas as pd
import re
import requests
from sodapy import Socrata

# NOTE(review): app token hardcoded in source control — should be loaded from
# the environment (python-dotenv is already in requirements.txt) and this
# token rotated, since it is now public in the repo history. TODO confirm.
APP_TOKEN = 'wHvXxdmw8Ek59fQThcHly6ulQ'
# Socrata dataset identifier on data.cityofnewyork.us (presumably the Harbor
# Water Quality dataset, given the data/raw filenames — verify).
DATASET_IDENTIFIER = '5uug-f49n'
# Module-level client created as a side effect at import time; any module
# importing this file opens a Socrata session (timeout is in seconds).
CLIENT = Socrata("data.cityofnewyork.us", APP_TOKEN, timeout=30)

def pull_new_data(
    client,
    dataset_id,
    output="../data/raw/raw_data",
    query="sample_date > '1979-12-31T00:00:00.000'",
    limit=100000
):
    """
    Pull rows from a Socrata dataset into a new dated CSV file.

    If a previous pull exists (any ``{output}*.csv`` file), the SoQL query
    is narrowed to rows newer than the last sample date found in that file,
    so only new data is downloaded. The output filename gets a ``_YYYY_MM``
    suffix, so at most one pull per calendar month is written.

    Parameters
    ----------
    client :
        A Socrata client; anything exposing ``get_all(dataset_id,
        content_type, where, limit)`` that yields CSV rows.
    dataset_id : str
        Socrata dataset identifier, e.g. ``'5uug-f49n'``.
    output : str
        Path prefix for the output file; ``_YYYY_MM.csv`` is appended.
    query : str
        Default SoQL ``where`` clause, used when no earlier pull is found.
    limit : int
        Page size passed through to ``client.get_all``.

    Returns
    -------
    str or None
        An explanatory message when this month's file already exists,
        otherwise ``None`` after writing the file.
    """
    # Resume from the newest sample date found in any earlier pull.
    for filename in glob(f"{output}*.csv"):
        with open(filename) as f:
            lines = f.readlines()
        if not lines:
            # Empty file — nothing to resume from; keep the default query.
            continue
        match = re.search("([0-9]){4}-([0-9]).*", lines[-1])
        if match is None:
            # Last line has no recognizable date (malformed or header-only
            # file) — keep the default query rather than crashing.
            continue
        max_date = match[0].split(",")[0]
        query = f"sample_date > '{max_date}'"

    # One output file per calendar month.
    output = output + datetime.now().strftime("_%Y_%m.csv")

    if os.path.exists(output):
        return "Data has already been pulled this month. Try again later."

    with open(output, "w", newline="") as f:
        # Create the writer once; the original rebuilt it for every page.
        writer = csv.writer(f)
        for page in client.get_all(
                dataset_id, content_type='csv',
                where=query,
                limit=limit):
            writer.writerow(page)

def combine_old_and_new_data(
    old_data,
    new_data,
    output_filename=None
):
    """
    Concatenate two raw CSV files and delete the old one.

    Parameters
    ----------
    old_data : str
        Path to the existing raw CSV; removed after the merge succeeds.
    new_data : str
        Path to the freshly pulled CSV. Overwritten with the combined data
        when ``output_filename`` is not given.
    output_filename : str, optional
        Where to write the combined CSV; defaults to ``new_data``.
    """
    frames = [pd.read_csv(path) for path in (old_data, new_data)]
    merged = pd.concat(frames)

    destination = output_filename if output_filename else new_data
    merged.to_csv(destination)

    if os.path.exists(old_data):
        os.remove(old_data)
        # Announce the cleanup once the stale file is gone.
        print("The file: {} is deleted!".format(old_data))
60 changes: 30 additions & 30 deletions src/data/clean_dataset.py → src/data_wrangling/clean_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import os
import pandas as pd

path = os.path.expanduser('~/Projects/capstone-two/data/raw/Harbor_Water_Quality.csv')
path = os.path.expanduser('../../data/raw/Harbor_Water_Quality.csv')
df = pd.read_csv(path, parse_dates=['Sample Date', 'Sample Time'])


Expand All @@ -20,12 +20,12 @@
survey_stations = ['K1', 'K2', 'K3', 'K4', 'K5', 'K5A', 'K6',
'N1', 'N3B', 'N4', 'N5', 'N6', 'G2', 'N7', 'N8',
'N9', 'N16', 'NR1', 'E2', 'E4', 'E6', 'E7', 'E8',
'E10', 'E11', 'E12', 'E13', 'E14', 'E15', 'J1',
'E10', 'E11', 'E12', 'E13', 'E14', 'E15', 'J1',
'J2', 'J3', 'J5', 'J7', 'J8', 'J9A', 'J10', 'J11',
'J12', 'JA1', 'N9A', 'H3', 'J14', 'J16', 'AC1',
'AC1', 'AC2', 'BB2', 'BB4', 'BR1', 'BR3', 'BR5',
'CIC2', 'CIC3', 'F1', 'F5', 'FB1', 'FLC1', 'FLC2',
'GB1', 'GC3', 'GC4', 'GC5', 'GC6', 'HC1', 'HC2',
'GB1', 'GC3', 'GC4', 'GC5', 'GC6', 'HC1', 'HC2',
'HC3', 'HR1', 'HR2', 'HR03', 'LN1', 'NC0', 'NC1',
'NC2', 'NC3', 'PB2', 'PB3', 'SP1', 'SP2', 'WC1',
'WC2', 'WC3'
Expand All @@ -46,13 +46,13 @@
df = df.drop(columns=col)
elif len(df[df[col].notnull()]) < 10000:
df = df.drop(columns=col)
df = df.drop(['Current Direction (Current Direction)',

df = df.drop(['Current Direction (Current Direction)',
'Wind Direction (Wind Direction)',
'Current Speed (knot)', 'Wind Speed (mph)',
'Current Speed (knot)', 'Wind Speed (mph)',
'Sea State ', 'Type',
'Enterococcus Top Sample Less Than or Greater Than Result'
],
],
axis=1
)

Expand All @@ -66,10 +66,10 @@

# Check which columns are numeric and create a list of object columns
obj_cols = []
ok_obj_cols = ['Sampling Location', 'Sample Date', 'Sample Time',
ok_obj_cols = ['Sampling Location', 'Sample Date', 'Sample Time',
'Weather Condition (Dry or Wet)'
]

for col in df:
if col not in ok_obj_cols:
try:
Expand All @@ -83,7 +83,7 @@

# Fix Secchi Disk
df['Secchi Depth (ft)'] = df['Secchi Depth (ft)'].replace('3..5', '3.5')

# Fix Bottom PH
df['Bottom PH'] = df['Bottom PH'].replace('N', np.nan)

Expand Down Expand Up @@ -138,7 +138,7 @@ def drop_less(string):

less_than_cols = ['Top Nitrate/Nitrite (mg/L)', 'Top Ammonium (mg/L)',
'Top Ortho-Phosphorus (mg/L)', 'Top Silica (mg/L)',
'Total Phosphorus(mg/L)'
'Total Phosphorus(mg/L)'
]

for col in less_than_cols:
Expand All @@ -155,7 +155,7 @@ def drop_less(string):
pd.to_numeric(i)
except:
df['Long'] = df['Long'].replace(i, np.nan)

# Fix Latitude and Longitude mistakes
for i, lat in enumerate(df['Lat']):
try:
Expand Down Expand Up @@ -191,27 +191,27 @@ def col_strip(string):
df.at[i, 'Lat'] = lat
df.at[i, 'Long'] = long



















# Save df to csv file
outpath = os.path.expanduser(
'~/Projects/capstone-two/data/processed/Clean_Harbor_Water_Quality.csv'
)

df.to_csv(outpath)
df.to_csv(outpath)
7 changes: 7 additions & 0 deletions tests/data_wrangling/test_autopull_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
import mock
import pytest

from data_wrangling import autopull_data


# patch data pull
12 changes: 12 additions & 0 deletions tox.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# tox (https://tox.readthedocs.io/) is a tool for running tests
# in multiple virtualenvs. This configuration file will run the
# test suite on all supported python versions. To use it, "pip install tox"
# and then run "tox" from this directory.

[tox]
envlist = py37

[testenv]
deps =

commands = python -m unittest discover