Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144,907 changes: 56,540 additions & 88,367 deletions data/raw/Harbor_Water_Quality.csv → data/raw/raw_data_2021_04.csv

Large diffs are not rendered by default.

842 changes: 842 additions & 0 deletions data/raw/raw_data_2021_11.csv

Large diffs are not rendered by default.

1,641 changes: 1,072 additions & 569 deletions notebooks/1.0-dra-data-wrangling.ipynb

Large diffs are not rendered by default.

1,047 changes: 939 additions & 108 deletions notebooks/2.0-dra-data-exploration.ipynb

Large diffs are not rendered by default.

475 changes: 281 additions & 194 deletions notebooks/3.1-dra-indepth-analysis.ipynb

Large diffs are not rendered by default.

11,549 changes: 9,369 additions & 2,180 deletions notebooks/3.2-dra-indepth-analysis.ipynb

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,7 @@ Sphinx
coverage
awscli
flake8
numpy
pandas
python-dotenv>=0.5.1
requests
30 changes: 0 additions & 30 deletions src/data/make_dataset.py

This file was deleted.

File renamed without changes.
File renamed without changes.
68 changes: 68 additions & 0 deletions src/data_wrangling/autopull_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import csv
from datetime import datetime
from glob import glob
import os
import pandas as pd
import re
import requests
from sodapy import Socrata

# NOTE(review): app token hardcoded in source control — should be loaded from
# the environment (python-dotenv is already in requirements.txt) and this
# token rotated, since it is now public in the repo history. TODO confirm.
APP_TOKEN = 'wHvXxdmw8Ek59fQThcHly6ulQ'
# Socrata dataset identifier on data.cityofnewyork.us (presumably the Harbor
# Water Quality dataset, given the data/raw filenames — verify).
DATASET_IDENTIFIER = '5uug-f49n'
# Module-level client created as a side effect at import time; any module
# importing this file opens a Socrata session (timeout is in seconds).
CLIENT = Socrata("data.cityofnewyork.us", APP_TOKEN, timeout=30)

def pull_new_data(
    client,
    dataset_id,
    output="../data/raw/raw_data",
    query="sample_date > '1979-12-31T00:00:00.000'",
    limit=100000
):
    """
    Pull rows from a Socrata dataset into a new dated CSV file.

    If a previous pull exists (any ``{output}*.csv`` file), the SoQL query
    is narrowed to rows newer than the last sample date found in that file,
    so only new data is downloaded. The output filename gets a ``_YYYY_MM``
    suffix, so at most one pull per calendar month is written.

    Parameters
    ----------
    client :
        A Socrata client; anything exposing ``get_all(dataset_id,
        content_type, where, limit)`` that yields CSV rows.
    dataset_id : str
        Socrata dataset identifier, e.g. ``'5uug-f49n'``.
    output : str
        Path prefix for the output file; ``_YYYY_MM.csv`` is appended.
    query : str
        Default SoQL ``where`` clause, used when no earlier pull is found.
    limit : int
        Page size passed through to ``client.get_all``.

    Returns
    -------
    str or None
        An explanatory message when this month's file already exists,
        otherwise ``None`` after writing the file.
    """
    # Resume from the newest sample date found in any earlier pull.
    for filename in glob(f"{output}*.csv"):
        with open(filename) as f:
            lines = f.readlines()
        if not lines:
            # Empty file — nothing to resume from; keep the default query.
            continue
        match = re.search("([0-9]){4}-([0-9]).*", lines[-1])
        if match is None:
            # Last line has no recognizable date (malformed or header-only
            # file) — keep the default query rather than crashing.
            continue
        max_date = match[0].split(",")[0]
        query = f"sample_date > '{max_date}'"

    # One output file per calendar month.
    output = output + datetime.now().strftime("_%Y_%m.csv")

    if os.path.exists(output):
        return "Data has already been pulled this month. Try again later."

    with open(output, "w", newline="") as f:
        # Create the writer once; the original rebuilt it for every page.
        writer = csv.writer(f)
        for page in client.get_all(
                dataset_id, content_type='csv',
                where=query,
                limit=limit):
            writer.writerow(page)

def combine_old_and_new_data(
    old_data,
    new_data,
    output_filename=None
):
    """
    Concatenate two raw CSV files and delete the old one.

    Parameters
    ----------
    old_data : str
        Path to the existing raw CSV; removed after the merge succeeds.
    new_data : str
        Path to the freshly pulled CSV. Overwritten with the combined data
        when ``output_filename`` is not given.
    output_filename : str, optional
        Where to write the combined CSV; defaults to ``new_data``.
    """
    frames = [pd.read_csv(path) for path in (old_data, new_data)]
    merged = pd.concat(frames)

    destination = output_filename if output_filename else new_data
    merged.to_csv(destination)

    if os.path.exists(old_data):
        os.remove(old_data)
        # Announce the cleanup once the stale file is gone.
        print("The file: {} is deleted!".format(old_data))
60 changes: 30 additions & 30 deletions src/data/clean_dataset.py → src/data_wrangling/clean_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import os
import pandas as pd

path = os.path.expanduser('~/Projects/capstone-two/data/raw/Harbor_Water_Quality.csv')
path = os.path.expanduser('../../data/raw/Harbor_Water_Quality.csv')
df = pd.read_csv(path, parse_dates=['Sample Date', 'Sample Time'])


Expand All @@ -20,12 +20,12 @@
survey_stations = ['K1', 'K2', 'K3', 'K4', 'K5', 'K5A', 'K6',
'N1', 'N3B', 'N4', 'N5', 'N6', 'G2', 'N7', 'N8',
'N9', 'N16', 'NR1', 'E2', 'E4', 'E6', 'E7', 'E8',
'E10', 'E11', 'E12', 'E13', 'E14', 'E15', 'J1',
'E10', 'E11', 'E12', 'E13', 'E14', 'E15', 'J1',
'J2', 'J3', 'J5', 'J7', 'J8', 'J9A', 'J10', 'J11',
'J12', 'JA1', 'N9A', 'H3', 'J14', 'J16', 'AC1',
'AC1', 'AC2', 'BB2', 'BB4', 'BR1', 'BR3', 'BR5',
'CIC2', 'CIC3', 'F1', 'F5', 'FB1', 'FLC1', 'FLC2',
'GB1', 'GC3', 'GC4', 'GC5', 'GC6', 'HC1', 'HC2',
'GB1', 'GC3', 'GC4', 'GC5', 'GC6', 'HC1', 'HC2',
'HC3', 'HR1', 'HR2', 'HR03', 'LN1', 'NC0', 'NC1',
'NC2', 'NC3', 'PB2', 'PB3', 'SP1', 'SP2', 'WC1',
'WC2', 'WC3'
Expand All @@ -46,13 +46,13 @@
df = df.drop(columns=col)
elif len(df[df[col].notnull()]) < 10000:
df = df.drop(columns=col)
df = df.drop(['Current Direction (Current Direction)',

df = df.drop(['Current Direction (Current Direction)',
'Wind Direction (Wind Direction)',
'Current Speed (knot)', 'Wind Speed (mph)',
'Current Speed (knot)', 'Wind Speed (mph)',
'Sea State ', 'Type',
'Enterococcus Top Sample Less Than or Greater Than Result'
],
],
axis=1
)

Expand All @@ -66,10 +66,10 @@

# Check which columns are numeric and create a list of object columns
obj_cols = []
ok_obj_cols = ['Sampling Location', 'Sample Date', 'Sample Time',
ok_obj_cols = ['Sampling Location', 'Sample Date', 'Sample Time',
'Weather Condition (Dry or Wet)'
]

for col in df:
if col not in ok_obj_cols:
try:
Expand All @@ -83,7 +83,7 @@

# Fix Secchi Disk
df['Secchi Depth (ft)'] = df['Secchi Depth (ft)'].replace('3..5', '3.5')

# Fix Bottom PH
df['Bottom PH'] = df['Bottom PH'].replace('N', np.nan)

Expand Down Expand Up @@ -138,7 +138,7 @@ def drop_less(string):

less_than_cols = ['Top Nitrate/Nitrite (mg/L)', 'Top Ammonium (mg/L)',
'Top Ortho-Phosphorus (mg/L)', 'Top Silica (mg/L)',
'Total Phosphorus(mg/L)'
'Total Phosphorus(mg/L)'
]

for col in less_than_cols:
Expand All @@ -155,7 +155,7 @@ def drop_less(string):
pd.to_numeric(i)
except:
df['Long'] = df['Long'].replace(i, np.nan)

# Fix Latitude and Longitude mistakes
for i, lat in enumerate(df['Lat']):
try:
Expand Down Expand Up @@ -191,27 +191,27 @@ def col_strip(string):
df.at[i, 'Lat'] = lat
df.at[i, 'Long'] = long



















# Save df to csv file
outpath = os.path.expanduser(
'~/Projects/capstone-two/data/processed/Clean_Harbor_Water_Quality.csv'
)

df.to_csv(outpath)
df.to_csv(outpath)
7 changes: 7 additions & 0 deletions tests/data_wrangling/test_autopull_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
import mock
import pytest

from data_wrangling import autopull_data


# patch data pull
12 changes: 12 additions & 0 deletions tox.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# tox (https://tox.readthedocs.io/) is a tool for running tests
# in multiple virtualenvs. This configuration file will run the
# test suite on all supported python versions. To use it, "pip install tox"
# and then run "tox" from this directory.

[tox]
envlist = py37

[testenv]
deps =

commands = python -m unittest discover