SamDewriter · Manny-hub · Jun 14, 2025
diff --git a/src/__pycache__/clean_data.cpython-312.pyc b/src/__pycache__/clean_data.cpython-312.pyc
diff --git a/src/__pycache__/evaluate.cpython-312.pyc b/src/__pycache__/evaluate.cpython-312.pyc
diff --git a/src/__pycache__/load_data.cpython-312.pyc b/src/__pycache__/load_data.cpython-312.pyc
diff --git a/src/clean_data.py b/src/clean_data.py
@@ -1,7 +1,42 @@
+import pandas as pd
+from load_data import load_csv
+import numpy as np
+
 def clean_sensor_data(df: pd.DataFrame) -> pd.DataFrame:
-    """
-    Clean sensor data by handling missing or invalid values.
+    """Return a cleaned copy of the sensor readings.
+
+    The ``pH`` and ``turbidity`` columns are coerced to numeric values.
+    Entries that cannot be parsed, pH values outside ``(0, 14]`` and negative
+    turbidity values are treated as invalid, and every row with an invalid
+    value in either column is dropped.
+
+    Args:
+        df: Raw sensor data containing ``pH`` and ``turbidity`` columns.
+
+    Returns:
+        A new DataFrame holding only the valid rows; ``df`` itself is left
+        unmodified.
+
+    Raises:
+        KeyError: If ``df`` lacks the ``pH`` or ``turbidity`` column.
+    """
+    # Work on a copy so the caller's frame is never mutated as a side effect.
+    cleaned = df.copy()
+
+    # Convert to numeric; unparseable entries become NaN.
+    cleaned["pH"] = pd.to_numeric(cleaned["pH"], errors="coerce")
+    cleaned["turbidity"] = pd.to_numeric(cleaned["turbidity"], errors="coerce")
+
+    # Out-of-range readings are treated the same as missing ones.
+    cleaned.loc[(cleaned["pH"] <= 0) | (cleaned["pH"] > 14), "pH"] = np.nan
+    cleaned.loc[cleaned["turbidity"] < 0, "turbidity"] = np.nan
+
+    # Drop rows with NaNs in critical columns.
+    initial_len = len(cleaned)
+    cleaned = cleaned.dropna(subset=["pH", "turbidity"])
+
+    print(f"✅ Cleaned sensor data: {initial_len - len(cleaned)} invalid rows removed.")
+
+    return cleaned
+
 
-    Returns:
-        pd.DataFrame: Cleaned data.
-    """
diff --git a/src/evaluate.py b/src/evaluate.py
@@ -1,9 +1,59 @@
+import pandas as pd
+
 class WaterQualityEvaluator:
+    """Classify water-quality readings against pH and turbidity limits.
+
+    Attributes:
+        ph_min: Lowest pH considered safe (inclusive).
+        ph_max: Highest pH considered safe (inclusive).
+        turbidity_threshold: Highest turbidity considered safe (inclusive).
+    """
+
     def __init__(self, ph_range=(6.5, 8.5), turbidity_threshold=1.0):
-        self.ph_range = ph_range
+        """Store the safety limits.
+
+        Args:
+            ph_range: ``(min, max)`` pair bounding the safe pH interval.
+            turbidity_threshold: Maximum turbidity still considered safe.
+        """
+        self.ph_min, self.ph_max = ph_range
         self.turbidity_threshold = turbidity_threshold
 
     def is_safe(self, row: pd.Series) -> bool:
         """
         Determine if a row of water data is safe.
+
+        Args:
+            row: A record exposing ``pH`` and ``turbidity`` values.
+
+        Returns:
+            True if both pH and turbidity are within safe limits. A NaN
+            reading fails the comparison, so such a row is reported unsafe.
+        """
+        ph_ok = self.ph_min <= row["pH"] <= self.ph_max
+        turbidity_ok = row["turbidity"] <= self.turbidity_threshold
+        # bool() converts a numpy.bool_ into the plain bool the annotation promises.
+        return bool(ph_ok and turbidity_ok)
+
+    def evaluate(self, df: pd.DataFrame) -> pd.DataFrame:
         """
+        Add 'ph_status', 'turbidity_status', and 'overall_safety' columns.
+
+        Args:
+            df: Readings with numeric ``pH`` and ``turbidity`` columns.
+
+        Returns:
+            A copy of ``df`` with the three columns added; ``df`` is not modified.
+        """
+        result = df.copy()
+
+        # Vectorised comparisons replace per-row ``apply`` calls. NaN compares as
+        # False, so missing readings are flagged instead of passing silently.
+        ph_ok = result["pH"].between(self.ph_min, self.ph_max)
+        turbidity_ok = result["turbidity"] <= self.turbidity_threshold
+
+        result["ph_status"] = ph_ok.map({True: "✅ Normal", False: "⚠️ Out of Range"})
+        result["turbidity_status"] = turbidity_ok.map({True: "✅ Normal", False: "⚠️ High Turbidity"})
+        result["overall_safety"] = ph_ok & turbidity_ok
+
+        return result
diff --git a/src/load_data.py b/src/load_data.py
@@ -1,10 +1,23 @@
-def load_csv(filepath: str) -> pd.DataFrame:
-    """
-    Load sensor data from a CSV file.
+import pandas as pd
 
-    Args:
-        filepath (str): Path to the CSV file.
+# NOTE(review): machine-specific absolute path that load_csv never reads (its
+# ``filepath`` parameter shadows this name). Kept only in case other code
+# imports it; prefer passing the path in explicitly.
+filepath = r"C:\Users\USER\Desktop\codebook\alt_class\water_quality_monitoring\data\sensor_data.csv"
 
-    Returns:
-        pd.DataFrame: Loaded data as a pandas DataFrame.
-    """
+
+def load_csv(filepath: str, timestamp_col: str = "timestamp") -> pd.DataFrame:
+    """Load sensor data from a CSV file.
+
+    Args:
+        filepath: Path to the CSV file.
+        timestamp_col: Name of the column pandas should parse as datetime.
+
+    Returns:
+        The file's contents as a pandas DataFrame.
+
+    Raises:
+        FileNotFoundError: If ``filepath`` does not exist.
+        ValueError: If ``timestamp_col`` is not a column of the file.
+    """
+    return pd.read_csv(filepath, parse_dates=[timestamp_col])
diff --git a/src/main.py b/src/main.py
@@ -0,0 +1,38 @@
+"""Run the water-quality pipeline: load, clean, evaluate and save."""
+
+import sys
+
+from load_data import load_csv
+from clean_data import clean_sensor_data
+from evaluate import WaterQualityEvaluator
+
+# Defaults reproduce the original hard-coded locations; both can be overridden
+# on the command line: main.py [input_csv [output_csv]].
+DEFAULT_INPUT = r"C:\Users\USER\Desktop\codebook\alt_class\water_quality_monitoring\data\sensor_data.csv"
+DEFAULT_OUTPUT = "processed_data.csv"
+
+
+def main(file_path: str = DEFAULT_INPUT, output_path: str = DEFAULT_OUTPUT) -> None:
+    """Load the sensor CSV, clean it, evaluate it and write the result.
+
+    Args:
+        file_path: CSV file of raw sensor readings.
+        output_path: Destination CSV for the evaluated readings.
+    """
+    # Load and clean
+    raw_df = load_csv(file_path)
+    print(raw_df.columns)
+    df_cleaned = clean_sensor_data(raw_df)
+
+    # Evaluate
+    evaluator = WaterQualityEvaluator(ph_range=(6.5, 8.5), turbidity_threshold=1.0)
+    evaluated_df = evaluator.evaluate(df_cleaned)
+
+    # Save
+    evaluated_df.to_csv(output_path, index=False)
+    print(f"✅ Water quality evaluation complete and saved to '{output_path}'.")
+
+
+if __name__ == "__main__":
+    # Only run the pipeline when executed as a script, not on import.
+    main(*sys.argv[1:3])
diff --git a/src/processed_data.csv b/src/processed_data.csv