Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added src/__pycache__/clean_data.cpython-312.pyc
Binary file not shown.
Binary file added src/__pycache__/evaluate.cpython-312.pyc
Binary file not shown.
Binary file added src/__pycache__/load_data.cpython-312.pyc
Binary file not shown.
26 changes: 21 additions & 5 deletions src/clean_data.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,23 @@
import pandas as pd
from load_data import load_csv
import numpy as np

def clean_sensor_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean sensor data by handling missing or invalid values.

    The ``pH`` and ``turbidity`` columns are coerced to numeric (anything
    unparseable becomes NaN), readings outside the accepted ranges are
    blanked out, and every row left without a valid value in either column
    is dropped. A summary of how many rows were removed is printed.

    Accepted ranges:
        * pH: ``0 < pH <= 14``
        * turbidity: ``turbidity >= 0``

    Args:
        df (pd.DataFrame): Raw sensor readings containing ``pH`` and
            ``turbidity`` columns. The frame passed in is not modified.

    Returns:
        pd.DataFrame: Cleaned data.

    Raises:
        KeyError: If ``pH`` or ``turbidity`` is missing from ``df``.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    # Convert to numeric, set invalids to NaN
    df["pH"] = pd.to_numeric(df["pH"], errors="coerce")
    df["turbidity"] = pd.to_numeric(df["turbidity"], errors="coerce")

    # Replace invalid values with NaN. `where` keeps values that satisfy the
    # condition and returns a new (float-capable) Series, so this also works
    # when a column was parsed as integers.
    df["pH"] = df["pH"].where((df["pH"] > 0) & (df["pH"] <= 14))
    df["turbidity"] = df["turbidity"].where(df["turbidity"] >= 0)

    # Drop rows with NaNs in critical columns
    initial_len = len(df)
    df = df.dropna(subset=["pH", "turbidity"])
    cleaned_len = len(df)

    print(f"✅ Cleaned sensor data: {initial_len - cleaned_len} invalid rows removed.")

    return df
20 changes: 19 additions & 1 deletion src/evaluate.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,27 @@
import pandas as pd

class WaterQualityEvaluator:
    """
    Classify water-quality readings against pH and turbidity limits.

    Args:
        ph_range: ``(minimum, maximum)`` pair; a pH reading is normal when it
            lies within this range, both ends included.
        turbidity_threshold: Highest turbidity value still considered normal.
    """

    def __init__(self, ph_range=(6.5, 8.5), turbidity_threshold=1.0):
        self.ph_range = ph_range
        self.ph_min, self.ph_max = ph_range
        self.turbidity_threshold = turbidity_threshold

    def is_safe(self, row: pd.Series) -> bool:
        """
        Determine if a row of water data is safe.

        Args:
            row: A record exposing ``'pH'`` and ``'turbidity'`` entries.

        Returns:
            True if both pH and turbidity are within safe limits. A NaN in
            either field compares as out of limits, so the result is False.
        """
        ph = row["pH"]
        turb = row["turbidity"]
        # Comparisons on NumPy scalars yield numpy.bool_; convert so the
        # declared return type holds.
        return bool(
            self.ph_min <= ph <= self.ph_max and turb <= self.turbidity_threshold
        )

    def evaluate(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Adds 'ph_status', 'turbidity_status', and 'overall_safety' columns to the DataFrame.

        The limits applied are the same ones ``is_safe`` checks, evaluated
        column-wise instead of row by row.

        Args:
            df: Readings with ``pH`` and ``turbidity`` columns. The frame
                passed in is not modified.

        Returns:
            A copy of ``df`` with the three extra columns; ``overall_safety``
            is boolean.
        """
        df = df.copy()

        # Column-wise masks; NaN readings compare False and are therefore
        # reported as out of range / high turbidity and not safe.
        ph_ok = (df["pH"] >= self.ph_min) & (df["pH"] <= self.ph_max)
        turbidity_ok = df["turbidity"] <= self.turbidity_threshold

        df["ph_status"] = ph_ok.map({True: "✅ Normal", False: "⚠️ Out of Range"})
        df["turbidity_status"] = turbidity_ok.map(
            {True: "✅ Normal", False: "⚠️ High Turbidity"}
        )
        df["overall_safety"] = ph_ok & turbidity_ok

        return df
14 changes: 6 additions & 8 deletions src/load_data.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
import pandas as pd

# NOTE(review): machine-specific absolute path kept so existing references to
# this module-level name keep working; prefer passing an explicit path to
# load_csv instead of relying on it.
filepath = r"C:\Users\USER\Desktop\codebook\alt_class\water_quality_monitoring\data\sensor_data.csv"


def load_csv(filepath: str) -> pd.DataFrame:
    """
    Load sensor data from a CSV file.

    The ``timestamp`` column is parsed as datetime while reading.

    Args:
        filepath (str): Path to the CSV file.

    Returns:
        pd.DataFrame: Loaded data as a pandas DataFrame.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
        ValueError: If the file has no ``timestamp`` column.
    """
    # Load the CSV file and parse the timestamp column as datetime
    return pd.read_csv(filepath, parse_dates=["timestamp"])
17 changes: 17 additions & 0 deletions src/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import sys

from load_data import load_csv
from clean_data import clean_sensor_data
from evaluate import WaterQualityEvaluator

# Default input used when no path is given on the command line.
# NOTE(review): machine-specific absolute path; pass the CSV path as the first
# command-line argument on any other machine.
file_path = r"C:\Users\USER\Desktop\codebook\alt_class\water_quality_monitoring\data\sensor_data.csv"


def main(argv=None):
    """
    Run the pipeline: load the sensor CSV, clean it, evaluate it, save it.

    Args:
        argv: Command-line arguments without the program name. Defaults to
            ``sys.argv[1:]``. The first argument, when present, is used as
            the input CSV path instead of the module-level default.

    Side effects:
        Prints the loaded column index and a completion message, and writes
        the evaluated data to ``processed_data.csv`` in the current working
        directory.
    """
    if argv is None:
        argv = sys.argv[1:]
    input_path = argv[0] if argv else file_path

    # Load and clean
    raw_df = load_csv(input_path)
    print(raw_df.columns)
    df_cleaned = clean_sensor_data(raw_df)

    # Evaluate
    evaluator = WaterQualityEvaluator(ph_range=(6.5, 8.5), turbidity_threshold=1.0)
    evaluated_df = evaluator.evaluate(df_cleaned)

    # Save
    evaluated_df.to_csv("processed_data.csv", index=False)
    print("✅ Water quality evaluation complete and saved to 'processed_data.csv'.")


# Guard the entry point so importing this module does not run the pipeline.
if __name__ == "__main__":
    main()
1,000 changes: 1,000 additions & 0 deletions src/processed_data.csv

Large diffs are not rendered by default.