Skip to content

Commit 0615283

Browse files
committed
added a DataCleaner
1 parent 92195d4 commit 0615283

1 file changed

Lines changed: 90 additions & 0 deletions

File tree

src/data_cleaner.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
import pandas as pd
2+
import numpy as np
3+
4+
class DataCleaner:
    """
    General-purpose cleaner for multiple WORC datasets
    (Employment, Enrollments, Demographics).

    Every cleaning step is best-effort: it is skipped when its target column
    is absent, and any exception raised by pandas is caught and reported with
    a ``[Warning]`` line on stdout instead of propagating. No step drops rows.

    All cleaning methods return ``self`` so calls can be chained; call
    :meth:`finalize` to obtain the cleaned frame.
    """

    def __init__(self, df: pd.DataFrame):
        """Store a copy of *df* so the caller's frame is never mutated."""
        self.df = df.copy()

    def safe_drop_columns(self, cols_to_drop) -> "DataCleaner":
        """Drop columns if they exist, otherwise ignore.

        Args:
            cols_to_drop: Column label(s) forwarded to ``DataFrame.drop``.

        Returns:
            This cleaner, for chaining.
        """
        try:
            # errors='ignore' makes labels that are not present a no-op.
            self.df = self.df.drop(columns=cols_to_drop, errors='ignore')
        except Exception as e:
            print(f"[Warning] Failed dropping columns: {e}")
        return self

    def safe_fillna(self, fill_map: dict) -> "DataCleaner":
        """Fill NaN values for specific columns safely.

        Args:
            fill_map: Mapping of column name to the value used to fill that
                column's missing entries. Columns not in the frame are
                skipped.

        Returns:
            This cleaner, for chaining.
        """
        for col, val in fill_map.items():
            # try/except is per column so one failure does not stop the rest.
            try:
                if col in self.df.columns:
                    self.df[col] = self.df[col].fillna(val)
            except Exception as e:
                print(f"[Warning] Failed filling NaN for {col}: {e}")
        return self

    def safe_replace(self, col, replacements: dict) -> "DataCleaner":
        """Replace values in a column safely.

        Args:
            col: Column to operate on; skipped if not in the frame.
            replacements: Mapping passed to ``Series.replace``.

        Returns:
            This cleaner, for chaining.
        """
        try:
            if col in self.df.columns:
                self.df[col] = self.df[col].replace(replacements)
        except Exception as e:
            print(f"[Warning] Failed replacing values in {col}: {e}")
        return self

    def safe_convert_dtype(self, col, dtype, errors="ignore") -> "DataCleaner":
        """Convert column dtype safely.

        Args:
            col: Column to convert; skipped if not in the frame.
            dtype: Target dtype. If ``str(dtype)`` contains ``"datetime"``
                the column is parsed with ``pd.to_datetime``; otherwise
                ``Series.astype`` is used.
            errors: Forwarded to ``Series.astype`` only. The datetime branch
                always uses ``errors="coerce"`` (unparseable values become
                NaT) regardless of this argument.

        Returns:
            This cleaner, for chaining.
        """
        try:
            if col in self.df.columns:
                if "datetime" in str(dtype):
                    self.df[col] = pd.to_datetime(self.df[col], errors="coerce")
                else:
                    self.df[col] = self.df[col].astype(dtype, errors=errors)
        except Exception as e:
            print(f"[Warning] Failed dtype conversion on {col}: {e}")
        return self

    def normalize_gender(self) -> "DataCleaner":
        """Unify transgender categories safely.

        Maps ``"Transgender male to female"`` and
        ``"Transgender female to male"`` in the ``Gender`` column to
        ``"Transgender"``. Does nothing if the column is absent.

        Returns:
            This cleaner, for chaining.
        """
        try:
            if "Gender" in self.df.columns:
                self.df["Gender"] = self.df["Gender"].replace({
                    "Transgender male to female": "Transgender",
                    "Transgender female to male": "Transgender",
                })
        except Exception as e:
            print(f"[Warning] Failed gender normalization: {e}")
        return self

    def split_race(self) -> "DataCleaner":
        """Split Race column into Race_1, Race_2, etc., if it exists.

        Values are split on ``";"``. The original ``Race`` column is removed
        and the new columns are appended at the end of the frame. Rows whose
        ``Race`` is missing stay missing in every ``Race_N`` column.

        Returns:
            This cleaner, for chaining.
        """
        try:
            if "Race" in self.df.columns:
                race = self.df["Race"]
                # Record missing rows BEFORE the str cast: astype(str) would
                # otherwise turn NaN/None into the literal text "nan".
                missing = race.isna().to_numpy()
                parts = race.astype(str).str.split(";", expand=True)
                parts.columns = [f"Race_{i + 1}" for i in range(parts.shape[1])]
                if missing.any():
                    # Positional boolean mask, so duplicate index labels are safe.
                    parts.loc[missing, :] = np.nan
                self.df = pd.concat(
                    [self.df.drop(columns=["Race"]), parts], axis=1
                )
        except Exception as e:
            print(f"[Warning] Failed race splitting: {e}")
        return self

    def clean_salary(self) -> "DataCleaner":
        """Fix salary inconsistencies.

        Coerces ``Salary`` to numeric (unparseable values become NaN), then
        replaces the exact value 60000 with 28.84. Does nothing if the
        column is absent.

        Returns:
            This cleaner, for chaining.
        """
        try:
            if "Salary" in self.df.columns:
                self.df["Salary"] = pd.to_numeric(self.df["Salary"], errors="coerce")
                # NOTE(review): presumably converts a stray annual figure to an
                # hourly rate (60000 / 2080 h ~= 28.85) -- confirm with the
                # data owner before relying on this constant.
                self.df["Salary"] = self.df["Salary"].replace(60000, 28.84)
        except Exception as e:
            print(f"[Warning] Failed salary cleaning: {e}")
        return self

    def finalize(self) -> pd.DataFrame:
        """Return cleaned dataframe.

        The internal frame itself is returned (not a copy).
        """
        return self.df

0 commit comments

Comments
 (0)