Skip to content

Commit 5c5345a

Browse files
authored
Merge branch 'main' into 4-tests-for-cleaning
2 parents e86d969 + 197dfc2 commit 5c5345a

3 files changed

Lines changed: 97 additions & 783 deletions

File tree

src/data_cleaner.py

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
import pandas as pd
2+
3+
4+
class DataCleaner:
    """
    General-purpose cleaner for multiple WORC datasets
    (Employment, Enrollments, Demographics).

    Every cleaning step is best-effort: it does nothing when the column it
    targets is absent, and if pandas raises, the error is printed as a
    ``[Warning]`` line instead of propagating. Rows are never dropped.

    Each step returns ``self`` so calls can be chained; ``finalize()``
    returns the cleaned DataFrame.
    """

    # Default value corrections applied by ``clean_salary``.
    # NOTE(review): 60000 is presumably an annual figure entered in an
    # hourly-wage column (60000 / 2080 is roughly 28.85) - confirm.
    DEFAULT_SALARY_CORRECTIONS = {60000: 28.84}

    def __init__(self, df: pd.DataFrame):
        # Work on a copy so the caller's frame is not modified by the steps.
        self.df = df.copy()

    def safe_drop_columns(self, cols_to_drop):
        """Drop the given columns; names that do not exist are ignored."""
        try:
            self.df = self.df.drop(columns=cols_to_drop, errors='ignore')
        except Exception as e:
            print(f"[Warning] Failed dropping columns: {e}")
        return self

    def safe_fillna(self, fill_map: dict):
        """Fill NaN values column by column.

        ``fill_map`` maps a column name to its fill value. Columns missing
        from the frame are skipped; a failure on one column does not stop
        the remaining columns from being processed.
        """
        for col, val in fill_map.items():
            try:
                if col in self.df.columns:
                    self.df[col] = self.df[col].fillna(val)
            except Exception as e:
                print(f"[Warning] Failed filling NaN for {col}: {e}")
        return self

    def safe_replace(self, col, replacements: dict):
        """Replace values in ``col`` according to ``replacements``.

        Does nothing when ``col`` is not present.
        """
        try:
            if col in self.df.columns:
                self.df[col] = self.df[col].replace(replacements)
        except Exception as e:
            print(f"[Warning] Failed replacing values in {col}: {e}")
        return self

    def safe_convert_dtype(self, col, dtype, errors="ignore"):
        """Convert ``col`` to ``dtype``; no-op when ``col`` is not present.

        When ``"datetime"`` appears in ``str(dtype)`` the column is parsed
        with ``pd.to_datetime(..., errors="coerce")``, so unparseable values
        become ``NaT``; the ``errors`` argument is not used on that path.
        For every other dtype ``errors`` is forwarded to ``Series.astype``.
        """
        try:
            if col in self.df.columns:
                if "datetime" in str(dtype):
                    self.df[col] = pd.to_datetime(
                        self.df[col], errors="coerce")
                else:
                    self.df[col] = self.df[col].astype(dtype, errors=errors)
        except Exception as e:
            print(f"[Warning] Failed dtype conversion on {col}: {e}")
        return self

    def normalize_gender(self):
        """Collapse both transgender categories in ``Gender`` into one."""
        try:
            if "Gender" in self.df.columns:
                self.df["Gender"] = self.df["Gender"].replace({
                    "Transgender male to female": "Transgender",
                    "Transgender female to male": "Transgender",
                })
        except Exception as e:
            print(f"[Warning] Failed gender normalization: {e}")
        return self

    def split_race(self):
        """Split ``Race`` on ``;`` into ``Race_1``, ``Race_2``, ... columns.

        The original ``Race`` column is removed. Whitespace around each
        token is stripped, and rows whose ``Race`` is missing stay missing
        in every ``Race_n`` column. Does nothing when ``Race`` is absent.
        """
        try:
            if "Race" in self.df.columns:
                race = self.df["Race"]
                # Remember the missing rows first: astype(str) can turn
                # them into the literal text "nan"/"None".
                missing = race.isna().to_numpy()

                parts = race.astype(str).str.split(";", expand=True)
                # "White; Asian" should give "Asian", not " Asian".
                parts = parts.apply(lambda column: column.str.strip())
                if missing.any():
                    parts.loc[missing, :] = float("nan")

                parts.columns = [
                    f"Race_{n}" for n in range(1, parts.shape[1] + 1)]
                self.df = pd.concat(
                    [self.df.drop(columns=["Race"]), parts], axis=1)
        except Exception as e:
            print(f"[Warning] Failed race splitting: {e}")
        return self

    def clean_salary(self, corrections=None):
        """Make ``Salary`` numeric and apply known value corrections.

        Values that cannot be parsed as numbers become NaN. Afterwards each
        key of ``corrections`` found in the column is replaced by its value.

        Args:
            corrections: optional ``{wrong_value: right_value}`` mapping.
                Defaults to ``DEFAULT_SALARY_CORRECTIONS``; pass an empty
                dict to skip the replacement step.
        """
        if corrections is None:
            corrections = self.DEFAULT_SALARY_CORRECTIONS
        try:
            if "Salary" in self.df.columns:
                self.df["Salary"] = pd.to_numeric(
                    self.df["Salary"], errors="coerce")
                if corrections:
                    self.df["Salary"] = self.df["Salary"].replace(
                        dict(corrections))
        except Exception as e:
            print(f"[Warning] Failed salary cleaning: {e}")
        return self

    def finalize(self):
        """Return the cleaned DataFrame."""
        return self.df

0 commit comments

Comments
 (0)