Skip to content

Commit 2c09841

Browse files
committed
added cleaner to mainnb
1 parent 8399214 commit 2c09841

2 files changed

Lines changed: 164 additions & 33 deletions

File tree

src/data_cleaner.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
import pandas as pd
2-
import numpy as np
2+
33

44
class DataCleaner:
55
"""
66
General-purpose cleaner for multiple WORC datasets
77
(Employment, Enrollments, Demographics).
8-
8+
99
Uses try/except for safety (does not break if col missing).
1010
Keeps all rows (no drops), but fills/fixes when possible.
1111
"""
@@ -45,7 +45,8 @@ def safe_convert_dtype(self, col, dtype, errors="ignore"):
4545
try:
4646
if col in self.df.columns:
4747
if "datetime" in str(dtype):
48-
self.df[col] = pd.to_datetime(self.df[col], errors="coerce")
48+
self.df[col] = pd.to_datetime(
49+
self.df[col], errors="coerce")
4950
else:
5051
self.df[col] = self.df[col].astype(dtype, errors=errors)
5152
except Exception as e:
@@ -68,9 +69,12 @@ def split_race(self):
6869
"""Split Race column into Race_1, Race_2, etc., if it exists."""
6970
try:
7071
if "Race" in self.df.columns:
71-
splitting = self.df["Race"].astype(str).str.split(";", expand=True)
72-
splitting.columns = [f"Race_{i+1}" for i in range(splitting.shape[1])]
73-
self.df = pd.concat([self.df.drop(columns=["Race"]), splitting], axis=1)
72+
splitting = self.df["Race"].astype(
73+
str).str.split(";", expand=True)
74+
splitting.columns = [
75+
f"Race_{i+1}" for i in range(splitting.shape[1])]
76+
self.df = pd.concat(
77+
[self.df.drop(columns=["Race"]), splitting], axis=1)
7478
except Exception as e:
7579
print(f"[Warning] Failed race splitting: {e}")
7680
return self
@@ -79,7 +83,8 @@ def clean_salary(self):
7983
"""Fix salary inconsistencies."""
8084
try:
8185
if "Salary" in self.df.columns:
82-
self.df["Salary"] = pd.to_numeric(self.df["Salary"], errors="coerce")
86+
self.df["Salary"] = pd.to_numeric(
87+
self.df["Salary"], errors="coerce")
8388
self.df["Salary"] = self.df["Salary"].replace(60000, 28.84)
8489
except Exception as e:
8590
print(f"[Warning] Failed salary cleaning: {e}")
Lines changed: 152 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -6,23 +6,24 @@
66
"metadata": {},
77
"source": [
88
"### **Table of Contents**\n",
9-
" * [read in data](#read-in-data)\n",
9+
" * [**Table of Contents**](#**table-of-contents**)\n",
10+
" * [Function To Read in the Data!](#function-to-read-in-the-data!)\n",
11+
" * [Example usage](#example-usage)\n",
12+
" * [To Access a DataFrame in the list](#to-access-a-dataframe-in-the-list)\n",
13+
" * [To Remove Spaces in DataFrame name](#to-remove-spaces-in-dataframe-name)\n",
1014
" * [Update cleaning code](#update-cleaning-code)\n",
1115
" * [Generate report](#generate-report)\n",
1216
" * [Plots](#plots)"
1317
]
1418
},
1519
{
1620
"cell_type": "code",
17-
"execution_count": 5,
21+
"execution_count": null,
1822
"id": "d11a2343",
1923
"metadata": {},
2024
"outputs": [],
2125
"source": [
2226
"import pandas as pd\n",
23-
"# import matplotlib.pyplot as plt\n",
24-
"# import plotly.express as px\n",
25-
"# import dash\n",
2627
"from typing import Dict, Union\n",
2728
"from pathlib import Path\n",
2829
"import os\n",
@@ -34,13 +35,7 @@
3435
"id": "0764cac1",
3536
"metadata": {},
3637
"source": [
37-
"## read in data\n",
38-
"Psudo code:\n",
39-
"- read in all the files in the data folder \n",
40-
" - accounting for them being in xlsx or csv \n",
41-
"- dataframe variable name should end up being file name minus extension\n",
42-
"\n",
43-
"- This allows us to just drop in any export with any name and it should run. "
38+
"## Function To Read in the Data! "
4439
]
4540
},
4641
{
@@ -109,7 +104,40 @@
109104
"id": "714769cf",
110105
"metadata": {},
111106
"source": [
112-
"how to call the function and display the names of each DF "
107+
"## Example usage \n",
108+
"\n",
109+
"```python \n",
110+
"dfs = load_data_folder()\n",
111+
"dfs.keys()\n",
112+
"```\n",
113+
"output:\n",
114+
"```bash\n",
115+
"dict_keys(['ARC_Enrollments', 'ARC_Application', 'All_demographics_and_programs'])\n",
116+
"```\n",
117+
"#### To Access a DataFrame in the list \n",
118+
"\n",
119+
"```python\n",
120+
"all_demo = dfs['All_demographics_and_programs']\n",
121+
"all_demo.head()\n",
122+
"```\n",
123+
"\n",
124+
"output:\n",
125+
"|col 1|col 2|col 3|\n",
126+
"|:--:|:--:|:--:|\n",
127+
"|3.14|name|apple|\n",
128+
"|3.14|name|apple|\n",
129+
"|3.14|name|apple|\n",
130+
"|3.14|name|apple|\n",
131+
"|3.14|name|apple|\n",
132+
"\n",
133+
"\n",
134+
"#### To Remove Spaces in DataFrame name\n",
135+
"\n",
136+
"```python \n",
137+
"for name, df in dfs.items():\n",
138+
" safe_name = name.replace(\" \", \"_\")\n",
139+
" globals()[safe_name] = df\n",
140+
"```"
113141
]
114142
},
115143
{
@@ -306,13 +334,13 @@
306334
},
307335
{
308336
"cell_type": "code",
309-
"execution_count": 20,
337+
"execution_count": null,
310338
"id": "c3c755a4",
311339
"metadata": {},
312340
"outputs": [],
313341
"source": [
314342
"for name, df in dfs.items():\n",
315-
" safe_name = name.replace(\" \", \"_\") # replace spaces with _\n",
343+
" safe_name = name.replace(\" \", \"_\")\n",
316344
" globals()[safe_name] = df"
317345
]
318346
},
@@ -594,19 +622,113 @@
594622
"- Ideally we will not drop anything from our data \n"
595623
]
596624
},
625+
{
626+
"cell_type": "markdown",
627+
"id": "29302c63",
628+
"metadata": {},
629+
"source": [
630+
"Will update this a bit with usage etc... "
631+
]
632+
},
597633
{
598634
"cell_type": "code",
599635
"execution_count": null,
600636
"id": "749ae60a",
601637
"metadata": {},
602638
"outputs": [],
603639
"source": [
604-
"'''\n",
605-
"See the functions in files:\n",
606-
"- src/Carmen_WORCEmployment.py\n",
607-
"- src/cleaning_enrollments_data.py\n",
608-
"- src/cleaning.py\n",
609-
"'''"
640+
"class DataCleaner:\n",
641+
" \"\"\"\n",
642+
" General-purpose cleaner for multiple WORC datasets\n",
643+
" (Employment, Enrollments, Demographics).\n",
644+
"\n",
645+
" Uses try/except for safety (does not break if col missing).\n",
646+
" Keeps all rows (no drops), but fills/fixes when possible.\n",
647+
" \"\"\"\n",
648+
"\n",
649+
" def __init__(self, df: pd.DataFrame):\n",
650+
" self.df = df.copy()\n",
651+
"\n",
652+
" def safe_drop_columns(self, cols_to_drop):\n",
653+
" \"\"\"Drop columns if they exist, otherwise ignore.\"\"\"\n",
654+
" try:\n",
655+
" self.df = self.df.drop(columns=cols_to_drop, errors='ignore')\n",
656+
" except Exception as e:\n",
657+
" print(f\"[Warning] Failed dropping columns: {e}\")\n",
658+
" return self\n",
659+
"\n",
660+
" def safe_fillna(self, fill_map: dict):\n",
661+
" \"\"\"Fill NaN values for specific columns safely.\"\"\"\n",
662+
" for col, val in fill_map.items():\n",
663+
" try:\n",
664+
" if col in self.df.columns:\n",
665+
" self.df[col] = self.df[col].fillna(val)\n",
666+
" except Exception as e:\n",
667+
" print(f\"[Warning] Failed filling NaN for {col}: {e}\")\n",
668+
" return self\n",
669+
"\n",
670+
" def safe_replace(self, col, replacements: dict):\n",
671+
" \"\"\"Replace values in a column safely.\"\"\"\n",
672+
" try:\n",
673+
" if col in self.df.columns:\n",
674+
" self.df[col] = self.df[col].replace(replacements)\n",
675+
" except Exception as e:\n",
676+
" print(f\"[Warning] Failed replacing values in {col}: {e}\")\n",
677+
" return self\n",
678+
"\n",
679+
" def safe_convert_dtype(self, col, dtype, errors=\"ignore\"):\n",
680+
" \"\"\"Convert column dtype safely.\"\"\"\n",
681+
" try:\n",
682+
" if col in self.df.columns:\n",
683+
" if \"datetime\" in str(dtype):\n",
684+
" self.df[col] = pd.to_datetime(\n",
685+
" self.df[col], errors=\"coerce\")\n",
686+
" else:\n",
687+
" self.df[col] = self.df[col].astype(dtype, errors=errors)\n",
688+
" except Exception as e:\n",
689+
" print(f\"[Warning] Failed dtype conversion on {col}: {e}\")\n",
690+
" return self\n",
691+
"\n",
692+
" def normalize_gender(self):\n",
693+
" \"\"\"Unify transgender categories safely.\"\"\"\n",
694+
" try:\n",
695+
" if \"Gender\" in self.df.columns:\n",
696+
" self.df[\"Gender\"] = self.df[\"Gender\"].replace({\n",
697+
" \"Transgender male to female\": \"Transgender\",\n",
698+
" \"Transgender female to male\": \"Transgender\"\n",
699+
" })\n",
700+
" except Exception as e:\n",
701+
" print(f\"[Warning] Failed gender normalization: {e}\")\n",
702+
" return self\n",
703+
"\n",
704+
" def split_race(self):\n",
705+
" \"\"\"Split Race column into Race_1, Race_2, etc., if it exists.\"\"\"\n",
706+
" try:\n",
707+
" if \"Race\" in self.df.columns:\n",
708+
" splitting = self.df[\"Race\"].astype(\n",
709+
" str).str.split(\";\", expand=True)\n",
710+
" splitting.columns = [\n",
711+
" f\"Race_{i+1}\" for i in range(splitting.shape[1])]\n",
712+
" self.df = pd.concat(\n",
713+
" [self.df.drop(columns=[\"Race\"]), splitting], axis=1)\n",
714+
" except Exception as e:\n",
715+
" print(f\"[Warning] Failed race splitting: {e}\")\n",
716+
" return self\n",
717+
"\n",
718+
" def clean_salary(self):\n",
719+
" \"\"\"Fix salary inconsistencies.\"\"\"\n",
720+
" try:\n",
721+
" if \"Salary\" in self.df.columns:\n",
722+
" self.df[\"Salary\"] = pd.to_numeric(\n",
723+
" self.df[\"Salary\"], errors=\"coerce\")\n",
724+
" self.df[\"Salary\"] = self.df[\"Salary\"].replace(60000, 28.84)\n",
725+
" except Exception as e:\n",
726+
" print(f\"[Warning] Failed salary cleaning: {e}\")\n",
727+
" return self\n",
728+
"\n",
729+
" def finalize(self):\n",
730+
" \"\"\"Return cleaned dataframe.\"\"\"\n",
731+
" return self.df"
610732
]
611733
},
612734
{
@@ -702,7 +824,7 @@
702824
},
703825
{
704826
"cell_type": "code",
705-
"execution_count": null,
827+
"execution_count": 1,
706828
"id": "d4fc7116",
707829
"metadata": {},
708830
"outputs": [
@@ -714,7 +836,11 @@
714836
"--- ✅ Copy the Markdown below and paste it into a new markdown cell ---\n",
715837
"\n",
716838
"### **Table of Contents**\n",
717-
" * [read in data](#read-in-data)\n",
839+
" * [**Table of Contents**](#**table-of-contents**)\n",
840+
" * [Function To Read in the Data!](#function-to-read-in-the-data!)\n",
841+
" * [Example usage](#example-usage)\n",
842+
" * [To Access a DataFrame in the list](#to-access-a-dataframe-in-the-list)\n",
843+
" * [To Remove Spaces in DataFrame name](#to-remove-spaces-in-dataframe-name)\n",
718844
" * [Update cleaning code](#update-cleaning-code)\n",
719845
" * [Generate report](#generate-report)\n",
720846
" * [Plots](#plots)\n",
@@ -754,14 +880,14 @@
754880
" print(toc_markdown)\n",
755881
"\n",
756882
"\n",
757-
"notebook_path = 'ideal.ipynb'\n",
883+
"notebook_path = 'mainNb.ipynb'\n",
758884
"generate_toc_from_notebook(notebook_path)\n"
759885
]
760886
}
761887
],
762888
"metadata": {
763889
"kernelspec": {
764-
"display_name": "venv (3.12.2)",
890+
"display_name": ".venv",
765891
"language": "python",
766892
"name": "python3"
767893
},
@@ -775,7 +901,7 @@
775901
"name": "python",
776902
"nbconvert_exporter": "python",
777903
"pygments_lexer": "ipython3",
778-
"version": "3.12.2"
904+
"version": "3.13.1"
779905
}
780906
},
781907
"nbformat": 4,

0 commit comments

Comments
 (0)