added cleaner to mainnb

dmorton714 · dmorton714 · commit 2c09841ca9b2 · 2025-08-20T09:57:54.000-04:00
diff --git a/src/data_cleaner.py b/src/data_cleaner.py
@@ -1,11 +1,11 @@
 import pandas as pd
-import numpy as np
+
 
 class DataCleaner:
     """
     General-purpose cleaner for multiple WORC datasets
     (Employment, Enrollments, Demographics).
-    
+
     Uses try/except for safety (does not break if col missing).
     Keeps all rows (no drops), but fills/fixes when possible.
     """
@@ -45,7 +45,8 @@ def safe_convert_dtype(self, col, dtype, errors="ignore"):
         try:
             if col in self.df.columns:
                 if "datetime" in str(dtype):
-                    self.df[col] = pd.to_datetime(self.df[col], errors="coerce")
+                    self.df[col] = pd.to_datetime(
+                        self.df[col], errors="coerce")
                 else:
                     self.df[col] = self.df[col].astype(dtype, errors=errors)
         except Exception as e:
@@ -68,9 +69,12 @@ def split_race(self):
         """Split Race column into Race_1, Race_2, etc., if it exists."""
         try:
             if "Race" in self.df.columns:
-                splitting = self.df["Race"].astype(str).str.split(";", expand=True)
-                splitting.columns = [f"Race_{i+1}" for i in range(splitting.shape[1])]
-                self.df = pd.concat([self.df.drop(columns=["Race"]), splitting], axis=1)
+                splitting = self.df["Race"].astype(
+                    str).str.split(";", expand=True)
+                splitting.columns = [
+                    f"Race_{i+1}" for i in range(splitting.shape[1])]
+                self.df = pd.concat(
+                    [self.df.drop(columns=["Race"]), splitting], axis=1)
         except Exception as e:
             print(f"[Warning] Failed race splitting: {e}")
         return self
@@ -79,7 +83,8 @@ def clean_salary(self):
         """Fix salary inconsistencies."""
         try:
             if "Salary" in self.df.columns:
-                self.df["Salary"] = pd.to_numeric(self.df["Salary"], errors="coerce")
+                self.df["Salary"] = pd.to_numeric(
+                    self.df["Salary"], errors="coerce")
                 self.df["Salary"] = self.df["Salary"].replace(60000, 28.84)
         except Exception as e:
             print(f"[Warning] Failed salary cleaning: {e}")
diff --git a/src/notebooks/mainNb.ipynb b/src/notebooks/mainNb.ipynb
@@ -6,23 +6,24 @@
    "metadata": {},
    "source": [
     "### **Table of Contents**\n",
-    "  * [read in data](#read-in-data)\n",
+    "    * [**Table of Contents**](#table-of-contents)\n",
+    "  * [Function To Read in the Data!](#function-to-read-in-the-data)\n",
+    "  * [Example usage](#example-usage)\n",
+    "      * [To Access a DataFrame in the list](#to-access-a-dataframe-in-the-list)\n",
+    "      * [To Remove Spaces in DataFrame name](#to-remove-spaces-in-dataframe-name)\n",
     "  * [Update cleaning code](#update-cleaning-code)\n",
     "  * [Generate report](#generate-report)\n",
     "  * [Plots](#plots)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "id": "d11a2343",
    "metadata": {},
    "outputs": [],
    "source": [
     "import pandas as pd\n",
-    "# import matplotlib.pyplot as plt\n",
-    "# import plotly.express as px\n",
-    "# import dash\n",
     "from typing import Dict, Union\n",
     "from pathlib import Path\n",
     "import os\n",
@@ -34,13 +35,7 @@
    "id": "0764cac1",
    "metadata": {},
    "source": [
-    "## read in data\n",
-    "Psudo code:\n",
-    "- read in all the files in the data folder \n",
-    "  - accounting for them being in xlsx or csv \n",
-    "- dataframe variable name should end up being file name minus extension\n",
-    "\n",
-    "- This allows us to just drop in any export with any name and it should run. "
+    "## Function To Read in the Data! "
    ]
   },
   {
@@ -109,7 +104,40 @@
    "id": "714769cf",
    "metadata": {},
    "source": [
-    "how to call the function and display the names of each DF "
+    "## Example usage \n",
+    "\n",
+    "```python \n",
+    "dfs = load_data_folder()\n",
+    "dfs.keys()\n",
+    "```\n",
+    "output:\n",
+    "```text\n",
+    "dict_keys(['ARC_Enrollments', 'ARC_Application', 'All_demographics_and_programs'])\n",
+    "```\n",
+    "#### To Access a DataFrame in the list \n",
+    "\n",
+    "```python\n",
+    "all_demo = dfs['All_demographics_and_programs']\n",
+    "all_demo.head()\n",
+    "```\n",
+    "\n",
+    "output:\n",
+    "|col 1|col 2|col 3|\n",
+    "|:--:|:--:|:--:|\n",
+    "|3.14|name|apple|\n",
+    "|3.14|name|apple|\n",
+    "|3.14|name|apple|\n",
+    "|3.14|name|apple|\n",
+    "|3.14|name|apple|\n",
+    "\n",
+    "\n",
+    "#### To Remove Spaces in DataFrame name\n",
+    "\n",
+    "```python \n",
+    "for name, df in dfs.items():\n",
+    "    safe_name = name.replace(\" \", \"_\")\n",
+    "    globals()[safe_name] = df\n",
+    "```"
    ]
   },
   {
@@ -306,13 +334,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": null,
    "id": "c3c755a4",
    "metadata": {},
    "outputs": [],
    "source": [
     "for name, df in dfs.items():\n",
-    "    safe_name = name.replace(\" \", \"_\")  # replace spaces with _\n",
+    "    safe_name = name.replace(\" \", \"_\")\n",
     "    globals()[safe_name] = df"
    ]
   },
@@ -594,19 +622,113 @@
     "- Ideally we will not drop anything from our data \n"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "29302c63",
+   "metadata": {},
+   "source": [
+    "TODO: add usage examples for the `DataCleaner` class defined in the next cell."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "id": "749ae60a",
    "metadata": {},
    "outputs": [],
    "source": [
-    "'''\n",
-    "See the functions in files:\n",
-    "- src/Carmen_WORCEmployment.py\n",
-    "- src/cleaning_enrollments_data.py\n",
-    "- src/cleaning.py\n",
-    "'''"
+    "class DataCleaner:\n",
+    "    \"\"\"\n",
+    "    General-purpose cleaner for multiple WORC datasets\n",
+    "    (Employment, Enrollments, Demographics).\n",
+    "\n",
+    "    Uses try/except for safety (does not break if col missing).\n",
+    "    Keeps all rows (no drops), but fills/fixes when possible.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, df: pd.DataFrame):\n",
+    "        self.df = df.copy()\n",
+    "\n",
+    "    def safe_drop_columns(self, cols_to_drop):\n",
+    "        \"\"\"Drop columns if they exist, otherwise ignore.\"\"\"\n",
+    "        try:\n",
+    "            self.df = self.df.drop(columns=cols_to_drop, errors='ignore')\n",
+    "        except Exception as e:\n",
+    "            print(f\"[Warning] Failed dropping columns: {e}\")\n",
+    "        return self\n",
+    "\n",
+    "    def safe_fillna(self, fill_map: dict):\n",
+    "        \"\"\"Fill NaN values for specific columns safely.\"\"\"\n",
+    "        for col, val in fill_map.items():\n",
+    "            try:\n",
+    "                if col in self.df.columns:\n",
+    "                    self.df[col] = self.df[col].fillna(val)\n",
+    "            except Exception as e:\n",
+    "                print(f\"[Warning] Failed filling NaN for {col}: {e}\")\n",
+    "        return self\n",
+    "\n",
+    "    def safe_replace(self, col, replacements: dict):\n",
+    "        \"\"\"Replace values in a column safely.\"\"\"\n",
+    "        try:\n",
+    "            if col in self.df.columns:\n",
+    "                self.df[col] = self.df[col].replace(replacements)\n",
+    "        except Exception as e:\n",
+    "            print(f\"[Warning] Failed replacing values in {col}: {e}\")\n",
+    "        return self\n",
+    "\n",
+    "    def safe_convert_dtype(self, col, dtype, errors=\"ignore\"):\n",
+    "        \"\"\"Convert column dtype safely.\"\"\"\n",
+    "        try:\n",
+    "            if col in self.df.columns:\n",
+    "                if \"datetime\" in str(dtype):\n",
+    "                    self.df[col] = pd.to_datetime(\n",
+    "                        self.df[col], errors=\"coerce\")\n",
+    "                else:\n",
+    "                    self.df[col] = self.df[col].astype(dtype, errors=errors)\n",
+    "        except Exception as e:\n",
+    "            print(f\"[Warning] Failed dtype conversion on {col}: {e}\")\n",
+    "        return self\n",
+    "\n",
+    "    def normalize_gender(self):\n",
+    "        \"\"\"Unify transgender categories safely.\"\"\"\n",
+    "        try:\n",
+    "            if \"Gender\" in self.df.columns:\n",
+    "                self.df[\"Gender\"] = self.df[\"Gender\"].replace({\n",
+    "                    \"Transgender male to female\": \"Transgender\",\n",
+    "                    \"Transgender female to male\": \"Transgender\"\n",
+    "                })\n",
+    "        except Exception as e:\n",
+    "            print(f\"[Warning] Failed gender normalization: {e}\")\n",
+    "        return self\n",
+    "\n",
+    "    def split_race(self):\n",
+    "        \"\"\"Split Race column into Race_1, Race_2, etc., if it exists.\"\"\"\n",
+    "        try:\n",
+    "            if \"Race\" in self.df.columns:\n",
+    "                splitting = self.df[\"Race\"].astype(\n",
+    "                    str).str.split(\";\", expand=True)\n",
+    "                splitting.columns = [\n",
+    "                    f\"Race_{i+1}\" for i in range(splitting.shape[1])]\n",
+    "                self.df = pd.concat(\n",
+    "                    [self.df.drop(columns=[\"Race\"]), splitting], axis=1)\n",
+    "        except Exception as e:\n",
+    "            print(f\"[Warning] Failed race splitting: {e}\")\n",
+    "        return self\n",
+    "\n",
+    "    def clean_salary(self):\n",
+    "        \"\"\"Fix salary inconsistencies.\"\"\"\n",
+    "        try:\n",
+    "            if \"Salary\" in self.df.columns:\n",
+    "                self.df[\"Salary\"] = pd.to_numeric(\n",
+    "                    self.df[\"Salary\"], errors=\"coerce\")\n",
+    "                self.df[\"Salary\"] = self.df[\"Salary\"].replace(60000, 28.84)\n",
+    "        except Exception as e:\n",
+    "            print(f\"[Warning] Failed salary cleaning: {e}\")\n",
+    "        return self\n",
+    "\n",
+    "    def finalize(self):\n",
+    "        \"\"\"Return cleaned dataframe.\"\"\"\n",
+    "        return self.df"
    ]
   },
   {
@@ -702,7 +824,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "d4fc7116",
    "metadata": {},
    "outputs": [
@@ -714,7 +836,11 @@
       "--- ✅ Copy the Markdown below and paste it into a new markdown cell ---\n",
       "\n",
       "### **Table of Contents**\n",
-      "  * [read in data](#read-in-data)\n",
+      "    * [**Table of Contents**](#**table-of-contents**)\n",
+      "  * [Function To Read in the Data!](#function-to-read-in-the-data!)\n",
+      "  * [Example usage](#example-usage)\n",
+      "      * [To Access a DataFrame in the list](#to-access-a-dataframe-in-the-list)\n",
+      "      * [To Remove Spaces in DataFrame name](#to-remove-spaces-in-dataframe-name)\n",
       "  * [Update cleaning code](#update-cleaning-code)\n",
       "  * [Generate report](#generate-report)\n",
       "  * [Plots](#plots)\n",
@@ -754,14 +880,14 @@
     "    print(toc_markdown)\n",
     "\n",
     "\n",
-    "notebook_path = 'ideal.ipynb'\n",
+    "notebook_path = 'mainNb.ipynb'\n",
     "generate_toc_from_notebook(notebook_path)\n"
    ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "venv (3.12.2)",
+   "display_name": ".venv",
    "language": "python",
    "name": "python3"
   },
@@ -775,7 +901,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.2"
+   "version": "3.13.1"
   }
  },
  "nbformat": 4,