|
6 | 6 | "metadata": {}, |
7 | 7 | "source": [ |
8 | 8 | "### **Table of Contents**\n", |
9 | | - " * [read in data](#read-in-data)\n", |
| 9 | + " * [**Table of Contents**](#table-of-contents)\n", |
| 10 | + " * [Function To Read in the Data!](#function-to-read-in-the-data)\n", |
| 11 | + " * [Example usage](#example-usage)\n", |
| 12 | + " * [To Access a DataFrame in the list](#to-access-a-dataframe-in-the-list)\n", |
| 13 | + " * [To Remove Spaces in DataFrame name](#to-remove-spaces-in-dataframe-name)\n", |
10 | 14 | " * [Update cleaning code](#update-cleaning-code)\n", |
11 | 15 | " * [Generate report](#generate-report)\n", |
12 | 16 | " * [Plots](#plots)" |
13 | 17 | ] |
14 | 18 | }, |
15 | 19 | { |
16 | 20 | "cell_type": "code", |
17 | | - "execution_count": 5, |
| 21 | + "execution_count": null, |
18 | 22 | "id": "d11a2343", |
19 | 23 | "metadata": {}, |
20 | 24 | "outputs": [], |
21 | 25 | "source": [ |
22 | 26 | "import pandas as pd\n", |
23 | | - "# import matplotlib.pyplot as plt\n", |
24 | | - "# import plotly.express as px\n", |
25 | | - "# import dash\n", |
26 | 27 | "from typing import Dict, Union\n", |
27 | 28 | "from pathlib import Path\n", |
28 | 29 | "import os\n", |
|
34 | 35 | "id": "0764cac1", |
35 | 36 | "metadata": {}, |
36 | 37 | "source": [ |
37 | | - "## read in data\n", |
38 | | - "Psudo code:\n", |
39 | | - "- read in all the files in the data folder \n", |
40 | | - " - accounting for them being in xlsx or csv \n", |
41 | | - "- dataframe variable name should end up being file name minus extension\n", |
42 | | - "\n", |
43 | | - "- This allows us to just drop in any export with any name and it should run. " |
| 38 | + "## Function To Read in the Data! " |
44 | 39 | ] |
45 | 40 | }, |
46 | 41 | { |
|
109 | 104 | "id": "714769cf", |
110 | 105 | "metadata": {}, |
111 | 106 | "source": [ |
112 | | - "how to call the function and display the names of each DF " |
| 107 | + "## Example usage \n", |
| 108 | + "\n", |
| 109 | + "```python \n", |
| 110 | + "dfs = load_data_folder()\n", |
| 111 | + "dfs.keys()\n", |
| 112 | + "```\n", |
| 113 | + "output:\n", |
| 114 | + "```bash\n", |
| 115 | + "dict_keys(['ARC_Enrollments', 'ARC_Application', 'All_demographics_and_programs'])\n", |
| 116 | + "```\n", |
| 117 | + "#### To Access a DataFrame in the list \n", |
| 118 | + "\n", |
| 119 | + "```python\n", |
| 120 | + "all_demo = dfs['All_demographics_and_programs']\n", |
| 121 | + "all_demo.head()\n", |
| 122 | + "```\n", |
| 123 | + "\n", |
| 124 | + "output:\n", |
| 125 | + "|col 1|col 2|col 3|\n", |
| 126 | + "|:--:|:--:|:--:|\n", |
| 127 | + "|3.14|name|apple|\n", |
| 128 | + "|3.14|name|apple|\n", |
| 129 | + "|3.14|name|apple|\n", |
| 130 | + "|3.14|name|apple|\n", |
| 131 | + "|3.14|name|apple|\n", |
| 132 | + "\n", |
| 133 | + "\n", |
| 134 | + "#### To Remove Spaces in DataFrame name\n", |
| 135 | + "\n", |
| 136 | + "```python \n", |
| 137 | + "for name, df in dfs.items():\n", |
| 138 | + " safe_name = name.replace(\" \", \"_\")\n", |
| 139 | + " globals()[safe_name] = df\n", |
| 140 | + "```" |
113 | 141 | ] |
114 | 142 | }, |
115 | 143 | { |
|
306 | 334 | }, |
307 | 335 | { |
308 | 336 | "cell_type": "code", |
309 | | - "execution_count": 20, |
| 337 | + "execution_count": null, |
310 | 338 | "id": "c3c755a4", |
311 | 339 | "metadata": {}, |
312 | 340 | "outputs": [], |
313 | 341 | "source": [ |
314 | 342 | "for name, df in dfs.items():\n", |
315 | | - " safe_name = name.replace(\" \", \"_\") # replace spaces with _\n", |
| 343 | + " safe_name = name.replace(\" \", \"_\")\n", |
316 | 344 | " globals()[safe_name] = df" |
317 | 345 | ] |
318 | 346 | }, |
|
594 | 622 | "- Ideally we will not drop anything from our data \n" |
595 | 623 | ] |
596 | 624 | }, |
| 625 | + { |
| 626 | + "cell_type": "markdown", |
| 627 | + "id": "29302c63", |
| 628 | + "metadata": {}, |
| 629 | + "source": [ |
| 630 | + "Will update this a bit with usage etc... " |
| 631 | + ] |
| 632 | + }, |
597 | 633 | { |
598 | 634 | "cell_type": "code", |
599 | 635 | "execution_count": null, |
600 | 636 | "id": "749ae60a", |
601 | 637 | "metadata": {}, |
602 | 638 | "outputs": [], |
603 | 639 | "source": [ |
604 | | - "'''\n", |
605 | | - "See the functions in files:\n", |
606 | | - "- src/Carmen_WORCEmployment.py\n", |
607 | | - "- src/cleaning_enrollments_data.py\n", |
608 | | - "- src/cleaning.py\n", |
609 | | - "'''" |
| 640 | + "class DataCleaner:\n", |
| 641 | + " \"\"\"\n", |
| 642 | + " General-purpose cleaner for multiple WORC datasets\n", |
| 643 | + " (Employment, Enrollments, Demographics).\n", |
| 644 | + "\n", |
| 645 | + " Uses try/except for safety (does not break if col missing).\n", |
| 646 | + " Keeps all rows (no drops), but fills/fixes when possible.\n", |
| 647 | + " \"\"\"\n", |
| 648 | + "\n", |
| 649 | + " def __init__(self, df: pd.DataFrame):\n", |
| 650 | + " self.df = df.copy()\n", |
| 651 | + "\n", |
| 652 | + " def safe_drop_columns(self, cols_to_drop):\n", |
| 653 | + " \"\"\"Drop columns if they exist, otherwise ignore.\"\"\"\n", |
| 654 | + " try:\n", |
| 655 | + " self.df = self.df.drop(columns=cols_to_drop, errors='ignore')\n", |
| 656 | + " except Exception as e:\n", |
| 657 | + " print(f\"[Warning] Failed dropping columns: {e}\")\n", |
| 658 | + " return self\n", |
| 659 | + "\n", |
| 660 | + " def safe_fillna(self, fill_map: dict):\n", |
| 661 | + " \"\"\"Fill NaN values for specific columns safely.\"\"\"\n", |
| 662 | + " for col, val in fill_map.items():\n", |
| 663 | + " try:\n", |
| 664 | + " if col in self.df.columns:\n", |
| 665 | + " self.df[col] = self.df[col].fillna(val)\n", |
| 666 | + " except Exception as e:\n", |
| 667 | + " print(f\"[Warning] Failed filling NaN for {col}: {e}\")\n", |
| 668 | + " return self\n", |
| 669 | + "\n", |
| 670 | + " def safe_replace(self, col, replacements: dict):\n", |
| 671 | + " \"\"\"Replace values in a column safely.\"\"\"\n", |
| 672 | + " try:\n", |
| 673 | + " if col in self.df.columns:\n", |
| 674 | + " self.df[col] = self.df[col].replace(replacements)\n", |
| 675 | + " except Exception as e:\n", |
| 676 | + " print(f\"[Warning] Failed replacing values in {col}: {e}\")\n", |
| 677 | + " return self\n", |
| 678 | + "\n", |
| 679 | + " def safe_convert_dtype(self, col, dtype, errors=\"ignore\"):\n", |
| 680 | + " \"\"\"Convert column dtype safely.\"\"\"\n", |
| 681 | + " try:\n", |
| 682 | + " if col in self.df.columns:\n", |
| 683 | + " if \"datetime\" in str(dtype):\n", |
| 684 | + " self.df[col] = pd.to_datetime(\n", |
| 685 | + " self.df[col], errors=\"coerce\")\n", |
| 686 | + " else:\n", |
| 687 | + " self.df[col] = self.df[col].astype(dtype, errors=errors)\n", |
| 688 | + " except Exception as e:\n", |
| 689 | + " print(f\"[Warning] Failed dtype conversion on {col}: {e}\")\n", |
| 690 | + " return self\n", |
| 691 | + "\n", |
| 692 | + " def normalize_gender(self):\n", |
| 693 | + " \"\"\"Unify transgender categories safely.\"\"\"\n", |
| 694 | + " try:\n", |
| 695 | + " if \"Gender\" in self.df.columns:\n", |
| 696 | + " self.df[\"Gender\"] = self.df[\"Gender\"].replace({\n", |
| 697 | + " \"Transgender male to female\": \"Transgender\",\n", |
| 698 | + " \"Transgender female to male\": \"Transgender\"\n", |
| 699 | + " })\n", |
| 700 | + " except Exception as e:\n", |
| 701 | + " print(f\"[Warning] Failed gender normalization: {e}\")\n", |
| 702 | + " return self\n", |
| 703 | + "\n", |
| 704 | + " def split_race(self):\n", |
| 705 | + " \"\"\"Split Race column into Race_1, Race_2, etc., if it exists.\"\"\"\n", |
| 706 | + " try:\n", |
| 707 | + " if \"Race\" in self.df.columns:\n", |
| 708 | + " splitting = self.df[\"Race\"].astype(\n", |
| 709 | + " str).str.split(\";\", expand=True)\n", |
| 710 | + " splitting.columns = [\n", |
| 711 | + " f\"Race_{i+1}\" for i in range(splitting.shape[1])]\n", |
| 712 | + " self.df = pd.concat(\n", |
| 713 | + " [self.df.drop(columns=[\"Race\"]), splitting], axis=1)\n", |
| 714 | + " except Exception as e:\n", |
| 715 | + " print(f\"[Warning] Failed race splitting: {e}\")\n", |
| 716 | + " return self\n", |
| 717 | + "\n", |
| 718 | + " def clean_salary(self):\n", |
| 719 | + " \"\"\"Fix salary inconsistencies.\"\"\"\n", |
| 720 | + " try:\n", |
| 721 | + " if \"Salary\" in self.df.columns:\n", |
| 722 | + " self.df[\"Salary\"] = pd.to_numeric(\n", |
| 723 | + " self.df[\"Salary\"], errors=\"coerce\")\n", |
| 724 | + " self.df[\"Salary\"] = self.df[\"Salary\"].replace(60000, 28.84)\n", |
| 725 | + " except Exception as e:\n", |
| 726 | + " print(f\"[Warning] Failed salary cleaning: {e}\")\n", |
| 727 | + " return self\n", |
| 728 | + "\n", |
| 729 | + " def finalize(self):\n", |
| 730 | + " \"\"\"Return cleaned dataframe.\"\"\"\n", |
| 731 | + " return self.df" |
610 | 732 | ] |
611 | 733 | }, |
612 | 734 | { |
|
702 | 824 | }, |
703 | 825 | { |
704 | 826 | "cell_type": "code", |
705 | | - "execution_count": null, |
| 827 | + "execution_count": 1, |
706 | 828 | "id": "d4fc7116", |
707 | 829 | "metadata": {}, |
708 | 830 | "outputs": [ |
|
714 | 836 | "--- ✅ Copy the Markdown below and paste it into a new markdown cell ---\n", |
715 | 837 | "\n", |
716 | 838 | "### **Table of Contents**\n", |
717 | | - " * [read in data](#read-in-data)\n", |
| 839 | + " * [**Table of Contents**](#table-of-contents)\n", |
| 840 | + " * [Function To Read in the Data!](#function-to-read-in-the-data)\n", |
| 841 | + " * [Example usage](#example-usage)\n", |
| 842 | + " * [To Access a DataFrame in the list](#to-access-a-dataframe-in-the-list)\n", |
| 843 | + " * [To Remove Spaces in DataFrame name](#to-remove-spaces-in-dataframe-name)\n", |
718 | 844 | " * [Update cleaning code](#update-cleaning-code)\n", |
719 | 845 | " * [Generate report](#generate-report)\n", |
720 | 846 | " * [Plots](#plots)\n", |
|
754 | 880 | " print(toc_markdown)\n", |
755 | 881 | "\n", |
756 | 882 | "\n", |
757 | | - "notebook_path = 'ideal.ipynb'\n", |
| 883 | + "notebook_path = 'mainNb.ipynb'\n", |
758 | 884 | "generate_toc_from_notebook(notebook_path)\n" |
759 | 885 | ] |
760 | 886 | } |
761 | 887 | ], |
762 | 888 | "metadata": { |
763 | 889 | "kernelspec": { |
764 | | - "display_name": "venv (3.12.2)", |
| 890 | + "display_name": ".venv", |
765 | 891 | "language": "python", |
766 | 892 | "name": "python3" |
767 | 893 | }, |
|
775 | 901 | "name": "python", |
776 | 902 | "nbconvert_exporter": "python", |
777 | 903 | "pygments_lexer": "ipython3", |
778 | | - "version": "3.12.2" |
| 904 | + "version": "3.13.1" |
779 | 905 | } |
780 | 906 | }, |
781 | 907 | "nbformat": 4, |
|
0 commit comments