From 08590b5ac4d812f3158a2ab0becb6eba22669e6d Mon Sep 17 00:00:00 2001 From: Nhan Date: Thu, 5 Oct 2023 21:40:52 +0200 Subject: [PATCH 1/2] change to Hello Nhan --- your-code/main.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/your-code/main.ipynb b/your-code/main.ipynb index 472c4a1..57db855 100644 --- a/your-code/main.ipynb +++ b/your-code/main.ipynb @@ -24,7 +24,7 @@ }, "outputs": [], "source": [ - "print(\"Hellow world\")" + "print(\"Hello Nhan\")" ] } ], From 49ac420dae557ad5ca81b181e17508d493c1c793 Mon Sep 17 00:00:00 2001 From: nhannguyen-th <142213298+nhannguyen-th@users.noreply.github.com> Date: Sun, 15 Oct 2023 22:40:44 +0200 Subject: [PATCH 2/2] lesson --- Day_2_data_cleaning.ipynb | 2819 +++++++++++++++++++++++++++++++++++++ 1 file changed, 2819 insertions(+) create mode 100644 Day_2_data_cleaning.ipynb diff --git a/Day_2_data_cleaning.ipynb b/Day_2_data_cleaning.ipynb new file mode 100644 index 0000000..1bc3e63 --- /dev/null +++ b/Day_2_data_cleaning.ipynb @@ -0,0 +1,2819 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "include_colab_link": true + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.9" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "n1O6xmlqQOSF", + "outputId": "4010e740-4570-434f-ad56-930251a7d052" + }, + "source": [ + "print('hello world')" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "hello 
world\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "T7gksDVOqvhU" + }, + "source": [ + "# importing libraries\n", + "import pandas as pd\n", + "import numpy as np" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CM10kdEyV1aK" + }, + "source": [ + "# Advanced data loading and exporting" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 530 + }, + "id": "vpJjn-ziV1aL", + "outputId": "24b5537c-1f4a-48aa-d5f2-33daf6ee3ab0" + }, + "source": [ + "#We have seen the most common data load techniques in pandas\n", + "#the stragglers are loading data separated by not-as-common separators\n", + "\n", + "# Import tab-delimited file\n", + "data = pd.read_csv('vehicles_tab.txt', sep='\\t')\n", + "\n", + "# Import pipe-delimited file\n", + "data = pd.read_csv('vehicles_pipe.txt', sep='|')\n", + "data.head()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Make Model Year Engine Displacement \\\n", + "0 AM General DJ Po Vehicle 2WD 1984 2.5 \n", + "1 AM General FJ8c Post Office 1984 4.2 \n", + "2 AM General Post Office DJ5 2WD 1985 2.5 \n", + "3 AM General Post Office DJ8 2WD 1985 4.2 \n", + "4 ASC Incorporated GNX 1987 3.8 \n", + "\n", + " Cylinders Transmission Drivetrain Vehicle Class \\\n", + "0 4.0 Automatic 3-spd 2-Wheel Drive Special Purpose Vehicle 2WD \n", + "1 6.0 Automatic 3-spd 2-Wheel Drive Special Purpose Vehicle 2WD \n", + "2 4.0 Automatic 3-spd Rear-Wheel Drive Special Purpose Vehicle 2WD \n", + "3 6.0 Automatic 3-spd Rear-Wheel Drive Special Purpose Vehicle 2WD \n", + "4 6.0 Automatic 4-spd Rear-Wheel Drive Midsize Cars \n", + "\n", + " Fuel Type Fuel Barrels/Year City MPG Highway MPG Combined MPG \\\n", + "0 Regular 19.388824 18 17 17 \n", + "1 Regular 25.354615 13 13 13 \n", + "2 Regular 20.600625 16 17 16 \n", + "3 Regular 25.354615 13 13 
13 \n", + "4 Premium 20.600625 14 21 16 \n", + "\n", + " CO2 Emission Grams/Mile Fuel Cost/Year \n", + "0 522.764706 1950 \n", + "1 683.615385 2550 \n", + "2 555.437500 2100 \n", + "3 683.615385 2550 \n", + "4 555.437500 2550 " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MakeModelYearEngine DisplacementCylindersTransmissionDrivetrainVehicle ClassFuel TypeFuel Barrels/YearCity MPGHighway MPGCombined MPGCO2 Emission Grams/MileFuel Cost/Year
0AM GeneralDJ Po Vehicle 2WD19842.54.0Automatic 3-spd2-Wheel DriveSpecial Purpose Vehicle 2WDRegular19.388824181717522.7647061950
1AM GeneralFJ8c Post Office19844.26.0Automatic 3-spd2-Wheel DriveSpecial Purpose Vehicle 2WDRegular25.354615131313683.6153852550
2AM GeneralPost Office DJ5 2WD19852.54.0Automatic 3-spdRear-Wheel DriveSpecial Purpose Vehicle 2WDRegular20.600625161716555.4375002100
3AM GeneralPost Office DJ8 2WD19854.26.0Automatic 3-spdRear-Wheel DriveSpecial Purpose Vehicle 2WDRegular25.354615131313683.6153852550
4ASC IncorporatedGNX19873.86.0Automatic 4-spdRear-Wheel DriveMidsize CarsPremium20.600625142116555.4375002550
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 3 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 530 + }, + "id": "qD88ohxCzvjH", + "outputId": "0e55892a-dce9-4b49-fb85-ad5172c9b3e9" + }, + "source": [ + "#JSON is a format used for transmitting structured data between web applications\n", + "#Not usually used to store data, but sometimes co-opted for that\n", + "data = pd.read_json('vehicles.json')\n", + "data.head()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Make Model Year Engine Displacement \\\n", + "0 AM General DJ Po Vehicle 2WD 1984 2.5 \n", + "1 AM General FJ8c Post Office 1984 4.2 \n", + "2 AM General Post Office DJ5 2WD 1985 2.5 \n", + "3 AM General Post Office DJ8 2WD 1985 4.2 \n", + "4 ASC Incorporated GNX 1987 3.8 \n", + "\n", + " Cylinders Transmission Drivetrain Vehicle Class \\\n", + "0 4 Automatic 3-spd 2-Wheel Drive Special Purpose Vehicle 2WD \n", + "1 6 Automatic 3-spd 2-Wheel Drive Special Purpose Vehicle 2WD \n", + "2 4 Automatic 3-spd Rear-Wheel Drive Special Purpose Vehicle 2WD \n", + "3 6 Automatic 3-spd Rear-Wheel Drive Special Purpose Vehicle 2WD \n", + "4 6 Automatic 4-spd Rear-Wheel Drive Midsize Cars \n", + "\n", + " Fuel Type Fuel Barrels/Year City MPG Highway MPG Combined MPG \\\n", + "0 Regular 19.388824 18 17 17 \n", + "1 Regular 25.354615 13 13 13 \n", + "2 Regular 20.600625 16 17 16 \n", + "3 Regular 25.354615 13 13 13 \n", + "4 Premium 20.600625 14 21 16 \n", + "\n", + " CO2 Emission Grams/Mile Fuel Cost/Year \n", + "0 522.764706 1950 \n", + "1 683.615385 2550 \n", + "2 555.437500 2100 \n", + "3 683.615385 2550 \n", + "4 555.437500 2550 " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MakeModelYearEngine DisplacementCylindersTransmissionDrivetrainVehicle ClassFuel TypeFuel Barrels/YearCity MPGHighway MPGCombined MPGCO2 Emission Grams/MileFuel Cost/Year
0AM GeneralDJ Po Vehicle 2WD19842.54Automatic 3-spd2-Wheel DriveSpecial Purpose Vehicle 2WDRegular19.388824181717522.7647061950
1AM GeneralFJ8c Post Office19844.26Automatic 3-spd2-Wheel DriveSpecial Purpose Vehicle 2WDRegular25.354615131313683.6153852550
2AM GeneralPost Office DJ5 2WD19852.54Automatic 3-spdRear-Wheel DriveSpecial Purpose Vehicle 2WDRegular20.600625161716555.4375002100
3AM GeneralPost Office DJ8 2WD19854.26Automatic 3-spdRear-Wheel DriveSpecial Purpose Vehicle 2WDRegular25.354615131313683.6153852550
4ASC IncorporatedGNX19873.86Automatic 4-spdRear-Wheel DriveMidsize CarsPremium20.600625142116555.4375002550
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 4 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "bBR079zt4Xnm" + }, + "source": [ + "#pandas can write to different formats of JSON so that you are always able to communicate with the downstream service easily\n", + "\n", + "#each line as a list of dictionaries, column names as keys\n", + "data.to_json('test_records.json',orient='records')\n", + "#dictionary for each index, column names as keys in subdictionary\n", + "data.to_json('test_index.json',orient='index')\n", + "#as a list of lists\n", + "data.to_json('test_values.json',orient='values')\n", + "#as a list of lists, index columns and index lines separated at the head of document\n", + "data.to_json('test_split.json',orient='split')\n", + "#each column as a key with a dictionary for each line, key is index\n", + "data.to_json('test_columns.json',orient='columns')\n", + "#description of schema at head, list of lines as dictionaries following\n", + "data.to_json('test_table.json',orient='table')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9GA2ME7TV1ac" + }, + "source": [ + "# Data cleaning" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "V-_KMoGFQOSJ" + }, + "source": [ + "## First step - import the dataset and look at it!!" 
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "okqtY1QyQOSK", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "8d491075-8ef1-4417-aba1-87c09bac4577" + }, + "source": [ + "# load Dataframe\n", + "ds = pd.read_csv('vehicles_messy.csv')" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py:3326: DtypeWarning: Columns (70,71,72,73,74,76,79) have mixed types.Specify dtype option on import or set low_memory=False.\n", + " exec(code_obj, self.user_global_ns, self.user_ns)\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "U3rRjuwVQOSM", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "325fd52c-d43a-4068-ec56-26f5234bff51" + }, + "source": [ + "# check how many columns and entries (so you have an idea when comparing with missing values)\n", + "ds.shape" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(37843, 83)" + ] + }, + "metadata": {}, + "execution_count": 7 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "WMLxD3QYQOSO", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 394 + }, + "outputId": "166004a0-f429-4c28-c832-0cee81a6cce6" + }, + "source": [ + "# get a quick overview of the dataset as a whole, column by column\n", + "ds.describe()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " barrels08 barrelsA08 charge120 charge240 city08 \\\n", + "count 37843.000000 37843.000000 37843.0 37843.000000 37843.000000 \n", + "mean 17.532506 0.216169 0.0 0.023531 17.941389 \n", + "std 4.575950 1.141527 0.0 0.427647 6.660360 \n", + "min 0.060000 0.000000 0.0 0.000000 6.000000 \n", + "25% 14.330870 0.000000 0.0 0.000000 15.000000 \n", + "50% 17.347895 0.000000 0.0 0.000000 17.000000 \n", + "75% 
20.600625 0.000000 0.0 0.000000 20.000000 \n", + "max 47.087143 18.311667 0.0 12.000000 138.000000 \n", + "\n", + " city08U cityA08 cityA08U cityCD cityE \\\n", + "count 37843.000000 37843.000000 37843.000000 37843.000000 37843.000000 \n", + "mean 4.042737 0.520149 0.327163 0.000406 0.184790 \n", + "std 9.645820 3.837874 3.542596 0.039918 2.904558 \n", + "min 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "25% 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "50% 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "75% 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "max 138.304000 127.000000 127.093000 5.350000 122.000000 \n", + "\n", + " ... UCity UCityA UHighway UHighwayA \\\n", + "count ... 37843.000000 37843.000000 37843.000000 37843.000000 \n", + "mean ... 22.587229 0.652380 33.619221 0.933845 \n", + "std ... 9.350163 5.284547 10.048326 6.059456 \n", + "min ... 0.000000 0.000000 0.000000 0.000000 \n", + "25% ... 18.000000 0.000000 27.100000 0.000000 \n", + "50% ... 21.000000 0.000000 33.000000 0.000000 \n", + "75% ... 25.139300 0.000000 38.109600 0.000000 \n", + "max ... 
197.577100 181.560900 159.100000 152.187800 \n", + "\n", + " year youSaveSpend charge240b phevCity phevHwy \\\n", + "count 37843.000000 37843.000000 37843.000000 37843.000000 37843.000000 \n", + "mean 2000.064398 -2658.999022 0.004360 0.069313 0.068203 \n", + "std 10.390588 2553.098329 0.142776 1.966806 1.871986 \n", + "min 1984.000000 -22250.000000 0.000000 0.000000 0.000000 \n", + "25% 1990.000000 -4250.000000 0.000000 0.000000 0.000000 \n", + "50% 2001.000000 -2500.000000 0.000000 0.000000 0.000000 \n", + "75% 2009.000000 -750.000000 0.000000 0.000000 0.000000 \n", + "max 2017.000000 4000.000000 7.000000 97.000000 79.000000 \n", + "\n", + " phevComb \n", + "count 37843.000000 \n", + "mean 0.068573 \n", + "std 1.913647 \n", + "min 0.000000 \n", + "25% 0.000000 \n", + "50% 0.000000 \n", + "75% 0.000000 \n", + "max 88.000000 \n", + "\n", + "[8 rows x 59 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + "
barrels08barrelsA08charge120charge240city08city08UcityA08cityA08UcityCDcityE...UCityUCityAUHighwayUHighwayAyearyouSaveSpendcharge240bphevCityphevHwyphevComb
count37843.00000037843.00000037843.037843.00000037843.00000037843.00000037843.00000037843.00000037843.00000037843.000000...37843.00000037843.00000037843.00000037843.00000037843.00000037843.00000037843.00000037843.00000037843.00000037843.000000
mean17.5325060.2161690.00.02353117.9413894.0427370.5201490.3271630.0004060.184790...22.5872290.65238033.6192210.9338452000.064398-2658.9990220.0043600.0693130.0682030.068573
std4.5759501.1415270.00.4276476.6603609.6458203.8378743.5425960.0399182.904558...9.3501635.28454710.0483266.05945610.3905882553.0983290.1427761.9668061.8719861.913647
min0.0600000.0000000.00.0000006.0000000.0000000.0000000.0000000.0000000.000000...0.0000000.0000000.0000000.0000001984.000000-22250.0000000.0000000.0000000.0000000.000000
25%14.3308700.0000000.00.00000015.0000000.0000000.0000000.0000000.0000000.000000...18.0000000.00000027.1000000.0000001990.000000-4250.0000000.0000000.0000000.0000000.000000
50%17.3478950.0000000.00.00000017.0000000.0000000.0000000.0000000.0000000.000000...21.0000000.00000033.0000000.0000002001.000000-2500.0000000.0000000.0000000.0000000.000000
75%20.6006250.0000000.00.00000020.0000000.0000000.0000000.0000000.0000000.000000...25.1393000.00000038.1096000.0000002009.000000-750.0000000.0000000.0000000.0000000.000000
max47.08714318.3116670.012.000000138.000000138.304000127.000000127.0930005.350000122.000000...197.577100181.560900159.100000152.1878002017.0000004000.0000007.00000097.00000079.00000088.000000
\n", + "

8 rows × 59 columns

\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 8 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Is8GlixRV1ak", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 560 + }, + "outputId": "0d55e608-cea1-47d0-8fb4-537538fcf098" + }, + "source": [ + "# peek at the actual data! Data science is hands on!\n", + "ds.head()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " barrels08 barrelsA08 charge120 charge240 city08 city08U cityA08 \\\n", + "0 15.695714 0.0 0.0 0.0 19 0.0 0 \n", + "1 29.964545 0.0 0.0 0.0 9 0.0 0 \n", + "2 12.207778 0.0 0.0 0.0 23 0.0 0 \n", + "3 29.964545 0.0 0.0 0.0 10 0.0 0 \n", + "4 17.347895 0.0 0.0 0.0 17 0.0 0 \n", + "\n", + " cityA08U cityCD cityE ... mfrCode c240Dscr charge240b c240bDscr \\\n", + "0 0.0 0.0 0.0 ... NaN NaN 0.0 NaN \n", + "1 0.0 0.0 0.0 ... NaN NaN 0.0 NaN \n", + "2 0.0 0.0 0.0 ... NaN NaN 0.0 NaN \n", + "3 0.0 0.0 0.0 ... NaN NaN 0.0 NaN \n", + "4 0.0 0.0 0.0 ... NaN NaN 0.0 NaN \n", + "\n", + " createdOn modifiedOn startStop \\\n", + "0 Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN \n", + "1 Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN \n", + "2 Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN \n", + "3 Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN \n", + "4 Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN \n", + "\n", + " phevCity phevHwy phevComb \n", + "0 0 0 0 \n", + "1 0 0 0 \n", + "2 0 0 0 \n", + "3 0 0 0 \n", + "4 0 0 0 \n", + "\n", + "[5 rows x 83 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
barrels08barrelsA08charge120charge240city08city08UcityA08cityA08UcityCDcityE...mfrCodec240Dscrcharge240bc240bDscrcreatedOnmodifiedOnstartStopphevCityphevHwyphevComb
015.6957140.00.00.0190.000.00.00.0...NaNNaN0.0NaNTue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013NaN000
129.9645450.00.00.090.000.00.00.0...NaNNaN0.0NaNTue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013NaN000
212.2077780.00.00.0230.000.00.00.0...NaNNaN0.0NaNTue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013NaN000
329.9645450.00.00.0100.000.00.00.0...NaNNaN0.0NaNTue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013NaN000
417.3478950.00.00.0170.000.00.00.0...NaNNaN0.0NaNTue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013NaN000
\n", + "

5 rows × 83 columns

\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 9 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TYiwQjOeQOSR" + }, + "source": [ + "## Data Cleaning 1 - check missing or (abnormally high) zero values" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "jIBzx-hxQOSR", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "1b4c0cca-8a30-48d1-fabd-1a1942a92a75" + }, + "source": [ + "# Check for Missing Values. Is this easy to see? When to drop the lines and when to drop the column.\n", + "#isnull and isna are the same method, it's a stylistic preference\n", + "ds.isnull()\n", + "ds.isnull().sum()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "barrels08 0\n", + "barrelsA08 0\n", + "charge120 0\n", + "charge240 0\n", + "city08 0\n", + " ... \n", + "modifiedOn 0\n", + "startStop 31705\n", + "phevCity 0\n", + "phevHwy 0\n", + "phevComb 0\n", + "Length: 83, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 10 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "zCunTtrtQOST", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "b906e11a-5662-40ed-8f7c-1d663d76a6e2" + }, + "source": [ + "#difficult to read, filter out the columns with no nulls\n", + "#Get rid of all the columns with zero nulls\n", + "ds_null = ds.isnull().sum() ## gives you a series with the total number of null per column\n", + "ds_null\n", + "\n", + "ds.isnull().sum()[ds.isnull().sum() > 0]" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "cylinders 123\n", + "displ 120\n", + "drive 1189\n", + "eng_dscr 15403\n", + "trany 11\n", + "guzzler 35562\n", + "trans_dscr 22796\n", + "tCharger 32657\n", + "sCharger 37177\n", + "atvType 34771\n", + "fuelType2 36435\n", + "rangeA 36440\n", + "evMotor 37281\n", + "mfrCode 30818\n", + "c240Dscr 37806\n", + "c240bDscr 
37807\n", + "startStop 31705\n", + "dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 11 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "7aUh6H_6QOSV", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "78f86b7b-d747-4cb0-96d2-70bc0ae37cf2" + }, + "source": [ + "#Check for zero values\n", + "ds_zeros = ds[ds == 0.0].count()\n", + "ds_zeros\n", + "ds_zeros[ds_zeros > 0]" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "barrelsA08 36435\n", + "charge120 37843\n", + "charge240 37691\n", + "city08U 30544\n", + "cityA08 36435\n", + "cityA08U 37103\n", + "cityCD 37831\n", + "cityE 37668\n", + "cityUF 37788\n", + "co2 120\n", + "co2A 15\n", + "co2TailpipeAGpm 36490\n", + "co2TailpipeGpm 120\n", + "comb08U 30544\n", + "combA08 36435\n", + "combA08U 37103\n", + "combE 37668\n", + "combinedCD 37828\n", + "combinedUF 37788\n", + "displ 2\n", + "engId 12600\n", + "fuelCostA08 36471\n", + "ghgScoreA 9\n", + "highway08U 30544\n", + "highwayA08 36435\n", + "highwayA08U 37103\n", + "highwayCD 37833\n", + "highwayE 37668\n", + "highwayUF 37788\n", + "hlv 33228\n", + "hpv 33229\n", + "lv2 31593\n", + "lv4 24449\n", + "phevBlended 37807\n", + "pv2 31604\n", + "pv4 24449\n", + "range 37723\n", + "rangeCity 37750\n", + "rangeCityA 37788\n", + "rangeHwy 37750\n", + "rangeHwyA 37788\n", + "UCity 25\n", + "UCityA 36481\n", + "UHighway 25\n", + "UHighwayA 36481\n", + "youSaveSpend 1234\n", + "charge240b 37807\n", + "phevCity 37788\n", + "phevHwy 37788\n", + "phevComb 37788\n", + "dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 12 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "xmWOokkPQOSX", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 560 + }, + "outputId": "f595a1e4-15eb-41aa-ab3f-f3f4bb504d01" + }, + "source": [ + "#Make a list to drop\n", + "ds_null = ds.isnull().sum()\n", + "ds_null[ds_null > 10000]\n", + 
"list(ds_null[ds_null > 10000].index)\n", + "\n", + "ds_dropped = ds.drop(list(ds_null[ds_null > 10000].index), axis=1)\n", + "ds_dropped.head()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " barrels08 barrelsA08 charge120 charge240 city08 city08U cityA08 \\\n", + "0 15.695714 0.0 0.0 0.0 19 0.0 0 \n", + "1 29.964545 0.0 0.0 0.0 9 0.0 0 \n", + "2 12.207778 0.0 0.0 0.0 23 0.0 0 \n", + "3 29.964545 0.0 0.0 0.0 10 0.0 0 \n", + "4 17.347895 0.0 0.0 0.0 17 0.0 0 \n", + "\n", + " cityA08U cityCD cityE ... UHighwayA VClass year \\\n", + "0 0.0 0.0 0.0 ... 0.0 Two Seaters 1985 \n", + "1 0.0 0.0 0.0 ... 0.0 Two Seaters 1985 \n", + "2 0.0 0.0 0.0 ... 0.0 Subcompact Cars 1985 \n", + "3 0.0 0.0 0.0 ... 0.0 Vans 1985 \n", + "4 0.0 0.0 0.0 ... 0.0 Compact Cars 1993 \n", + "\n", + " youSaveSpend charge240b createdOn \\\n", + "0 -1250 0.0 Tue Jan 01 00:00:00 EST 2013 \n", + "1 -8500 0.0 Tue Jan 01 00:00:00 EST 2013 \n", + "2 500 0.0 Tue Jan 01 00:00:00 EST 2013 \n", + "3 -8500 0.0 Tue Jan 01 00:00:00 EST 2013 \n", + "4 -4000 0.0 Tue Jan 01 00:00:00 EST 2013 \n", + "\n", + " modifiedOn phevCity phevHwy phevComb \n", + "0 Tue Jan 01 00:00:00 EST 2013 0 0 0 \n", + "1 Tue Jan 01 00:00:00 EST 2013 0 0 0 \n", + "2 Tue Jan 01 00:00:00 EST 2013 0 0 0 \n", + "3 Tue Jan 01 00:00:00 EST 2013 0 0 0 \n", + "4 Tue Jan 01 00:00:00 EST 2013 0 0 0 \n", + "\n", + "[5 rows x 70 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
barrels08barrelsA08charge120charge240city08city08UcityA08cityA08UcityCDcityE...UHighwayAVClassyearyouSaveSpendcharge240bcreatedOnmodifiedOnphevCityphevHwyphevComb
015.6957140.00.00.0190.000.00.00.0...0.0Two Seaters1985-12500.0Tue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013000
129.9645450.00.00.090.000.00.00.0...0.0Two Seaters1985-85000.0Tue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013000
212.2077780.00.00.0230.000.00.00.0...0.0Subcompact Cars19855000.0Tue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013000
329.9645450.00.00.0100.000.00.00.0...0.0Vans1985-85000.0Tue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013000
417.3478950.00.00.0170.000.00.00.0...0.0Compact Cars1993-40000.0Tue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013000
\n", + "

5 rows × 70 columns

\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 13 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Mi2siOOXDNE5" + }, + "source": [], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "LX5CglHvQOSY", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "7159da3b-e261-46d0-8e8e-e8f0f467afd1" + }, + "source": [ + "## exercise: do the same for negative values\n", + "#.numeric()\n", + "# select_dtypes()\n", + "numeric = ds.select_dtypes(exclude=object)\n", + "\n", + "ds_negatives = numeric[numeric < 0.0].count()\n", + "ds_negatives\n", + "ds_negatives[ds_negatives > 0]" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "co2 31989\n", + "co2A 37318\n", + "feScore 32028\n", + "ghgScore 32028\n", + "ghgScoreA 37325\n", + "youSaveSpend 32526\n", + "dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 39 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "9-zwNH4RQOSa", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "outputId": "08712edc-7b22-41b3-8b36-66c0f8cbef7e" + }, + "source": [ + "# Cylinders and displ have similar number of Null values. Maybe they have the same null entries?\n", + "# Let's check if it makes sense to drop them or keep them\n", + "null_displ = ds[ds['displ'].isnull()]\n", + "\n", + "null_displ = null_displ[['year', 'make', 'model', 'trany', 'drive','fuelType','cylinders', 'displ']]\n", + "null_displ\n", + "#we seem to have found a strong correlation" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " year make model trany \\\n", + "7138 2000 Nissan Altra EV NaN \n", + "7139 2000 Toyota RAV4 EV NaN \n", + "8143 2001 Toyota RAV4 EV NaN \n", + "8144 2001 Ford Th!nk NaN \n", + "8146 2001 Ford Explorer USPS Electric NaN \n", + "... ... ... ... ... 
\n", + "30969 2017 Kia Soul Electric Automatic (A1) \n", + "30972 2016 Tesla Model S (60 kW-hr battery pack) Automatic (A1) \n", + "30973 2016 Tesla Model S AWD - 60D Automatic (A1) \n", + "30974 2016 Tesla Model S AWD - P100D Automatic (A1) \n", + "30975 2016 Tesla Model X AWD - 60D Automatic (A1) \n", + "\n", + " drive fuelType cylinders displ \n", + "7138 NaN Electricity NaN NaN \n", + "7139 2-Wheel Drive Electricity NaN NaN \n", + "8143 2-Wheel Drive Electricity NaN NaN \n", + "8144 NaN Electricity NaN NaN \n", + "8146 2-Wheel Drive Electricity NaN NaN \n", + "... ... ... ... ... \n", + "30969 Front-Wheel Drive Electricity NaN NaN \n", + "30972 Rear-Wheel Drive Electricity NaN NaN \n", + "30973 All-Wheel Drive Electricity NaN NaN \n", + "30974 All-Wheel Drive Electricity NaN NaN \n", + "30975 All-Wheel Drive Electricity NaN NaN \n", + "\n", + "[120 rows x 8 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
yearmakemodeltranydrivefuelTypecylindersdispl
71382000NissanAltra EVNaNNaNElectricityNaNNaN
71392000ToyotaRAV4 EVNaN2-Wheel DriveElectricityNaNNaN
81432001ToyotaRAV4 EVNaN2-Wheel DriveElectricityNaNNaN
81442001FordTh!nkNaNNaNElectricityNaNNaN
81462001FordExplorer USPS ElectricNaN2-Wheel DriveElectricityNaNNaN
...........................
309692017KiaSoul ElectricAutomatic (A1)Front-Wheel DriveElectricityNaNNaN
309722016TeslaModel S (60 kW-hr battery pack)Automatic (A1)Rear-Wheel DriveElectricityNaNNaN
309732016TeslaModel S AWD - 60DAutomatic (A1)All-Wheel DriveElectricityNaNNaN
309742016TeslaModel S AWD - P100DAutomatic (A1)All-Wheel DriveElectricityNaNNaN
309752016TeslaModel X AWD - 60DAutomatic (A1)All-Wheel DriveElectricityNaNNaN
\n", + "

120 rows × 8 columns

\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 15 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "KRi2usDfQOSc", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "outputId": "af4f2a0a-36e1-4580-c9d3-01a7b3d2583f" + }, + "source": [ + "# In case we decide they are important even with little data,\n", + "# we might want to fill the missing values with a value we choose. In this case we fill with zero.\n", + "#fillna has multiple options and similar functions you may want to check out: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.fillna.html\n", + "ds[['displ', 'cylinders']] = ds[['displ', 'cylinders']].fillna(0)\n", + "ds[['year', 'make', 'model', 'trany', 'drive','fuelType','cylinders', 'displ']][29670:30000]" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " year make model \\\n", + "29670 2016 Volvo XC90 AWD \n", + "29671 2016 Chevrolet Spark EV \n", + "29672 2016 Mercedes-Benz B250e \n", + "29673 2016 Hyundai Sonata Plug-in Hybrid \n", + "29674 2016 Jaguar XF AWD \n", + "... ... ... ... \n", + "29995 1987 GMC R15 Pickup 2WD \n", + "29996 2017 Mitsubishi Mirage \n", + "29997 2017 Mitsubishi Mirage \n", + "29998 2017 Chrysler Pacifica \n", + "29999 2017 Mitsubishi i-MiEV \n", + "\n", + " trany drive \\\n", + "29670 Automatic (S8) All-Wheel Drive \n", + "29671 Automatic (A1) Front-Wheel Drive \n", + "29672 Automatic (A1) Front-Wheel Drive \n", + "29673 Auto(AM6) Front-Wheel Drive \n", + "29674 Automatic (S8) All-Wheel Drive \n", + "... ... ... 
\n", + "29995 Automatic 3-spd Rear-Wheel Drive \n", + "29996 Manual 5-spd Front-Wheel Drive \n", + "29997 Automatic (variable gear ratios) Front-Wheel Drive \n", + "29998 Automatic 9-spd Front-Wheel Drive \n", + "29999 Automatic (A1) Rear-Wheel Drive \n", + "\n", + " fuelType cylinders displ \n", + "29670 Premium 4.0 2.0 \n", + "29671 Electricity 0.0 0.0 \n", + "29672 Electricity 0.0 0.0 \n", + "29673 Regular Gas and Electricity 4.0 2.0 \n", + "29674 Premium 6.0 3.0 \n", + "... ... ... ... \n", + "29995 Regular 6.0 4.3 \n", + "29996 Regular 3.0 1.2 \n", + "29997 Regular 3.0 1.2 \n", + "29998 Regular 6.0 3.6 \n", + "29999 Electricity 0.0 0.0 \n", + "\n", + "[330 rows x 8 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
yearmakemodeltranydrivefuelTypecylindersdispl
296702016VolvoXC90 AWDAutomatic (S8)All-Wheel DrivePremium4.02.0
296712016ChevroletSpark EVAutomatic (A1)Front-Wheel DriveElectricity0.00.0
296722016Mercedes-BenzB250eAutomatic (A1)Front-Wheel DriveElectricity0.00.0
296732016HyundaiSonata Plug-in HybridAuto(AM6)Front-Wheel DriveRegular Gas and Electricity4.02.0
296742016JaguarXF AWDAutomatic (S8)All-Wheel DrivePremium6.03.0
...........................
299951987GMCR15 Pickup 2WDAutomatic 3-spdRear-Wheel DriveRegular6.04.3
299962017MitsubishiMirageManual 5-spdFront-Wheel DriveRegular3.01.2
299972017MitsubishiMirageAutomatic (variable gear ratios)Front-Wheel DriveRegular3.01.2
299982017ChryslerPacificaAutomatic 9-spdFront-Wheel DriveRegular6.03.6
299992017Mitsubishii-MiEVAutomatic (A1)Rear-Wheel DriveElectricity0.00.0
\n", + "

330 rows × 8 columns

\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 16 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "OVJi80iiV1a3" + }, + "source": [ + "# filling NaN values with 0 is often not correct. fillna needs to be done cautiously.\n", + "# Popular choices are to fill NaN values with the average or median of the remaining values of the same column\n", + "# Sometimes we can even create predictive models to fill in the missing values from other values of the same row\n", + "\n", + "#These methods of filling NaNs are called *imputation* and are a very popular interview question" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oB4cmM0cQOSh" + }, + "source": [ + "## Data Cleaning 2 - Data Type Correction\n", + "\n", + "Sometimes you might want to change the type of a value" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "KL5iq0l8QOSi", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "64059650-67c7-4a79-fe96-f943011d6a4f" + }, + "source": [ + "ds['cylinders'].dtypes" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "dtype('float64')" + ] + }, + "metadata": {}, + "execution_count": 17 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Y2nILi1oQOSj", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "c73ca630-629f-415d-8a24-c1aafd1d6855" + }, + "source": [ + "ds['cylinders'] = ds['cylinders'].astype('int64')\n", + "ds['cylinders'].dtypes" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "dtype('int64')" + ] + }, + "metadata": {}, + "execution_count": 18 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WEwRooyuQOSl" + }, + "source": [ + "## Data Cleaning 3 - Cleaning Text and Removing Special Symbols" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": 
"2SR_VaksQOSm", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "41e7cdcc-6abf-4160-f6e8-9a44d630702b" + }, + "source": [ + "print(set(ds['trany']))" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "{nan, 'Manual 5-spd', 'Automatic 6-spd', 'Auto(AM-S6)', 'Automatic (S8)', 'Manual 4-spd', 'Auto(AM6)', 'Manual 6-spd', 'Automatic 3-spd', 'Auto(AM-S9)', 'Auto(L3)', 'Auto(AV-S6)', 'Automatic (S4)', 'Automatic (S6)', 'Automatic 7-spd', 'Auto(AM-S8)', 'Manual 3-spd', 'Auto(AM5)', 'Auto(AV-S8)', 'Auto(AM8)', 'Automatic (A1)', 'Manual 5 spd', 'Automatic (S5)', 'Automatic (AM5)', 'Automatic (A6)', 'Auto(AM-S7)', 'Automatic 9-spd', 'Automatic (AV)', 'Automatic 6spd', 'Automatic 8-spd', 'Auto (AV-S8)', 'Auto (AV)', 'Automatic (S9)', 'Automatic (variable gear ratios)', 'Automatic (AV-S6)', 'Auto (AV-S6)', 'Automatic (S7)', 'Auto(L4)', 'Manual(M7)', 'Automatic 5-spd', 'Automatic (AM6)', 'Auto(A1)', 'Manual 7-spd', 'Manual 4-spd Doubled', 'Auto(AV-S7)', 'Automatic 4-spd', 'Auto(AM7)'}\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "WWEHov9wQOSo", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "2ec30e71-4292-48bd-fc08-722b820e1216" + }, + "source": [ + "ds['trany'] = ds['trany'].str.replace('-', '')\n", + "print(set(ds['trany']))" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "{nan, 'Automatic 9spd', 'Automatic 3spd', 'Automatic (S8)', 'Manual 7spd', 'Automatic 4spd', 'Auto(AM6)', 'Auto(AVS8)', 'Manual 6spd', 'Auto(AMS7)', 'Manual 5spd', 'Automatic 5spd', 'Auto(L3)', 'Automatic (S4)', 'Automatic (S6)', 'Auto(AMS9)', 'Auto(AVS6)', 'Automatic (AVS6)', 'Auto(AM5)', 'Manual 4spd Doubled', 'Auto(AM8)', 'Automatic (A1)', 'Auto(AVS7)', 'Manual 5 spd', 'Automatic (S5)', 'Auto (AVS6)', 'Auto (AVS8)', 'Automatic (AM5)', 'Automatic (A6)', 'Automatic 6spd', 'Automatic (AV)', 'Auto (AV)', 
'Automatic (S9)', 'Manual 3spd', 'Automatic (variable gear ratios)', 'Auto(AMS6)', 'Automatic (S7)', 'Auto(L4)', 'Manual(M7)', 'Automatic 8spd', 'Automatic (AM6)', 'Auto(AMS8)', 'Auto(A1)', 'Automatic 7spd', 'Manual 4spd', 'Auto(AM7)'}\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "pp-nOHntV1bC", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "1e08d680-778e-407b-ab4d-66611e2a7c49" + }, + "source": [ + "#the same technique can be used to normalize the data\n", + "ds['trany'] = ds['trany'].str.replace('Automatic', 'Auto')\n", + "print(set(ds['trany']))\n", + "\n", + "ds['trany'] = ds['trany'].str.replace(\"Auto\\(\", \"Auto (\")\n", + "print(set(ds['trany']))\n", + "\n", + "#regex is often your friend here; the pattern above is a regex, so pass regex=True explicitly on newer pandas" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "{nan, 'Auto 8spd', 'Auto 5spd', 'Auto (A6)', 'Auto (variable gear ratios)', 'Auto (S6)', 'Manual 7spd', 'Auto(AM6)', 'Auto (AM6)', 'Auto(AVS8)', 'Manual 6spd', 'Auto(AMS7)', 'Manual 5spd', 'Auto(L3)', 'Auto (S4)', 'Auto (AM5)', 'Auto(AMS9)', 'Auto(AVS6)', 'Auto(AM5)', 'Manual 4spd Doubled', 'Auto 7spd', 'Auto(AM8)', 'Auto(AVS7)', 'Auto 4spd', 'Auto (A1)', 'Manual 5 spd', 'Auto 9spd', 'Auto (AVS6)', 'Auto 3spd', 'Auto (AVS8)', 'Auto (AV)', 'Manual 3spd', 'Auto (S8)', 'Auto (S5)', 'Auto(AMS6)', 'Manual(M7)', 'Auto(L4)', 'Auto (S7)', 'Auto(AMS8)', 'Auto(A1)', 'Auto 6spd', 'Auto (S9)', 'Manual 4spd', 'Auto(AM7)'}\n", + "{nan, 'Auto 8spd', 'Auto 5spd', 'Auto (A6)', 'Auto (variable gear ratios)', 'Auto (S6)', 'Manual 7spd', 'Auto (AM7)', 'Auto (AM6)', 'Manual 6spd', 'Manual 5spd', 'Auto (S4)', 'Auto (AM5)', 'Auto (L3)', 'Manual 4spd Doubled', 'Auto (AM8)', 'Auto (AVS7)', 'Auto 7spd', 'Auto 4spd', 'Auto (A1)', 'Manual 5 spd', 'Auto 9spd', 'Auto (AVS6)', 'Auto 3spd', 'Auto (AVS8)', 'Auto (AMS9)', 'Auto (AMS6)', 'Auto (AV)', 'Manual 3spd', 'Auto (S8)', 'Auto (AMS8)', 'Auto (S5)', 'Manual(M7)', 'Auto (L4)', 'Auto 
(S7)', 'Auto 6spd', 'Auto (AMS7)', 'Auto (S9)', 'Manual 4spd'}\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:5: FutureWarning: The default value of regex will change from True to False in a future version.\n", + " \"\"\"\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iALK32zAQOSq" + }, + "source": [ + "## Data Cleaning 4 - Removing Duplicates" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "UK305SgKFOkD" + }, + "source": [ + "# concatenating (put two dataframes together, like merge or join)\n", + "frames = [ds, ds]\n", + "\n", + "ds = pd.concat(frames)\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "-AIOPGuFQOSr", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 104 + }, + "outputId": "3a387020-134d-4972-aa07-28b80a98d02f" + }, + "source": [ + "# drop_duplicates removes repeated rows, keeping the first occurrence of each\n", + "before = len(ds)\n", + "display(before)\n", + "ds = ds.drop_duplicates()\n", + "ds\n", + "after = len(ds)\n", + "display(after)\n", + "print('Number of duplicate records dropped: ', str(before - after))\n", + "\n", + "# you can also remove lines that have duplicates only on some columns\n", + "before = len(ds)\n", + "droppedsubset = ds.drop_duplicates(subset=['make', 'model', 'year', 'displ', 'cylinders',\n", + " 'trany', 'drive', 'VClass','fuelType','barrels08',\n", + " 'city08', 'highway08', 'comb08', 'co2TailpipeGpm', 'fuelCost08'])\n", + "after = len(droppedsubset)\n", + "print('Number of duplicate records dropped, considering only some columns: ', str(before - after))\n", + "\n", + "# by default you retain the first copy of each duplicated row; keep=False drops every copy instead\n", + "before = len(ds)\n", + "losefirst = ds.drop_duplicates(subset=['make', 'model', 'year', 'displ', 'cylinders',\n", + " 'trany', 'drive', 'VClass','fuelType','barrels08',\n", + " 'city08', 
'highway08', 'comb08', 'co2TailpipeGpm', 'fuelCost08'],keep=False)\n", + "after = len(losefirst)\n", + "print('Number of duplicate records dropped, if not keeping first instance in dataset: ', str(before - after))" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "75686" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "37843" + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Number of duplicate records dropped: 37843\n", + "Number of duplicate records dropped, considering only some columns: 885\n", + "Number of duplicate records dropped, if not keeping first instance in dataset: 1739\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Qjrxxa1fQOSw" + }, + "source": [], + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file