From 08450138564349cd4ab319cb5374f0dc781c4b15 Mon Sep 17 00:00:00 2001
From: ArisGoulas <ari.goulas@gmail.com>
Date: Mon, 29 May 2023 19:26:06 +0100
Subject: [PATCH] lab done

---
 your-code/main.ipynb | 855 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 754 insertions(+), 101 deletions(-)

diff --git a/your-code/main.ipynb b/your-code/main.ipynb
index 8a9fa9e..05ed388 100644
--- a/your-code/main.ipynb
+++ b/your-code/main.ipynb
@@ -12,11 +12,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Import your libraries:\n"
+    "# Import your libraries:\n",
+    "\n",
+    "import numpy as np\n",
+    "import pandas as pd"
    ]
   },
   {
@@ -37,11 +40,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Your code here:\n"
+    "# Your code here:\n",
+    "\n",
+    "from sklearn.datasets import load_diabetes\n",
+    "diabetes = load_diabetes()"
    ]
   },
   {
@@ -53,11 +59,24 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Your code here:\n"
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "dict_keys(['data', 'target', 'frame', 'DESCR', 'feature_names', 'data_filename', 'target_filename', 'data_module'])"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Your code here:\n",
+    "\n",
+    "diabetes.keys()"
    ]
   },
   {
@@ -73,13 +92,61 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "metadata": {
     "scrolled": false
    },
-   "outputs": [],
-   "source": [
-    "# Your code here:\n"
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      ".. _diabetes_dataset:\n",
+      "\n",
+      "Diabetes dataset\n",
+      "----------------\n",
+      "\n",
+      "Ten baseline variables, age, sex, body mass index, average blood\n",
+      "pressure, and six blood serum measurements were obtained for each of n =\n",
+      "442 diabetes patients, as well as the response of interest, a\n",
+      "quantitative measure of disease progression one year after baseline.\n",
+      "\n",
+      "**Data Set Characteristics:**\n",
+      "\n",
+      "  :Number of Instances: 442\n",
+      "\n",
+      "  :Number of Attributes: First 10 columns are numeric predictive values\n",
+      "\n",
+      "  :Target: Column 11 is a quantitative measure of disease progression one year after baseline\n",
+      "\n",
+      "  :Attribute Information:\n",
+      "      - age     age in years\n",
+      "      - sex\n",
+      "      - bmi     body mass index\n",
+      "      - bp      average blood pressure\n",
+      "      - s1      tc, total serum cholesterol\n",
+      "      - s2      ldl, low-density lipoproteins\n",
+      "      - s3      hdl, high-density lipoproteins\n",
+      "      - s4      tch, total cholesterol / HDL\n",
+      "      - s5      ltg, possibly log of serum triglycerides level\n",
+      "      - s6      glu, blood sugar level\n",
+      "\n",
+      "Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times the square root of `n_samples` (i.e. the sum of squares of each column totals 1).\n",
+      "\n",
+      "Source URL:\n",
+      "https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html\n",
+      "\n",
+      "For more information see:\n",
+      "Bradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) \"Least Angle Regression,\" Annals of Statistics (with discussion), 407-499.\n",
+      "(https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Your code here:\n",
+    "\n",
+    "print(diabetes[\"DESCR\"])"
    ]
   },
   {
@@ -97,11 +164,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Enter your answer here:\n"
+    "# Enter your answer here:\n",
+    "\n",
+    "# 10 attributes; age (in years), sex (gender), bmi (body mass index) and bp (average blood pressure) are self-explanatory\n",
+    "# s1-s6 are six blood serum measurements used as predictors of disease progression: the total serum cholesterol (s1: tc),\n",
+    "# low-density (s2: ldl) and high-density (s3: hdl) lipoproteins, the total cholesterol / HDL ratio (s4: tch)\n",
+    "# and the blood sugar level (s6: glu); s5 (ltg) is possibly the log of the serum triglycerides level\n",
+    "\n",
+    "# diabetes[\"data\"] holds the values of the 10 attributes for every record (442 x 10), while diabetes[\"target\"] is the value we want to predict (a quantitative\n",
+    "# measure of the disease progression)\n",
+    "\n",
+    "# 442 records"
    ]
   },
   {
@@ -115,11 +192,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Your code here:\n"
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(442, 10)\n",
+      "(442,)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Your code here:\n",
+    "\n",
+    "print(diabetes[\"data\"].shape)\n",
+    "print(diabetes[\"target\"].shape)"
    ]
   },
   {
@@ -156,11 +245,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Your code here:\n"
+    "# Your code here:\n",
+    "\n",
+    "from sklearn.linear_model import LinearRegression"
    ]
   },
   {
@@ -172,11 +263,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Your code here:\n"
+    "# Your code here:\n",
+    "\n",
+    "diabetes_model = LinearRegression()"
    ]
   },
   {
@@ -190,11 +283,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Your code here:\n"
+    "# Your code here:\n",
+    "\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "diabetes_data_train, diabetes_data_test, diabetes_target_train, diabetes_target_test = train_test_split(diabetes[\"data\"], \n",
+    "                                                                                                        diabetes[\"target\"], \n",
+    "                                                                                                        shuffle=False, \n",
+    "                                                                                                        test_size=20)"
    ]
   },
   {
@@ -206,11 +306,28 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Your code here:\n"
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "152.76429169049118\n",
+      "[ 3.06094248e-01 -2.37635570e+02  5.10538048e+02  3.27729878e+02\n",
+      " -8.14111926e+02  4.92799595e+02  1.02841240e+02  1.84603496e+02\n",
+      "  7.43509388e+02  7.60966464e+01]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Your code here:\n",
+    "\n",
+    "diabetes_model.fit(diabetes_data_train, diabetes_target_train)\n",
+    "\n",
+    "print(diabetes_model.intercept_)\n",
+    "\n",
+    "print(diabetes_model.coef_)"
    ]
   },
   {
@@ -231,11 +348,29 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Your code here:\n"
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([197.61898486, 155.44031962, 172.88875144, 111.53270645,\n",
+       "       164.79397301, 131.06765869, 259.12441219, 100.47873746,\n",
+       "       117.06005372, 124.30261597, 218.36868146,  61.19581944,\n",
+       "       132.24837933, 120.33293546,  52.54513009, 194.03746764,\n",
+       "       102.5756431 , 123.56778709, 211.03465323,  52.60221696])"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Your code here:\n",
+    "\n",
+    "y_pred = diabetes_model.predict(diabetes_data_test)\n",
+    "y_pred"
    ]
   },
   {
@@ -247,11 +382,175 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Your code here:\n"
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>target_values</th>\n",
+       "      <th>predictions</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>233.0</td>\n",
+       "      <td>197.618985</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>91.0</td>\n",
+       "      <td>155.440320</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>111.0</td>\n",
+       "      <td>172.888751</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>152.0</td>\n",
+       "      <td>111.532706</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>120.0</td>\n",
+       "      <td>164.793973</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>67.0</td>\n",
+       "      <td>131.067659</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>310.0</td>\n",
+       "      <td>259.124412</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>94.0</td>\n",
+       "      <td>100.478737</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>183.0</td>\n",
+       "      <td>117.060054</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>66.0</td>\n",
+       "      <td>124.302616</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>173.0</td>\n",
+       "      <td>218.368681</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>72.0</td>\n",
+       "      <td>61.195819</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>12</th>\n",
+       "      <td>49.0</td>\n",
+       "      <td>132.248379</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>13</th>\n",
+       "      <td>64.0</td>\n",
+       "      <td>120.332935</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14</th>\n",
+       "      <td>48.0</td>\n",
+       "      <td>52.545130</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>15</th>\n",
+       "      <td>178.0</td>\n",
+       "      <td>194.037468</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>16</th>\n",
+       "      <td>104.0</td>\n",
+       "      <td>102.575643</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>17</th>\n",
+       "      <td>132.0</td>\n",
+       "      <td>123.567787</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>18</th>\n",
+       "      <td>220.0</td>\n",
+       "      <td>211.034653</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>19</th>\n",
+       "      <td>57.0</td>\n",
+       "      <td>52.602217</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "    target_values  predictions\n",
+       "0           233.0   197.618985\n",
+       "1            91.0   155.440320\n",
+       "2           111.0   172.888751\n",
+       "3           152.0   111.532706\n",
+       "4           120.0   164.793973\n",
+       "5            67.0   131.067659\n",
+       "6           310.0   259.124412\n",
+       "7            94.0   100.478737\n",
+       "8           183.0   117.060054\n",
+       "9            66.0   124.302616\n",
+       "10          173.0   218.368681\n",
+       "11           72.0    61.195819\n",
+       "12           49.0   132.248379\n",
+       "13           64.0   120.332935\n",
+       "14           48.0    52.545130\n",
+       "15          178.0   194.037468\n",
+       "16          104.0   102.575643\n",
+       "17          132.0   123.567787\n",
+       "18          220.0   211.034653\n",
+       "19           57.0    52.602217"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Your code here:\n",
+    "\n",
+    "X = np.array(diabetes_target_test)\n",
+    "Y = np.array(y_pred)\n",
+    "\n",
+    "pd.DataFrame({\"target_values\": X, \"predictions\": Y})"
    ]
   },
   {
@@ -263,11 +562,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Your explanation here:\n"
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "accuracy test 0.585085427447195\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Your explanation here:\n",
+    "\n",
+    "print(\"accuracy test\", diabetes_model.score(diabetes_data_test, diabetes_target_test))\n",
+    "\n",
+    "# No, the predictions are not the same as the target values: the score printed above (R^2 for a linear regression) is low (about 0.59)"
    ]
   },
   {
@@ -302,7 +613,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -326,7 +637,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -351,11 +662,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Your code here:\n"
+    "# Your code here:\n",
+    "\n",
+    "auto = pd.read_csv(\"../auto-mpg.csv\")"
    ]
   },
   {
@@ -367,11 +680,125 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Your code here:\n"
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>mpg</th>\n",
+       "      <th>cylinders</th>\n",
+       "      <th>displacement</th>\n",
+       "      <th>horse_power</th>\n",
+       "      <th>weight</th>\n",
+       "      <th>acceleration</th>\n",
+       "      <th>model_year</th>\n",
+       "      <th>car_name</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>18.0</td>\n",
+       "      <td>8</td>\n",
+       "      <td>307.0</td>\n",
+       "      <td>130.0</td>\n",
+       "      <td>3504</td>\n",
+       "      <td>12.0</td>\n",
+       "      <td>70</td>\n",
+       "      <td>\\t\"chevrolet chevelle malibu\"</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>15.0</td>\n",
+       "      <td>8</td>\n",
+       "      <td>350.0</td>\n",
+       "      <td>165.0</td>\n",
+       "      <td>3693</td>\n",
+       "      <td>11.5</td>\n",
+       "      <td>70</td>\n",
+       "      <td>\\t\"buick skylark 320\"</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>18.0</td>\n",
+       "      <td>8</td>\n",
+       "      <td>318.0</td>\n",
+       "      <td>150.0</td>\n",
+       "      <td>3436</td>\n",
+       "      <td>11.0</td>\n",
+       "      <td>70</td>\n",
+       "      <td>\\t\"plymouth satellite\"</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>16.0</td>\n",
+       "      <td>8</td>\n",
+       "      <td>304.0</td>\n",
+       "      <td>150.0</td>\n",
+       "      <td>3433</td>\n",
+       "      <td>12.0</td>\n",
+       "      <td>70</td>\n",
+       "      <td>\\t\"amc rebel sst\"</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>17.0</td>\n",
+       "      <td>8</td>\n",
+       "      <td>302.0</td>\n",
+       "      <td>140.0</td>\n",
+       "      <td>3449</td>\n",
+       "      <td>10.5</td>\n",
+       "      <td>70</td>\n",
+       "      <td>\\t\"ford torino\"</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "    mpg  cylinders  displacement  horse_power  weight  acceleration  \\\n",
+       "0  18.0          8         307.0        130.0    3504          12.0   \n",
+       "1  15.0          8         350.0        165.0    3693          11.5   \n",
+       "2  18.0          8         318.0        150.0    3436          11.0   \n",
+       "3  16.0          8         304.0        150.0    3433          12.0   \n",
+       "4  17.0          8         302.0        140.0    3449          10.5   \n",
+       "\n",
+       "   model_year                       car_name  \n",
+       "0          70  \\t\"chevrolet chevelle malibu\"  \n",
+       "1          70          \\t\"buick skylark 320\"  \n",
+       "2          70         \\t\"plymouth satellite\"  \n",
+       "3          70              \\t\"amc rebel sst\"  \n",
+       "4          70                \\t\"ford torino\"  "
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Your code here:\n",
+    "\n",
+    "auto.head()"
    ]
   },
   {
@@ -383,11 +810,32 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Your code here:\n"
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "mpg             float64\n",
+       "cylinders         int64\n",
+       "displacement    float64\n",
+       "horse_power     float64\n",
+       "weight            int64\n",
+       "acceleration    float64\n",
+       "model_year        int64\n",
+       "car_name         object\n",
+       "dtype: object"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Your code here:\n",
+    "\n",
+    "auto.dtypes"
    ]
   },
   {
@@ -399,11 +847,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Your code here:\n"
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "70\n",
+      "82\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Your code here:\n",
+    "\n",
+    "print(auto.model_year.min())\n",
+    "print(auto.model_year.max())"
    ]
   },
   {
@@ -415,11 +875,41 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "mpg             0\n",
+       "cylinders       0\n",
+       "displacement    0\n",
+       "horse_power     6\n",
+       "weight          0\n",
+       "acceleration    0\n",
+       "model_year      0\n",
+       "car_name        0\n",
+       "dtype: int64"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Your code here:\n",
+    "\n",
+    "auto.isna().sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Your code here:\n"
+    "auto = auto.dropna()"
    ]
   },
   {
@@ -431,11 +921,38 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "4    199\n",
+       "8    103\n",
+       "6     83\n",
+       "3      4\n",
+       "5      3\n",
+       "Name: cylinders, dtype: int64"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Your code here:\n",
+    "\n",
+    "auto[\"cylinders\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Your code here:\n"
+    "# 5 values of cylinders (3, 4, 5, 6, 8)"
    ]
   },
   {
@@ -451,11 +968,32 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 24,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Your code here:\n"
+    "# Your code here:\n",
+    "\n",
+    "auto.drop(columns=[\"car_name\"], axis = 1, inplace = True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "features = auto.drop(columns=[\"mpg\"], axis=1)\n",
+    "labels = auto[\"mpg\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.20)"
    ]
   },
   {
@@ -469,11 +1007,28 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Your code here:\n"
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<style>#sk-container-id-1 {color: black;background-color: white;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 
div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>LinearRegression()</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">LinearRegression</label><div class=\"sk-toggleable__content\"><pre>LinearRegression()</pre></div></div></div></div></div>"
+      ],
+      "text/plain": [
+       "LinearRegression()"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Your code here:\n",
+    "\n",
+    "auto_model = LinearRegression()\n",
+    "auto_model.fit(X_train, y_train)"
    ]
   },
   {
@@ -493,11 +1048,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Your code here:\n"
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.8105881998994597"
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Your code here:\n",
+    "\n",
+    "from sklearn.metrics import r2_score\n",
+    "\n",
+    "y_pred = auto_model.predict(X_train)\n",
+    "r2_score(y_train, y_pred) "
    ]
   },
   {
@@ -513,11 +1084,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Your code here:\n"
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.7914518914629628"
+      ]
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Your code here:\n",
+    "\n",
+    "y_test_pred = auto_model.predict(X_test)\n",
+    "r2_score(y_test, y_test_pred)\n",
+    "\n",
+    "# Why did I get different results from below? (Possibly because train_test_split is called without random_state, so every split is random.)"
    ]
   },
   {
@@ -542,11 +1129,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 30,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Your code here:\n"
+    "# Your code here:\n",
+    "\n",
+    "X_train09, X_test09, y_train09, y_test09 = train_test_split(features, labels, test_size=0.10)"
    ]
   },
   {
@@ -558,11 +1147,29 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Your code here:\n"
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<style>#sk-container-id-2 {color: black;background-color: white;}#sk-container-id-2 pre{padding: 0;}#sk-container-id-2 div.sk-toggleable {background-color: white;}#sk-container-id-2 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-2 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-2 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-2 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-2 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-2 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-2 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-2 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-2 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-2 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-2 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-2 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-2 
div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-2 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-2 div.sk-item {position: relative;z-index: 1;}#sk-container-id-2 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-2 div.sk-item::before, #sk-container-id-2 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-2 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-2 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-2 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-2 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-2 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-2 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-2 div.sk-label-container {text-align: center;}#sk-container-id-2 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-2 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-2\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>LinearRegression()</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-2\" type=\"checkbox\" checked><label for=\"sk-estimator-id-2\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">LinearRegression</label><div class=\"sk-toggleable__content\"><pre>LinearRegression()</pre></div></div></div></div></div>"
+      ],
+      "text/plain": [
+       "LinearRegression()"
+      ]
+     },
+     "execution_count": 31,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Your code here:\n",
+    "\n",
+    "# Fit a second linear regression on the 90% training split:\n",
+    "auto_model09 = LinearRegression()\n",
+    "auto_model09.fit(X_train09, y_train09)"
    ]
   },
   {
@@ -574,11 +1181,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Your code here:\n"
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.8090539562314758"
+      ]
+     },
+     "execution_count": 32,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Your code here:\n",
+    "\n",
+    "y_train_pred09 = auto_model09.predict(X_train09)\n",
+    "r_squared_train09 = r2_score(y_train09,y_train_pred09)\n",
+    "\n",
+    "r_squared_train09"
    ]
   },
   {
@@ -588,13 +1211,36 @@
     "Compute the r squared score for the smaller test set. Is there an improvement in the test r squared?"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.8038984059621057"
+      ]
+     },
+     "execution_count": 33,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Your code here:\n",
+    "y_test_pred09 = auto_model09.predict(X_test09)\n",
+    "r_squared_test09 = r2_score(y_test09,y_test_pred09)\n",
+    "r_squared_test09"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Your code here:\n"
+    "# Not conclusively: re-running gives different results, because each run samples a different train/test split."
    ]
   },
   {
@@ -608,6 +1254,13 @@
     "In the next cell, we will import RFE"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -703,7 +1356,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -717,7 +1370,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.9"
+   "version": "3.10.10"
   }
  },
  "nbformat": 4,