From 08450138564349cd4ab319cb5374f0dc781c4b15 Mon Sep 17 00:00:00 2001 From: ArisGoulas Date: Mon, 29 May 2023 19:26:06 +0100 Subject: [PATCH] lab done --- your-code/main.ipynb | 855 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 754 insertions(+), 101 deletions(-) diff --git a/your-code/main.ipynb b/your-code/main.ipynb index 8a9fa9e..05ed388 100644 --- a/your-code/main.ipynb +++ b/your-code/main.ipynb @@ -12,11 +12,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ - "# Import your libraries:\n" + "# Import your libraries:\n", + "\n", + "import numpy as np\n", + "import pandas as pd" ] }, { @@ -37,11 +40,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "\n", + "from sklearn.datasets import load_diabetes\n", + "diabetes = load_diabetes()" ] }, { @@ -53,11 +59,24 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['data', 'target', 'frame', 'DESCR', 'feature_names', 'data_filename', 'target_filename', 'data_module'])" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "\n", + "diabetes.keys()" ] }, { @@ -73,13 +92,61 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { "scrolled": false }, - "outputs": [], - "source": [ - "# Your code here:\n" + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ".. 
_diabetes_dataset:\n", + "\n", + "Diabetes dataset\n", + "----------------\n", + "\n", + "Ten baseline variables, age, sex, body mass index, average blood\n", + "pressure, and six blood serum measurements were obtained for each of n =\n", + "442 diabetes patients, as well as the response of interest, a\n", + "quantitative measure of disease progression one year after baseline.\n", + "\n", + "**Data Set Characteristics:**\n", + "\n", + " :Number of Instances: 442\n", + "\n", + " :Number of Attributes: First 10 columns are numeric predictive values\n", + "\n", + " :Target: Column 11 is a quantitative measure of disease progression one year after baseline\n", + "\n", + " :Attribute Information:\n", + " - age age in years\n", + " - sex\n", + " - bmi body mass index\n", + " - bp average blood pressure\n", + " - s1 tc, total serum cholesterol\n", + " - s2 ldl, low-density lipoproteins\n", + " - s3 hdl, high-density lipoproteins\n", + " - s4 tch, total cholesterol / HDL\n", + " - s5 ltg, possibly log of serum triglycerides level\n", + " - s6 glu, blood sugar level\n", + "\n", + "Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times the square root of `n_samples` (i.e. 
the sum of squares of each column totals 1).\n", + "\n", + "Source URL:\n", + "https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html\n", + "\n", + "For more information see:\n", + "Bradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) \"Least Angle Regression,\" Annals of Statistics (with discussion), 407-499.\n", + "(https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)\n", + "\n" + ] + } + ], + "source": [ + "# Your code here:\n", + "\n", + "print(diabetes[\"DESCR\"])" ] }, { @@ -97,11 +164,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ - "# Enter your answer here:\n" + "# Enter your answer here:\n", + "\n", + "# 10 attributes: age (in years), sex, bmi (body mass index) and bp (average blood pressure) are self-explanatory\n", + "# s1-s6 are six blood serum measurements used to predict diabetes progression; they are the total serum cholesterol (s1: tc),\n", + "# low-density (s2: ldl) and high-density (s3: hdl) lipoproteins, the total cholesterol / HDL ratio (s4: tch),\n", + "# and the blood sugar level (s6: glu); s5 (ltg) is possibly the log of the serum triglycerides level\n", + "\n", + "# diabetes[\"data\"] holds the 10 attributes (one column each), while diabetes[\"target\"] is the value we want to predict (a quantitative\n", + "# measure of the disease progression one year after baseline)\n", + "\n", + "# 442 records" ] }, { @@ -115,11 +192,23 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(442, 10)\n", + "(442,)\n" + ] + } + ], + "source": [ + "# Your code here:\n", + "\n", + "print(diabetes[\"data\"].shape)\n", + "print(diabetes[\"target\"].shape)" ] }, { @@ -156,11 +245,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": 
[], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "\n", + "from sklearn.linear_model import LinearRegression" ] }, { @@ -172,11 +263,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "\n", + "diabetes_model = LinearRegression()" ] }, { @@ -190,11 +283,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "diabetes_data_train, diabetes_data_test, diabetes_target_train, diabetes_target_test = train_test_split(diabetes[\"data\"], \n", + " diabetes[\"target\"], \n", + " shuffle=False, \n", + " test_size=20)" ] }, { @@ -206,11 +306,28 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "152.76429169049118\n", + "[ 3.06094248e-01 -2.37635570e+02 5.10538048e+02 3.27729878e+02\n", + " -8.14111926e+02 4.92799595e+02 1.02841240e+02 1.84603496e+02\n", + " 7.43509388e+02 7.60966464e+01]\n" + ] + } + ], + "source": [ + "# Your code here:\n", + "\n", + "diabetes_model.fit(diabetes_data_train, diabetes_target_train)\n", + "\n", + "print(diabetes_model.intercept_)\n", + "\n", + "print(diabetes_model.coef_)" ] }, { @@ -231,11 +348,29 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([197.61898486, 155.44031962, 172.88875144, 111.53270645,\n", + " 164.79397301, 131.06765869, 259.12441219, 100.47873746,\n", + " 117.06005372, 124.30261597, 218.36868146, 61.19581944,\n", + " 
132.24837933, 120.33293546, 52.54513009, 194.03746764,\n", + " 102.5756431 , 123.56778709, 211.03465323, 52.60221696])" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "\n", + "y_pred = diabetes_model.predict(diabetes_data_test)\n", + "y_pred" ] }, { @@ -247,11 +382,175 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
target_valuespredictions
0233.0197.618985
191.0155.440320
2111.0172.888751
3152.0111.532706
4120.0164.793973
567.0131.067659
6310.0259.124412
794.0100.478737
8183.0117.060054
966.0124.302616
10173.0218.368681
1172.061.195819
1249.0132.248379
1364.0120.332935
1448.052.545130
15178.0194.037468
16104.0102.575643
17132.0123.567787
18220.0211.034653
1957.052.602217
\n", + "
" + ], + "text/plain": [ + " target_values predictions\n", + "0 233.0 197.618985\n", + "1 91.0 155.440320\n", + "2 111.0 172.888751\n", + "3 152.0 111.532706\n", + "4 120.0 164.793973\n", + "5 67.0 131.067659\n", + "6 310.0 259.124412\n", + "7 94.0 100.478737\n", + "8 183.0 117.060054\n", + "9 66.0 124.302616\n", + "10 173.0 218.368681\n", + "11 72.0 61.195819\n", + "12 49.0 132.248379\n", + "13 64.0 120.332935\n", + "14 48.0 52.545130\n", + "15 178.0 194.037468\n", + "16 104.0 102.575643\n", + "17 132.0 123.567787\n", + "18 220.0 211.034653\n", + "19 57.0 52.602217" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "\n", + "X = np.array(diabetes_target_test)\n", + "Y = np.array(y_pred)\n", + "\n", + "pd.DataFrame({\"target_values\": X, \"predictions\": Y})" ] }, { @@ -263,11 +562,23 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your explanation here:\n" + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "accuracy test 0.585085427447195\n" + ] + } + ], + "source": [ + "# Your explanation here:\n", + "\n", + "print(\"accuracy test\", diabetes_model.score(diabetes_data_test, diabetes_target_test))\n", + "\n", + "# No it is not the same as the accuracy of the model is low (0.58)" ] }, { @@ -302,7 +613,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -326,7 +637,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -351,11 +662,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "\n", + "auto = pd.read_csv(\"../auto-mpg.csv\")" ] }, { @@ -367,11 +680,125 @@ }, { "cell_type": 
"code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mpgcylindersdisplacementhorse_powerweightaccelerationmodel_yearcar_name
018.08307.0130.0350412.070\\t\"chevrolet chevelle malibu\"
115.08350.0165.0369311.570\\t\"buick skylark 320\"
218.08318.0150.0343611.070\\t\"plymouth satellite\"
316.08304.0150.0343312.070\\t\"amc rebel sst\"
417.08302.0140.0344910.570\\t\"ford torino\"
\n", + "
" + ], + "text/plain": [ + " mpg cylinders displacement horse_power weight acceleration \\\n", + "0 18.0 8 307.0 130.0 3504 12.0 \n", + "1 15.0 8 350.0 165.0 3693 11.5 \n", + "2 18.0 8 318.0 150.0 3436 11.0 \n", + "3 16.0 8 304.0 150.0 3433 12.0 \n", + "4 17.0 8 302.0 140.0 3449 10.5 \n", + "\n", + " model_year car_name \n", + "0 70 \\t\"chevrolet chevelle malibu\" \n", + "1 70 \\t\"buick skylark 320\" \n", + "2 70 \\t\"plymouth satellite\" \n", + "3 70 \\t\"amc rebel sst\" \n", + "4 70 \\t\"ford torino\" " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "\n", + "auto.head()" ] }, { @@ -383,11 +810,32 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "mpg float64\n", + "cylinders int64\n", + "displacement float64\n", + "horse_power float64\n", + "weight int64\n", + "acceleration float64\n", + "model_year int64\n", + "car_name object\n", + "dtype: object" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "\n", + "auto.dtypes" ] }, { @@ -399,11 +847,23 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "70\n", + "82\n" + ] + } + ], + "source": [ + "# Your code here:\n", + "\n", + "print(auto.model_year.min())\n", + "print(auto.model_year.max())" ] }, { @@ -415,11 +875,41 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "mpg 0\n", + "cylinders 0\n", + "displacement 0\n", + "horse_power 6\n", + "weight 0\n", + "acceleration 0\n", + 
"model_year 0\n", + "car_name 0\n", + "dtype: int64" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "\n", + "auto.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "auto = auto.dropna()" ] }, { @@ -431,11 +921,38 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4 199\n", + "8 103\n", + "6 83\n", + "3 4\n", + "5 3\n", + "Name: cylinders, dtype: int64" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "\n", + "auto[\"cylinders\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# 5 values of cylinders (3, 4, 5, 6, 8)" ] }, { @@ -451,11 +968,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "\n", + "auto.drop(columns=[\"car_name\"], axis = 1, inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "features = auto.drop(columns=[\"mpg\"], axis=1)\n", + "labels = auto[\"mpg\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.20)" ] }, { @@ -469,11 +1007,28 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "LinearRegression()" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "\n", + "auto_model = LinearRegression()\n", + "auto_model.fit(X_train, y_train)" ] }, { @@ -493,11 +1048,27 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8105881998994597" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "\n", + "from sklearn.metrics import r2_score\n", + "\n", + "y_pred = auto_model.predict(X_train)\n", + "r2_score(y_train, y_pred) " ] }, { @@ -513,11 +1084,27 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.7914518914629628" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "\n", + "y_test_pred = auto_model.predict(X_test)\n", + "r2_score(y_test, y_test_pred)\n", + "\n", + "# why did i get different results from below?" ] }, { @@ -542,11 +1129,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "\n", + "X_train09, X_test09, y_train09, y_test09 = train_test_split(features, labels, test_size=0.10)" ] }, { @@ -558,11 +1147,29 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "LinearRegression()" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "\n", + "# Your code here:\n", + "auto_model09 = LinearRegression()\n", + "auto_model09.fit(X_train09, y_train09)" ] }, { @@ -574,11 +1181,27 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8090539562314758" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "\n", + "y_train_pred09 = auto_model09.predict(X_train09)\n", + "r_squared_train09 = r2_score(y_train09,y_train_pred09)\n", + "\n", + "r_squared_train09" ] }, { @@ -588,13 +1211,36 @@ "Compute the r squared score for the smaller test set. Is there an improvement in the test r squared?" ] }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8038984059621057" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "y_test_pred09 = auto_model09.predict(X_test09)\n", + "r_squared_test09 = r2_score(y_test09,y_test_pred09)\n", + "r_squared_test09" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "#No? Re-run -> got different results based on different sampling..." 
] }, { @@ -608,6 +1254,13 @@ "In the next cell, we will import RFE" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, @@ -703,7 +1356,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -717,7 +1370,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.10.10" } }, "nbformat": 4,