From 77b302483a50b516790925aefa150d0958d3e423 Mon Sep 17 00:00:00 2001 From: leonordraiblate Date: Mon, 6 Mar 2023 10:56:20 +0000 Subject: [PATCH] Done --- your-code/main.ipynb | 600 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 510 insertions(+), 90 deletions(-) diff --git a/your-code/main.ipynb b/your-code/main.ipynb index 8a9fa9e..b0c9f8b 100644 --- a/your-code/main.ipynb +++ b/your-code/main.ipynb @@ -12,11 +12,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ - "# Import your libraries:\n" + "# Import your libraries:\n", + "import pandas as pd\n", + "import numpy as np" ] }, { @@ -37,11 +39,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "from sklearn.datasets import load_diabetes\n", + "\n", + "diabetes = load_diabetes()" ] }, { @@ -53,11 +58,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['data', 'target', 'frame', 'DESCR', 'feature_names', 'data_filename', 'target_filename', 'data_module'])" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes.keys()" ] }, { @@ -73,13 +90,59 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { "scrolled": false }, - "outputs": [], - "source": [ - "# Your code here:\n" + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ".. 
_diabetes_dataset:\n", + "\n", + "Diabetes dataset\n", + "----------------\n", + "\n", + "Ten baseline variables, age, sex, body mass index, average blood\n", + "pressure, and six blood serum measurements were obtained for each of n =\n", + "442 diabetes patients, as well as the response of interest, a\n", + "quantitative measure of disease progression one year after baseline.\n", + "\n", + "**Data Set Characteristics:**\n", + "\n", + " :Number of Instances: 442\n", + "\n", + " :Number of Attributes: First 10 columns are numeric predictive values\n", + "\n", + " :Target: Column 11 is a quantitative measure of disease progression one year after baseline\n", + "\n", + " :Attribute Information:\n", + " - age age in years\n", + " - sex\n", + " - bmi body mass index\n", + " - bp average blood pressure\n", + " - s1 tc, total serum cholesterol\n", + " - s2 ldl, low-density lipoproteins\n", + " - s3 hdl, high-density lipoproteins\n", + " - s4 tch, total cholesterol / HDL\n", + " - s5 ltg, possibly log of serum triglycerides level\n", + " - s6 glu, blood sugar level\n", + "\n", + "Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. the sum of squares of each column totals 1).\n", + "\n", + "Source URL:\n", + "https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html\n", + "\n", + "For more information see:\n", + "Bradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) \"Least Angle Regression,\" Annals of Statistics (with discussion), 407-499.\n", + "(https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)\n" + ] + } + ], + "source": [ + "# Your code here:\n", + "print(diabetes[\"DESCR\"])" ] }, { @@ -96,12 +159,23 @@ ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "# Enter your answer here:\n" + "# Enter your answer here:\n", + "* How many attributes are there in the data? What do they mean? 
\n", + " * age (age in years), \n", + " * sex, \n", + " * bmi (body mass index)\n", + " * bp average blood pressure\n", + " * s1 tc, total serum cholesterol\n", + " * s2 ldl, low-density lipoproteins\n", + " * s3 hdl, high-density lipoproteins\n", + " * s4 tch, total cholesterol / HDL\n", + " * s5 ltg, possibly log of serum triglycerides level\n", + " * s6 glu, blood sugar level\n", + "* What is the relation between diabetes['data'] and diabetes['target']? diabetes['data'] holds the predictor features that are used to predict diabetes['target'], the measure of disease progression one year after baseline\n", + "* How many records are there in the data? 442" ] }, { @@ -115,11 +189,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(442, 10)\n", + "(442,)\n" + ] + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "print(diabetes[\"data\"].shape)\n", + "print(diabetes[\"target\"].shape)" ] }, { @@ -156,11 +241,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "from sklearn.linear_model import LinearRegression" ] }, { @@ -172,11 +258,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes_model = LinearRegression()" ] }, { @@ -190,11 +277,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "features = diabetes[\"data\"]\n", + "target = diabetes[\"target\"]\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(features,target, random_state=0, test_size=0.2)" ] }, { @@ -206,11 +299,25 @@ }, { "cell_type": "code", - "execution_count": null, - 
"metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "152.5381335195406\n", + "[ -35.55683674 -243.1692265 562.75404632 305.47203008 -662.78772128\n", + " 324.27527477 24.78193291 170.33056502 731.67810787 43.02846824]\n" + ] + } + ], + "source": [ + "# Your code here:\n", + "diabetes_model=diabetes_model.fit(X_train,y_train)\n", + "\n", + "print(diabetes_model.intercept_)\n", + "print(diabetes_model.coef_)" ] }, { @@ -231,11 +338,46 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([238.47145247, 248.93170646, 164.05404165, 120.30794355,\n", + " 187.42422054, 259.04865002, 113.55556372, 188.07597044,\n", + " 149.49663441, 236.01099949, 172.24629506, 178.88073764,\n", + " 109.15751983, 92.13508975, 243.33042043, 87.356971 ,\n", + " 155.72606406, 66.99073989, 100.42610442, 218.09422877,\n", + " 196.66287912, 161.29832968, 161.70779605, 156.52520454,\n", + " 197.88796516, 167.57984206, 120.74478913, 84.83879727,\n", + " 192.03728687, 160.60687024, 175.17178362, 84.22833237,\n", + " 145.7995542 , 145.97333493, 140.96488953, 197.00421108,\n", + " 165.94322494, 190.65906468, 128.22520508, 206.41941223,\n", + " 84.35851196, 164.0256504 , 144.1056776 , 184.68355549,\n", + " 177.80238966, 74.32855231, 143.3660286 , 138.67726085,\n", + " 120.81146113, 234.34252077, 161.94390244, 74.5455476 ,\n", + " 154.71905074, 156.78884927, 237.42227096, 174.23053048,\n", + " 190.88212635, 118.98373473, 132.20418974, 168.52674824,\n", + " 214.74245466, 171.42364091, 157.37409906, 108.86927343,\n", + " 257.06329636, 152.17777143, 82.43686464, 231.56746032,\n", + " 202.90641336, 47.18340199, 78.46954525, 129.30170908,\n", + " 104.60253144, 144.65200281, 132.27974254, 
190.04134164,\n", + " 97.55541138, 197.51891007, 219.13709291, 186.13797012,\n", + " 149.60913007, 208.42379455, 44.59036026, 206.20925368,\n", + " 76.77377721, 94.94046865, 145.2955051 , 194.03776373,\n", + " 132.78534336])" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "predict=diabetes_model.predict(X_test)\n", + "predict" ] }, { @@ -247,11 +389,31 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([321., 215., 127., 64., 175., 275., 179., 232., 142., 99., 252.,\n", + " 174., 129., 74., 264., 49., 86., 75., 101., 155., 170., 276.,\n", + " 110., 136., 68., 128., 103., 93., 191., 196., 217., 181., 168.,\n", + " 200., 219., 281., 151., 257., 49., 198., 96., 179., 95., 198.,\n", + " 244., 89., 214., 182., 84., 270., 156., 138., 113., 131., 195.,\n", + " 171., 122., 61., 230., 235., 52., 121., 144., 107., 132., 302.,\n", + " 53., 317., 137., 57., 98., 170., 88., 90., 67., 163., 104.,\n", + " 186., 180., 283., 141., 150., 47., 297., 104., 49., 103., 142.,\n", + " 59.])" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "y_test" ] }, { @@ -263,11 +425,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "They aren't the same\n" + ] + } + ], "source": [ - "# Your explanation here:\n" + "# Your explanation here:\n", + "print(\"They aren't the same\")" ] }, { @@ -302,7 +473,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -326,7 +497,7 @@ }, { "cell_type": "code", - "execution_count": null, + 
"execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -351,11 +522,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto = pd.read_csv(\"../auto-mpg.csv\")" ] }, { @@ -367,11 +539,124 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mpgcylindersdisplacementhorse_powerweightaccelerationmodel_yearcar_name
018.08307.0130.0350412.070\\t\"chevrolet chevelle malibu\"
115.08350.0165.0369311.570\\t\"buick skylark 320\"
218.08318.0150.0343611.070\\t\"plymouth satellite\"
316.08304.0150.0343312.070\\t\"amc rebel sst\"
417.08302.0140.0344910.570\\t\"ford torino\"
\n", + "
" + ], + "text/plain": [ + " mpg cylinders displacement horse_power weight acceleration \\\n", + "0 18.0 8 307.0 130.0 3504 12.0 \n", + "1 15.0 8 350.0 165.0 3693 11.5 \n", + "2 18.0 8 318.0 150.0 3436 11.0 \n", + "3 16.0 8 304.0 150.0 3433 12.0 \n", + "4 17.0 8 302.0 140.0 3449 10.5 \n", + "\n", + " model_year car_name \n", + "0 70 \\t\"chevrolet chevelle malibu\" \n", + "1 70 \\t\"buick skylark 320\" \n", + "2 70 \\t\"plymouth satellite\" \n", + "3 70 \\t\"amc rebel sst\" \n", + "4 70 \\t\"ford torino\" " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "auto.head(5)" ] }, { @@ -383,11 +668,31 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "mpg float64\n", + "cylinders int64\n", + "displacement float64\n", + "horse_power float64\n", + "weight int64\n", + "acceleration float64\n", + "model_year int64\n", + "car_name object\n", + "dtype: object" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "auto.dtypes " ] }, { @@ -399,11 +704,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "wrong_type_columns=[\"mpg\",\"displacement\",\"horse_power\",\"acceleration\"]\n", + "for i in wrong_type_columns:\n", + " auto[i]=pd.to_numeric(auto[i], errors='coerce')" ] }, { @@ -415,11 +723,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto.dropna(inplace=True)" ] }, { @@ -431,11 +740,28 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - 
"source": [ - "# Your code here:\n" + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4 199\n", + "8 103\n", + "6 83\n", + "3 4\n", + "5 3\n", + "Name: cylinders, dtype: int64" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "auto[\"cylinders\"].value_counts()" ] }, { @@ -451,11 +777,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto.drop(columns=[\"car_name\"],inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "features= auto.drop(columns=[\"mpg\"])\n", + "target = auto[\"mpg\"]\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=0, test_size=0.2)" ] }, { @@ -469,11 +808,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "LinearRegression()" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto_model=LinearRegression()\n", + "auto_model.fit(X_train,y_train)" ] }, { @@ -493,11 +845,26 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "R squared score: 0.8088490656511089\n" + ] + } + ], + "source": [ + "# Your code here:\n", + "from sklearn.metrics import r2_score\n", + "\n", + "y_pred = auto_model.predict(X_train)\n", + "\n", + "# calculate the r squared score between y_pred and y_train\n", + "r_squared = r2_score(y_train, y_pred)\n", + "print(\"R squared 
score:\", r_squared)" ] }, { @@ -513,11 +880,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "R squared score: 0.8088938602131773\n" + ] + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "y_test_pred = auto_model.predict(X_test)\n", + "r_squared = r2_score(y_test, y_test_pred)\n", + "print(\"R squared score:\", r_squared)" ] }, { @@ -542,11 +920,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "X_train09, X_test09, y_train09, y_test09 = train_test_split(features, target, random_state=0, test_size=0.2)" ] }, { @@ -558,11 +937,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto_model09= LinearRegression()\n", + "auto_model09=auto_model09.fit(X_train09,y_train09)" ] }, { @@ -574,11 +955,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "R squared score: 0.8088490656511089\n" + ] + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "pred_train= auto_model09.predict(X_train)\n", + "r_squared = r2_score(y_train09, pred_train)\n", + "print(\"R squared score:\", r_squared)" ] }, { @@ -590,11 +982,39 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "R squared score: 0.8088938602131773\n" + ] + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "pred_test= auto_model09.predict(X_test)\n", + "r_squared = r2_score(y_test09, 
pred_test)\n", + "print(\"R squared score:\", r_squared)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The improvement is very minor\n" + ] + } + ], + "source": [ + "print(\"The improvement is very minor\")" ] }, { @@ -703,7 +1123,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -717,7 +1137,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.9.13" } }, "nbformat": 4,