From 6fea4c94182b3610b332199f2e3a9da2451b4927 Mon Sep 17 00:00:00 2001 From: Wilmailys Date: Wed, 13 Oct 2021 16:19:12 -0400 Subject: [PATCH 1/2] Add files via upload --- your-code/main.ipynb | 724 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 613 insertions(+), 111 deletions(-) diff --git a/your-code/main.ipynb b/your-code/main.ipynb index 0102ef9..63c139b 100755 --- a/your-code/main.ipynb +++ b/your-code/main.ipynb @@ -12,11 +12,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ - "# Import your libraries:\n" + "# Import your libraries:\n", + "import pandas as pd\n", + "from sklearn.datasets import load_diabetes\n", + "from sklearn.model_selection import train_test_split" ] }, { @@ -37,11 +40,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes=load_diabetes()" ] }, { @@ -53,11 +57,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['data', 'target', 'frame', 'DESCR', 'feature_names', 'data_filename', 'target_filename'])" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes.keys()" ] }, { @@ -73,13 +89,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { "scrolled": false }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'.. _diabetes_dataset:\\n\\nDiabetes dataset\\n----------------\\n\\nTen baseline variables, age, sex, body mass index, average blood\\npressure, and six blood serum measurements were obtained for each of n =\\n442 diabetes patients, as well as the response of interest, a\\nquantitative measure of disease progression one year after baseline.\\n\\n**Data Set Characteristics:**\\n\\n :Number of Instances: 442\\n\\n :Number of Attributes: First 10 columns are numeric predictive values\\n\\n :Target: Column 11 is a quantitative measure of disease progression one year after baseline\\n\\n :Attribute Information:\\n - age age in years\\n - sex\\n - bmi body mass index\\n - bp average blood pressure\\n - s1 tc, T-Cells (a type of white blood cells)\\n - s2 ldl, low-density lipoproteins\\n - s3 hdl, high-density lipoproteins\\n - s4 tch, thyroid stimulating hormone\\n - s5 ltg, lamotrigine\\n - s6 glu, blood sugar level\\n\\nNote: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. the sum of squares of each column totals 1).\\n\\nSource URL:\\nhttps://www4.stat.ncsu.edu/~boos/var.select/diabetes.html\\n\\nFor more information see:\\nBradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) \"Least Angle Regression,\" Annals of Statistics (with discussion), 407-499.\\n(https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)'" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes.DESCR" ] }, { @@ -115,11 +143,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tiene 442 filas y 10 variables: (442, 10)\n", + "Tiene 442 filas y 1 variable: (442,)\n" + ] + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "# Enter your answer here:\n", + "print('Tiene 442 filas y 10 variables: ',diabetes.data.shape)\n", + "print ('Tiene 442 filas y 1 variable:',diabetes.target.shape)" ] }, { @@ -156,11 +196,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "from sklearn.linear_model import LinearRegression# Your code here:\n" ] }, { @@ -172,11 +213,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes_model=LinearRegression()" ] }, { @@ -190,11 +232,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes_data_train=diabetes[\"data\"][:-20]\n", + "diabetes_target_train=diabetes[\"target\"][:-20]\n", + "diabetes_data_test=diabetes[\"data\"][-20:]\n", + "diabetes_target_test=diabetes[\"target\"][-20:]" ] }, { @@ -206,11 +252,27 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Intercept: 152.76430691633442\n", + "\n", + "Coef : [ 3.03499549e-01 -2.37639315e+02 5.10530605e+02 3.27736980e+02\n", + " -8.14131709e+02 4.92814588e+02 1.02848452e+02 1.84606489e+02\n", + " 7.43519617e+02 7.60951722e+01]\n" + ] + } + ], + "source": [ + "# Your code here:\n", + "diabetes_model.fit(diabetes_data_train, diabetes_target_train)\n", + "print('Intercept: {}'.format(diabetes_model.intercept_))\n", + "print ('')\n", + "print('Coef : {}'.format(diabetes_model.coef_))" ] }, { @@ -231,11 +293,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "pred=diabetes_model.predict(diabetes_data_test)" ] }, { @@ -247,11 +310,36 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[233. 91. 111. 152. 120. 67. 310. 94. 183. 66. 173. 72. 49. 64.\n", + " 48. 178. 104. 132. 220. 57.]\n" + ] + }, + { + "data": { + "text/plain": [ + "array([-0.1790396 , 0.41456433, 0.35796084, -0.36279636, 0.27184708,\n", + " 0.48882101, -0.19634592, 0.06448441, -0.56329947, 0.46904806,\n", + " 0.20775331, -0.17650302, 0.62949091, 0.46814386, 0.08649011,\n", + " 0.08265382, -0.01392789, -0.06825459, -0.04248292, -0.08358104])" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "print(diabetes_target_test)\n", + "\n", + "(diabetes_target_test-pred)/-pred" ] }, { @@ -263,11 +351,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ - "# Your explanation here:\n" + "# Your explanation here:\n", + "#No.\n", + "#Hay variaciones de más del 50%." ] }, { @@ -302,11 +392,60 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " OLS Regression Results \n", + "==============================================================================\n", + "Dep. Variable: y R-squared: 0.512\n", + "Model: OLS Adj. R-squared: 0.500\n", + "Method: Least Squares F-statistic: 43.16\n", + "Date: Mon, 11 Oct 2021 Prob (F-statistic): 4.64e-58\n", + "Time: 10:34:49 Log-Likelihood: -2281.1\n", + "No. Observations: 422 AIC: 4584.\n", + "Df Residuals: 411 BIC: 4629.\n", + "Df Model: 10 \n", + "Covariance Type: nonrobust \n", + "==============================================================================\n", + " coef std err t P>|t| [0.025 0.975]\n", + "------------------------------------------------------------------------------\n", + "const 152.7643 2.658 57.469 0.000 147.539 157.990\n", + "x1 0.3035 61.286 0.005 0.996 -120.169 120.776\n", + "x2 -237.6393 62.837 -3.782 0.000 -361.162 -114.117\n", + "x3 510.5306 68.156 7.491 0.000 376.553 644.508\n", + "x4 327.7370 66.876 4.901 0.000 196.275 459.199\n", + "x5 -814.1317 424.044 -1.920 0.056 -1647.697 19.434\n", + "x6 492.8146 344.227 1.432 0.153 -183.850 1169.480\n", + "x7 102.8485 219.463 0.469 0.640 -328.561 534.258\n", + "x8 184.6065 167.336 1.103 0.271 -144.334 513.547\n", + "x9 743.5196 175.359 4.240 0.000 398.807 1088.232\n", + "x10 76.0952 68.293 1.114 0.266 -58.152 210.343\n", + "==============================================================================\n", + "Omnibus: 1.544 Durbin-Watson: 2.026\n", + "Prob(Omnibus): 0.462 Jarque-Bera (JB): 1.421\n", + "Skew: 0.004 Prob(JB): 0.491\n", + "Kurtosis: 2.716 Cond. No. 224.\n", + "==============================================================================\n", + "\n", + "Notes:\n", + "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n" + ] + } + ], + "source": [ + "# Your code here:\n", + "import statsmodels.api as sm\n", + "\n", + "diabetes_data_train=sm.add_constant(diabetes_data_train)\n", + "modelo=sm.OLS(diabetes_target_train, diabetes_data_train)\n", + "\n", + "res=modelo.fit()\n", + "\n", + "print(res.summary())" ] }, { @@ -324,15 +463,6 @@ "1. How will you modify your linear reguression model according to the test results above?" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your answers here:" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -351,11 +481,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "# Your code here:\n", + "auto=pd.read_csv('../auto-mpg.csv')" ] }, { @@ -367,11 +499,124 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mpgcylindersdisplacementhorse_powerweightaccelerationmodel_yearcar_name
018.08307.0130.0350412.070\\t\"chevrolet chevelle malibu\"
115.08350.0165.0369311.570\\t\"buick skylark 320\"
218.08318.0150.0343611.070\\t\"plymouth satellite\"
316.08304.0150.0343312.070\\t\"amc rebel sst\"
417.08302.0140.0344910.570\\t\"ford torino\"
\n", + "
" + ], + "text/plain": [ + " mpg cylinders displacement horse_power weight acceleration \\\n", + "0 18.0 8 307.0 130.0 3504 12.0 \n", + "1 15.0 8 350.0 165.0 3693 11.5 \n", + "2 18.0 8 318.0 150.0 3436 11.0 \n", + "3 16.0 8 304.0 150.0 3433 12.0 \n", + "4 17.0 8 302.0 140.0 3449 10.5 \n", + "\n", + " model_year car_name \n", + "0 70 \\t\"chevrolet chevelle malibu\" \n", + "1 70 \\t\"buick skylark 320\" \n", + "2 70 \\t\"plymouth satellite\" \n", + "3 70 \\t\"amc rebel sst\" \n", + "4 70 \\t\"ford torino\" " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "auto.head()" ] }, { @@ -387,7 +632,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto.info()" ] }, { @@ -399,11 +645,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Year of oldest model : 82\n", + "Year of newest model : 70\n" + ] + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "print('Year of oldest model : {}'.format(auto.model_year.max()))\n", + "print('Year of newest model : {}'.format(auto.model_year.min()))" ] }, { @@ -415,11 +672,39 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mpg 398\n", + "cylinders 398\n", + "displacement 398\n", + "horse_power 398\n", + "weight 398\n", + "acceleration 398\n", + "model_year 398\n", + "car_name 398\n", + "dtype: int64\n", + "mpg 392\n", + "cylinders 392\n", + "displacement 392\n", + "horse_power 392\n", + "weight 392\n", + "acceleration 392\n", + "model_year 392\n", + "car_name 392\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "# Your code here:\n", + "print(auto.isnull().count())\n", + "auto=auto.dropna()\n", + "print(auto.isnull().count())" ] }, { @@ -431,11 +716,28 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4 199\n", + "8 103\n", + "6 83\n", + "3 4\n", + "5 3\n", + "Name: cylinders, dtype: int64" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "auto.cylinders.value_counts()" ] }, { @@ -451,11 +753,122 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mpgcylindersdisplacementhorse_powerweightaccelerationmodel_year
018.08307.0130.0350412.070
115.08350.0165.0369311.570
218.08318.0150.0343611.070
316.08304.0150.0343312.070
417.08302.0140.0344910.570
\n", + "
" + ], + "text/plain": [ + " mpg cylinders displacement horse_power weight acceleration \\\n", + "0 18.0 8 307.0 130.0 3504 12.0 \n", + "1 15.0 8 350.0 165.0 3693 11.5 \n", + "2 18.0 8 318.0 150.0 3436 11.0 \n", + "3 16.0 8 304.0 150.0 3433 12.0 \n", + "4 17.0 8 302.0 140.0 3449 10.5 \n", + "\n", + " model_year \n", + "0 70 \n", + "1 70 \n", + "2 70 \n", + "3 70 \n", + "4 70 " + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "auto=auto.drop(columns=['car_name'])\n", + "X=auto.drop(columns=['mpg'])\n", + "y=auto.mpg\n", + "X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2)\n", + "auto.head()" ] }, { @@ -469,11 +882,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto_model=LinearRegression().fit(X_train, y_train)" ] }, { @@ -502,11 +916,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.817273600467024" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "from sklearn.metrics import r2_score\n", + "y_pred=auto_model.predict(X_train)\n", + "r2_score(y_train, y_pred)\n" ] }, { @@ -522,11 +950,25 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.7716702707284335" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "y_test_pred=auto_model.predict(X_test)\n", + "\n", + "r2_score(y_test, y_test_pred)" ] }, { @@ -551,11 +993,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "X_train09, X_test09, y_train09, y_test09=train_test_split(X, y, test_size=0.1)\n" ] }, { @@ -567,11 +1010,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto_model09=LinearRegression().fit(X_train09, y_train09)" ] }, { @@ -583,11 +1027,25 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8024474177235424" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "y_pred09=auto_model.predict(X_train09)\n", + "\n", + "r2_score(y_train09, y_pred09)" ] }, { @@ -599,11 +1057,25 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.854889886423715" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "y_test_pred09=auto_model.predict(X_test09)\n", + "\n", + "r2_score(y_test09, y_test_pred09)" ] }, { @@ -619,7 +1091,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -635,11 +1107,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "selector=RFE(auto_model, n_features_to_select=3, step=1)" ] }, { @@ -651,11 +1124,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1 3 4 2 1 1]\n" + ] + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "selector.fit(X_train, y_train)\n", + "print(selector.ranking_)" ] }, { @@ -669,11 +1152,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "X_reduced = X[['cylinders','acceleration','model_year']]\n", + "X_train_reduced, X_test_reduced, y_train_reduced, y_test_reduced=train_test_split(X_reduced, y, test_size=0.2)" ] }, { @@ -685,11 +1170,28 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here: \n" + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.7170557766576493\n", + "0.7053771503093262\n" + ] + } + ], + "source": [ + "# Your code here: \n", + "auto_model_reduced=LinearRegression()\n", + "auto_model_reduced.fit(X_train_reduced, y_train_reduced)\n", + "\n", + "y_pred_reduced=auto_model_reduced.predict(X_train_reduced)\n", + "print(r2_score(y_train_reduced, y_pred_reduced))\n", + "\n", + "y_test_pred_reduced=auto_model_reduced.predict(X_test_reduced)\n", + "print(r2_score(y_test_reduced, y_test_pred_reduced))" ] }, { @@ -726,7 +1228,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.6" + "version": "3.8.8" } }, "nbformat": 4, From db62c90926538b4f99e8a5c6fca373cc10a4c5be Mon Sep 17 00:00:00 2001 From: Wilmailys Date: Wed, 13 Oct 2021 16:33:08 -0400 Subject: [PATCH 2/2] Add files via upload --- your-code/main.ipynb | 651 ++++++++++++------------------------------- 1 file changed, 185 insertions(+), 466 deletions(-) diff --git a/your-code/main.ipynb b/your-code/main.ipynb index 63c139b..2f8c09e 100755 --- a/your-code/main.ipynb +++ b/your-code/main.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -40,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -57,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -66,7 +66,7 @@ "dict_keys(['data', 'target', 'frame', 'DESCR', 'feature_names', 'data_filename', 'target_filename'])" ] }, - "execution_count": 3, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -89,7 +89,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": { "scrolled": false }, @@ -100,7 +100,7 @@ "'.. _diabetes_dataset:\\n\\nDiabetes dataset\\n----------------\\n\\nTen baseline variables, age, sex, body mass index, average blood\\npressure, and six blood serum measurements were obtained for each of n =\\n442 diabetes patients, as well as the response of interest, a\\nquantitative measure of disease progression one year after baseline.\\n\\n**Data Set Characteristics:**\\n\\n :Number of Instances: 442\\n\\n :Number of Attributes: First 10 columns are numeric predictive values\\n\\n :Target: Column 11 is a quantitative measure of disease progression one year after baseline\\n\\n :Attribute Information:\\n - age age in years\\n - sex\\n - bmi body mass index\\n - bp average blood pressure\\n - s1 tc, T-Cells (a type of white blood cells)\\n - s2 ldl, low-density lipoproteins\\n - s3 hdl, high-density lipoproteins\\n - s4 tch, thyroid stimulating hormone\\n - s5 ltg, lamotrigine\\n - s6 glu, blood sugar level\\n\\nNote: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. the sum of squares of each column totals 1).\\n\\nSource URL:\\nhttps://www4.stat.ncsu.edu/~boos/var.select/diabetes.html\\n\\nFor more information see:\\nBradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) \"Least Angle Regression,\" Annals of Statistics (with discussion), 407-499.\\n(https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)'" ] }, - "execution_count": 4, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -125,11 +125,40 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((442, 10), (442,))" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Enter your answer here:\n", + "diabetes['data'].shape, diabetes['target'].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ - "# Enter your answer here:\n" + "# Tenemos 10 columnas en nuestros datos. Todas las columnas tienen datos numericos.\n", + "\n", + "#Tenemos:\n", + "#-la edad del paciente.\n", + "#-Sexo.\n", + "#-Masa corporal. \n", + "#-Presion sanguinea.\n", + "#-Mediciones de suero sanguíneo.\n", + "#Un total de 10 columnas y 442 registros." ] }, { @@ -143,7 +172,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -157,7 +186,6 @@ ], "source": [ "# Your code here:\n", - "# Enter your answer here:\n", "print('Tiene 442 filas y 10 variables: ',diabetes.data.shape)\n", "print ('Tiene 442 filas y 1 variable:',diabetes.target.shape)" ] @@ -196,12 +224,12 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# Your code here:\n", - "from sklearn.linear_model import LinearRegression# Your code here:\n" + "from sklearn.linear_model import LinearRegression" ] }, { @@ -213,7 +241,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -232,7 +260,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -252,7 +280,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -293,12 +321,28 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([197.61846908, 155.43979328, 172.88665147, 111.53537279,\n", + " 164.80054784, 131.06954875, 259.12237761, 100.47935157,\n", + " 117.0601052 , 124.30503555, 218.36632793, 61.19831284,\n", + " 132.25046751, 120.3332925 , 52.54458691, 194.03798088,\n", + " 102.57139702, 123.56604987, 211.0346317 , 52.60335674])" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Your code here:\n", - "pred=diabetes_model.predict(diabetes_data_test)" + "y_pred = diabetes_model.predict(diabetes_data_test)\n", + "y_pred" ] }, { @@ -310,36 +354,45 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[233. 91. 111. 152. 120. 67. 310. 94. 183. 66. 173. 72. 49. 64.\n", - " 48. 178. 104. 132. 220. 57.]\n" - ] - }, { "data": { "text/plain": [ - "array([-0.1790396 , 0.41456433, 0.35796084, -0.36279636, 0.27184708,\n", - " 0.48882101, -0.19634592, 0.06448441, -0.56329947, 0.46904806,\n", - " 0.20775331, -0.17650302, 0.62949091, 0.46814386, 0.08649011,\n", - " 0.08265382, -0.01392789, -0.06825459, -0.04248292, -0.08358104])" + "array([233., 91., 111., 152., 120., 67., 310., 94., 183., 66., 173.,\n", + " 72., 49., 64., 48., 178., 104., 132., 220., 57.])" ] }, - "execution_count": 13, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Your code here:\n", - "print(diabetes_target_test)\n", - "\n", - "(diabetes_target_test-pred)/-pred" + "diabetes_target_test" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'r2_score' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mr2_score\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdiabetes_target_test\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_pred\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;31mNameError\u001b[0m: name 'r2_score' is not defined" + ] + } + ], + "source": [ + "r2_score(diabetes_target_test, y_pred)" ] }, { @@ -351,13 +404,13 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Your explanation here:\n", - "#No.\n", - "#Hay variaciones de más del 50%." + "#No, no es igual y eso se debe a que el modelo no es capaz de predecir correctamente el 100% de las variables.\n", + "#Este modelo no es el mas adecuado" ] }, { @@ -392,50 +445,9 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " OLS Regression Results \n", - "==============================================================================\n", - "Dep. Variable: y R-squared: 0.512\n", - "Model: OLS Adj. R-squared: 0.500\n", - "Method: Least Squares F-statistic: 43.16\n", - "Date: Mon, 11 Oct 2021 Prob (F-statistic): 4.64e-58\n", - "Time: 10:34:49 Log-Likelihood: -2281.1\n", - "No. Observations: 422 AIC: 4584.\n", - "Df Residuals: 411 BIC: 4629.\n", - "Df Model: 10 \n", - "Covariance Type: nonrobust \n", - "==============================================================================\n", - " coef std err t P>|t| [0.025 0.975]\n", - "------------------------------------------------------------------------------\n", - "const 152.7643 2.658 57.469 0.000 147.539 157.990\n", - "x1 0.3035 61.286 0.005 0.996 -120.169 120.776\n", - "x2 -237.6393 62.837 -3.782 0.000 -361.162 -114.117\n", - "x3 510.5306 68.156 7.491 0.000 376.553 644.508\n", - "x4 327.7370 66.876 4.901 0.000 196.275 459.199\n", - "x5 -814.1317 424.044 -1.920 0.056 -1647.697 19.434\n", - "x6 492.8146 344.227 1.432 0.153 -183.850 1169.480\n", - "x7 102.8485 219.463 0.469 0.640 -328.561 534.258\n", - "x8 184.6065 167.336 1.103 0.271 -144.334 513.547\n", - "x9 743.5196 175.359 4.240 0.000 398.807 1088.232\n", - "x10 76.0952 68.293 1.114 0.266 -58.152 210.343\n", - "==============================================================================\n", - "Omnibus: 1.544 Durbin-Watson: 2.026\n", - "Prob(Omnibus): 0.462 Jarque-Bera (JB): 1.421\n", - "Skew: 0.004 Prob(JB): 0.491\n", - "Kurtosis: 2.716 Cond. No. 224.\n", - "==============================================================================\n", - "\n", - "Notes:\n", - "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n" - ] - } - ], + "outputs": [], "source": [ "# Your code here:\n", "import statsmodels.api as sm\n", @@ -463,6 +475,16 @@ "1. How will you modify your linear reguression model according to the test results above?" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Your answers here:\n", + "#Podemos ver que el valor del p-value es menor a 0.05 podemos rechazar la hipotesis nula. " + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -481,13 +503,12 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Your code here:\n", - "# Your code here:\n", - "auto=pd.read_csv('../auto-mpg.csv')" + "auto = pd.read_csv(r'../auto-mpg.csv')" ] }, { @@ -499,124 +520,12 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
mpgcylindersdisplacementhorse_powerweightaccelerationmodel_yearcar_name
018.08307.0130.0350412.070\\t\"chevrolet chevelle malibu\"
115.08350.0165.0369311.570\\t\"buick skylark 320\"
218.08318.0150.0343611.070\\t\"plymouth satellite\"
316.08304.0150.0343312.070\\t\"amc rebel sst\"
417.08302.0140.0344910.570\\t\"ford torino\"
\n", - "
" - ], - "text/plain": [ - " mpg cylinders displacement horse_power weight acceleration \\\n", - "0 18.0 8 307.0 130.0 3504 12.0 \n", - "1 15.0 8 350.0 165.0 3693 11.5 \n", - "2 18.0 8 318.0 150.0 3436 11.0 \n", - "3 16.0 8 304.0 150.0 3433 12.0 \n", - "4 17.0 8 302.0 140.0 3449 10.5 \n", - "\n", - " model_year car_name \n", - "0 70 \\t\"chevrolet chevelle malibu\" \n", - "1 70 \\t\"buick skylark 320\" \n", - "2 70 \\t\"plymouth satellite\" \n", - "3 70 \\t\"amc rebel sst\" \n", - "4 70 \\t\"ford torino\" " - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Your code here:\n", - "auto.head()" + "auto.head(5)" ] }, { @@ -645,18 +554,9 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Year of oldest model : 82\n", - "Year of newest model : 70\n" - ] - } - ], + "outputs": [], "source": [ "# Your code here:\n", "print('Year of oldest model : {}'.format(auto.model_year.max()))\n", @@ -672,39 +572,24 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mpg 398\n", - "cylinders 398\n", - "displacement 398\n", - "horse_power 398\n", - "weight 398\n", - "acceleration 398\n", - "model_year 398\n", - "car_name 398\n", - "dtype: int64\n", - "mpg 392\n", - "cylinders 392\n", - "displacement 392\n", - "horse_power 392\n", - "weight 392\n", - "acceleration 392\n", - "model_year 392\n", - "car_name 392\n", - "dtype: int64\n" - ] - } - ], + "outputs": [], "source": [ "# Your code here:\n", - "print(auto.isnull().count())\n", - "auto=auto.dropna()\n", - "print(auto.isnull().count())" + "auto.isnull().sum()\n", + "\n", + "# Valores nulos en horse power que seran eliminados." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "auto.dropna(inplace=True)\n", + "auto.isnull().sum()" ] }, { @@ -716,28 +601,14 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "4 199\n", - "8 103\n", - "6 83\n", - "3 4\n", - "5 3\n", - "Name: cylinders, dtype: int64" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Your code here:\n", - "auto.cylinders.value_counts()" + "auto['cylinders'].value_counts()\n", + "\n", + "# Tenemos 5 valores posibles de cilindros" ] }, { @@ -753,122 +624,15 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
mpgcylindersdisplacementhorse_powerweightaccelerationmodel_year
018.08307.0130.0350412.070
115.08350.0165.0369311.570
218.08318.0150.0343611.070
316.08304.0150.0343312.070
417.08302.0140.0344910.570
\n", - "
" - ], - "text/plain": [ - " mpg cylinders displacement horse_power weight acceleration \\\n", - "0 18.0 8 307.0 130.0 3504 12.0 \n", - "1 15.0 8 350.0 165.0 3693 11.5 \n", - "2 18.0 8 318.0 150.0 3436 11.0 \n", - "3 16.0 8 304.0 150.0 3433 12.0 \n", - "4 17.0 8 302.0 140.0 3449 10.5 \n", - "\n", - " model_year \n", - "0 70 \n", - "1 70 \n", - "2 70 \n", - "3 70 \n", - "4 70 " - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Your code here:\n", - "auto=auto.drop(columns=['car_name'])\n", - "X=auto.drop(columns=['mpg'])\n", - "y=auto.mpg\n", - "X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2)\n", - "auto.head()" + "x = auto.drop(columns=['mpg', 'car_name'])\n", + "y = auto['mpg']\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)" ] }, { @@ -882,7 +646,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -916,25 +680,15 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.817273600467024" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "# Your code here:\n", "from sklearn.metrics import r2_score\n", + "\n", + "# Your code here:\n", "y_pred=auto_model.predict(X_train)\n", - "r2_score(y_train, y_pred)\n" + "r2_score(y_train, y_pred)" ] }, { @@ -950,24 +704,12 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.7716702707284335" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Your code here:\n", - "y_test_pred=auto_model.predict(X_test)\n", - "\n", + "y_test_pred = auto_model.predict(X_test)\n", "r2_score(y_test, y_test_pred)" ] }, @@ -993,12 +735,12 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Your code here:\n", - "X_train09, X_test09, y_train09, y_test09=train_test_split(X, y, test_size=0.1)\n" + "X_train09, X_test09, y_train09, y_test09=train_test_split(X, y, test_size=0.1)" ] }, { @@ -1010,7 +752,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1027,20 +769,9 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.8024474177235424" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Your code here:\n", "y_pred09=auto_model.predict(X_train09)\n", @@ -1057,20 +788,9 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.854889886423715" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Your code here:\n", "y_test_pred09=auto_model.predict(X_test09)\n", @@ -1091,7 +811,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1107,12 +827,12 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Your code here:\n", - "selector=RFE(auto_model, n_features_to_select=3, step=1)" + "selector = RFE(auto_model, n_features_to_select=3)" ] }, { @@ -1124,23 +844,32 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[1 3 4 2 1 1]\n" - ] - } - ], + "outputs": [], "source": [ "# Your code here:\n", - "selector.fit(X_train, y_train)\n", + "selector.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "print(selector.ranking_)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_train.columns" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1152,13 +881,15 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Your code here:\n", - "X_reduced = X[['cylinders','acceleration','model_year']]\n", - "X_train_reduced, X_test_reduced, y_train_reduced, y_test_reduced=train_test_split(X_reduced, y, test_size=0.2)" + "x_reduced = x[['cylinders', 'acceleration', 'model_year']]\n", + "X_train_reduced, X_test_reduced, y_train_reduced, y_test_reduced = train_test_split(x_reduced, y,\n", + " test_size=0.2,\n", + " random_state=42)" ] }, { @@ -1170,28 +901,16 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.7170557766576493\n", - "0.7053771503093262\n" - ] - } - ], + "outputs": [], "source": [ "# Your code here: \n", - "auto_model_reduced=LinearRegression()\n", + "auto_model_reduced = LinearRegression()\n", "auto_model_reduced.fit(X_train_reduced, y_train_reduced)\n", "\n", - "y_pred_reduced=auto_model_reduced.predict(X_train_reduced)\n", - "print(r2_score(y_train_reduced, y_pred_reduced))\n", - "\n", - "y_test_pred_reduced=auto_model_reduced.predict(X_test_reduced)\n", - "print(r2_score(y_test_reduced, y_test_pred_reduced))" + "y_pred_reduced = auto_model_reduced.predict(X_train_reduced)\n", + "r2_score(y_train_reduced, y_pred_reduced)" ] }, {