diff --git a/your-code/main.ipynb b/your-code/main.ipynb index 0102ef9..2f8c09e 100755 --- a/your-code/main.ipynb +++ b/your-code/main.ipynb @@ -12,11 +12,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ - "# Import your libraries:\n" + "# Import your libraries:\n", + "import pandas as pd\n", + "from sklearn.datasets import load_diabetes\n", + "from sklearn.model_selection import train_test_split" ] }, { @@ -37,11 +40,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes=load_diabetes()" ] }, { @@ -53,11 +57,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['data', 'target', 'frame', 'DESCR', 'feature_names', 'data_filename', 'target_filename'])" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes.keys()" ] }, { @@ -73,13 +89,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { "scrolled": false }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'.. 
_diabetes_dataset:\\n\\nDiabetes dataset\\n----------------\\n\\nTen baseline variables, age, sex, body mass index, average blood\\npressure, and six blood serum measurements were obtained for each of n =\\n442 diabetes patients, as well as the response of interest, a\\nquantitative measure of disease progression one year after baseline.\\n\\n**Data Set Characteristics:**\\n\\n :Number of Instances: 442\\n\\n :Number of Attributes: First 10 columns are numeric predictive values\\n\\n :Target: Column 11 is a quantitative measure of disease progression one year after baseline\\n\\n :Attribute Information:\\n - age age in years\\n - sex\\n - bmi body mass index\\n - bp average blood pressure\\n - s1 tc, T-Cells (a type of white blood cells)\\n - s2 ldl, low-density lipoproteins\\n - s3 hdl, high-density lipoproteins\\n - s4 tch, thyroid stimulating hormone\\n - s5 ltg, lamotrigine\\n - s6 glu, blood sugar level\\n\\nNote: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. 
the sum of squares of each column totals 1).\\n\\nSource URL:\\nhttps://www4.stat.ncsu.edu/~boos/var.select/diabetes.html\\n\\nFor more information see:\\nBradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) \"Least Angle Regression,\" Annals of Statistics (with discussion), 407-499.\\n(https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes.DESCR" ] }, { @@ -97,11 +125,40 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((442, 10), (442,))" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Enter your answer here:\n", + "diabetes['data'].shape, diabetes['target'].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ - "# Enter your answer here:\n" + "# Tenemos 10 columnas en nuestros datos. Todas las columnas tienen datos numericos.\n", + "\n", + "#Tenemos:\n", + "#-la edad del paciente.\n", + "#-Sexo.\n", + "#-Masa corporal. \n", + "#-Presion sanguinea.\n", + "#-Mediciones de suero sanguíneo.\n", + "#Un total de 10 columnas y 442 registros." 
] }, { @@ -115,11 +172,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tiene 442 filas y 10 variables: (442, 10)\n", + "Tiene 442 filas y 1 variable: (442,)\n" + ] + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "print('Tiene 442 filas y 10 variables: ',diabetes.data.shape)\n", + "print ('Tiene 442 filas y 1 variable:',diabetes.target.shape)" ] }, { @@ -156,11 +224,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "from sklearn.linear_model import LinearRegression" ] }, { @@ -172,11 +241,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes_model=LinearRegression()" ] }, { @@ -190,11 +260,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes_data_train=diabetes[\"data\"][:-20]\n", + "diabetes_target_train=diabetes[\"target\"][:-20]\n", + "diabetes_data_test=diabetes[\"data\"][-20:]\n", + "diabetes_target_test=diabetes[\"target\"][-20:]" ] }, { @@ -206,11 +280,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Intercept: 152.76430691633442\n", + "\n", + "Coef : [ 3.03499549e-01 -2.37639315e+02 5.10530605e+02 3.27736980e+02\n", + " -8.14131709e+02 4.92814588e+02 1.02848452e+02 1.84606489e+02\n", + " 7.43519617e+02 7.60951722e+01]\n" + ] + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes_model.fit(diabetes_data_train, 
diabetes_target_train)\n", + "print('Intercept: {}'.format(diabetes_model.intercept_))\n", + "print ('')\n", + "print('Coef : {}'.format(diabetes_model.coef_))" ] }, { @@ -231,11 +321,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([197.61846908, 155.43979328, 172.88665147, 111.53537279,\n", + " 164.80054784, 131.06954875, 259.12237761, 100.47935157,\n", + " 117.0601052 , 124.30503555, 218.36632793, 61.19831284,\n", + " 132.25046751, 120.3332925 , 52.54458691, 194.03798088,\n", + " 102.57139702, 123.56604987, 211.0346317 , 52.60335674])" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "y_pred = diabetes_model.predict(diabetes_data_test)\n", + "y_pred" ] }, { @@ -247,11 +354,45 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([233., 91., 111., 152., 120., 67., 310., 94., 183., 66., 173.,\n", + " 72., 49., 64., 48., 178., 104., 132., 220., 57.])" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes_target_test" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'r2_score' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mr2_score\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdiabetes_target_test\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0my_pred\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;31mNameError\u001b[0m: name 'r2_score' is not defined" + ] + } + ], + "source": [ + "diabetes_model.score(diabetes_data_test, diabetes_target_test)" ] }, { @@ -267,7 +408,9 @@ "metadata": {}, "outputs": [], "source": [ - "# Your explanation here:\n" + "# Your explanation here:\n", + "#No, they are not equal, because the model cannot predict 100% of the values correctly.\n", + "#This model is not the most suitable one" ] }, { @@ -306,7 +449,15 @@ "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "import statsmodels.api as sm\n", + "\n", + "diabetes_data_train_const=sm.add_constant(diabetes_data_train)\n", + "modelo=sm.OLS(diabetes_target_train, diabetes_data_train_const)\n", + "\n", + "res=modelo.fit()\n", + "\n", + "print(res.summary())" ] }, { @@ -330,7 +481,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Your answers here:" + "# Your answers here:\n", + "#The p-value is below 0.05, so we can reject the null hypothesis. 
" ] }, { @@ -355,7 +507,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto = pd.read_csv(r'../auto-mpg.csv')" ] }, { @@ -371,7 +524,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto.head(5)" ] }, { @@ -387,7 +541,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto.info()" ] }, { @@ -403,7 +558,9 @@ "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "print('Year of oldest model : {}'.format(auto.model_year.min()))\n", + "print('Year of newest model : {}'.format(auto.model_year.max()))" ] }, { @@ -419,7 +576,20 @@ "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto.isnull().sum()\n", + "\n", + "# Null values appear in horse_power; those rows are dropped in the next cell." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "auto.dropna(inplace=True)\n", + "auto.isnull().sum()" ] }, { @@ -435,7 +605,10 @@ "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto['cylinders'].value_counts()\n", + "\n", + "# There are 5 possible cylinder values" ] }, { @@ -455,7 +628,11 @@ "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "x = auto.drop(columns=['mpg', 'car_name'])\n", + "y = auto['mpg']\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)" ] }, { @@ -473,7 +650,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto_model=LinearRegression().fit(X_train, y_train)" ] }, { @@ -506,7 +684,11 @@ "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "from sklearn.metrics import r2_score\n", + "\n", + "# Your code here:\n", + "y_pred=auto_model.predict(X_train)\n", + "r2_score(y_train, 
y_pred)" ] }, { @@ -526,7 +708,9 @@ "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "y_test_pred = auto_model.predict(X_test)\n", + "r2_score(y_test, y_test_pred)" ] }, { @@ -555,7 +739,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "X_train09, X_test09, y_train09, y_test09=train_test_split(x, y, test_size=0.1, random_state=42)" ] }, { @@ -571,7 +756,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto_model09=LinearRegression().fit(X_train09, y_train09)" ] }, { @@ -587,7 +773,10 @@ "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "y_pred09=auto_model09.predict(X_train09)\n", + "\n", + "r2_score(y_train09, y_pred09)" ] }, { @@ -603,7 +792,10 @@ "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "y_test_pred09=auto_model09.predict(X_test09)\n", + "\n", + "r2_score(y_test09, y_test_pred09)" ] }, { @@ -639,7 +831,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "from sklearn.feature_selection import RFE\n", + "selector = RFE(auto_model, n_features_to_select=3)" ] }, { @@ -655,7 +848,26 @@ "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "selector.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(selector.ranking_)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_train.columns" ] }, { @@ -673,7 +885,11 @@ "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "x_reduced = x[['cylinders', 'acceleration', 'model_year']]\n", + "X_train_reduced, X_test_reduced, y_train_reduced, y_test_reduced = train_test_split(x_reduced, y,\n", + " test_size=0.2,\n", + " random_state=42)" ] }, { @@ -689,7 +905,12 @@ "metadata": {}, 
"outputs": [], "source": [ - "# Your code here: \n" + "# Your code here: \n", + "auto_model_reduced = LinearRegression()\n", + "auto_model_reduced.fit(X_train_reduced, y_train_reduced)\n", + "\n", + "y_pred_reduced = auto_model_reduced.predict(X_train_reduced)\n", + "r2_score(y_train_reduced, y_pred_reduced)" ] }, { @@ -726,7 +947,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.6" + "version": "3.8.8" } }, "nbformat": 4,