diff --git a/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb b/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb index 0102ef94..219ee3a5 100644 --- a/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb +++ b/module-3/lab-supervised-learning-sklearn/your-code/main.ipynb @@ -12,11 +12,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ - "# Import your libraries:\n" + "# Import your libraries:\n", + "import pandas as pd\n", + "from sklearn.datasets import load_diabetes" ] }, { @@ -37,11 +39,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes = load_diabetes(return_X_y=False)" ] }, { @@ -53,11 +56,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "dict_keys(['data', 'target', 'DESCR', 'feature_names', 'data_filename', 'target_filename'])\n" + ] + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "print(diabetes.keys())" ] }, { @@ -73,13 +85,59 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { "scrolled": false }, - "outputs": [], - "source": [ - "# Your code here:\n" + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ".. 
_diabetes_dataset:\n", + "\n", + "Diabetes dataset\n", + "----------------\n", + "\n", + "Ten baseline variables, age, sex, body mass index, average blood\n", + "pressure, and six blood serum measurements were obtained for each of n =\n", + "442 diabetes patients, as well as the response of interest, a\n", + "quantitative measure of disease progression one year after baseline.\n", + "\n", + "**Data Set Characteristics:**\n", + "\n", + " :Number of Instances: 442\n", + "\n", + " :Number of Attributes: First 10 columns are numeric predictive values\n", + "\n", + " :Target: Column 11 is a quantitative measure of disease progression one year after baseline\n", + "\n", + " :Attribute Information:\n", + " - Age\n", + " - Sex\n", + " - Body mass index\n", + " - Average blood pressure\n", + " - S1\n", + " - S2\n", + " - S3\n", + " - S4\n", + " - S5\n", + " - S6\n", + "\n", + "Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. the sum of squares of each column totals 1).\n", + "\n", + "Source URL:\n", + "https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html\n", + "\n", + "For more information see:\n", + "Bradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) \"Least Angle Regression,\" Annals of Statistics (with discussion), 407-499.\n", + "(https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)\n" + ] + } + ], + "source": [ + "# Your code here:\n", + "print(diabetes.DESCR)" ] }, { @@ -97,11 +155,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1. 10\n", + "2. diabetes[data] is the X, diabetes[target] is the y, for each diabetes[data] there is a diabetes[target]\n", + "3. 
442\n" + ] + } + ], "source": [ - "# Enter your answer here:\n" + "# Enter your answer here:\n", + "print(\"1.\", diabetes[\"data\"].shape[1])\n", + "print(\"2. diabetes[data] is the X, diabetes[target] is the y, for each diabetes[data] there is a diabetes[target]\")\n", + "print(\"3.\", len(diabetes[\"target\"]))" ] }, { @@ -115,11 +186,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(442, 10)\n", + "(442,)\n" + ] + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "print(diabetes['data'].shape)\n", + "print(diabetes['target'].shape)" ] }, { @@ -156,11 +238,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "from sklearn.linear_model import LinearRegression" ] }, { @@ -172,11 +255,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes_model = LinearRegression()" ] }, { @@ -190,11 +274,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes_data_train = diabetes['data'][0:-20,]\n", + "diabetes_target_train = diabetes['target'][0:-20,]\n", + "diabetes_data_test = diabetes['data'][-20:]\n", + "diabetes_target_test = diabetes['target'][-20:]" ] }, { @@ -206,11 +294,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "reg = diabetes_model.fit(diabetes_data_train,diabetes_target_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "Intercept: 152.76430691633442 \n", + " Coefficients: [ 3.03499549e-01 -2.37639315e+02 5.10530605e+02 3.27736980e+02\n", + " -8.14131709e+02 4.92814588e+02 1.02848452e+02 1.84606489e+02\n", + " 7.43519617e+02 7.60951722e+01]\n" + ] + } + ], + "source": [ + "print(f'Intercept: {reg.intercept_} \\n Coefficients: {reg.coef_}')" ] }, { @@ -231,11 +340,27 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([197.61846908, 155.43979328, 172.88665147, 111.53537279,\n", + " 164.80054784, 131.06954875, 259.12237761, 100.47935157,\n", + " 117.0601052 , 124.30503555, 218.36632793, 61.19831284,\n", + " 132.25046751, 120.3332925 , 52.54458691, 194.03798088,\n", + " 102.57139702, 123.56604987, 211.0346317 , 52.60335674])" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "reg.predict(diabetes_data_test)" ] }, { @@ -247,11 +372,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([233., 91., 111., 152., 120., 67., 310., 94., 183., 66., 173.,\n", + " 72., 49., 64., 48., 178., 104., 132., 220., 57.])" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "diabetes_target_test" ] }, { @@ -263,11 +401,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No because the data its not linear so it has an error\n" + ] + } + ], "source": [ - "# Your explanation here:\n" + "# Your explanation here:\n", + "print(\"No 
because the data its not linear so it has an error\")" ] }, { @@ -351,11 +498,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto = pd.read_csv('../auto-mpg.csv')" ] }, { @@ -367,11 +515,124 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mpgcylindersdisplacementhorse_powerweightaccelerationmodel_yearcar_name
018.08307.0130.0350412.070\\t\"chevrolet chevelle malibu\"
115.08350.0165.0369311.570\\t\"buick skylark 320\"
218.08318.0150.0343611.070\\t\"plymouth satellite\"
316.08304.0150.0343312.070\\t\"amc rebel sst\"
417.08302.0140.0344910.570\\t\"ford torino\"
\n", + "
" + ], + "text/plain": [ + " mpg cylinders displacement horse_power weight acceleration \\\n", + "0 18.0 8 307.0 130.0 3504 12.0 \n", + "1 15.0 8 350.0 165.0 3693 11.5 \n", + "2 18.0 8 318.0 150.0 3436 11.0 \n", + "3 16.0 8 304.0 150.0 3433 12.0 \n", + "4 17.0 8 302.0 140.0 3449 10.5 \n", + "\n", + " model_year car_name \n", + "0 70 \\t\"chevrolet chevelle malibu\" \n", + "1 70 \\t\"buick skylark 320\" \n", + "2 70 \\t\"plymouth satellite\" \n", + "3 70 \\t\"amc rebel sst\" \n", + "4 70 \\t\"ford torino\" " + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "auto.head()" ] }, { @@ -383,11 +644,32 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 398 entries, 0 to 397\n", + "Data columns (total 8 columns):\n", + "mpg 398 non-null float64\n", + "cylinders 398 non-null int64\n", + "displacement 398 non-null float64\n", + "horse_power 392 non-null float64\n", + "weight 398 non-null int64\n", + "acceleration 398 non-null float64\n", + "model_year 398 non-null int64\n", + "car_name 398 non-null object\n", + "dtypes: float64(4), int64(3), object(1)\n", + "memory usage: 25.0+ KB\n" + ] + } + ], + "source": [ + "# Your code here:\n", + "auto.info()" ] }, { @@ -399,11 +681,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 46, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Oldest: 70\n", + "Newest: 82\n" + ] + } + ], "source": [ - "# Your code here:\n" + "# Your code here: a smaller model_year means an older car\n", + "print(\"Oldest: \", auto['model_year'].min())\n", + "print(\"Newest: \", auto['model_year'].max())" ] }, { @@ 
-415,11 +708,40 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "mpg 0\n", + "cylinders 0\n", + "displacement 0\n", + "horse_power 6\n", + "weight 0\n", + "acceleration 0\n", + "model_year 0\n", + "car_name 0\n", + "dtype: int64" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "auto.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 48, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "auto.dropna(axis=0, inplace=True)" ] }, { @@ -431,11 +753,28 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here:\n" + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4 199\n", + "8 103\n", + "6 83\n", + "3 4\n", + "5 3\n", + "Name: cylinders, dtype: int64" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Your code here:\n", + "auto.cylinders.value_counts()" ] }, { @@ -451,11 +790,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 50, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto.drop('car_name',axis=1,inplace=True)\n", + "X = auto.drop('mpg', axis=1)\n", + "y = auto['mpg'].values" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)" ] }, { @@ -469,11 +821,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 52, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)" + 
] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto_model = LinearRegression()\n", + "auto_model.fit(X_train,y_train)" ] }, { @@ -502,11 +867,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 54, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.8128121419212276" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "from sklearn.metrics import r2_score\n", + "y_pred = auto_model.predict(X_train)\n", + "r2_score(y_train, y_pred)" ] }, { @@ -522,11 +901,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 55, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.7879602859023379" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "y_test_pred = auto_model.predict(X_test)\n", + "r2_score(y_test,y_test_pred)" ] }, { @@ -551,11 +943,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 56, "metadata": {}, "outputs": [], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "X_train09, X_test09, y_train09, y_test09 = train_test_split(X, y, train_size=0.9)" ] }, { @@ -567,11 +960,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 57, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "auto_model09=LinearRegression()\n", + "auto_model09.fit(X_train09,y_train09)" ] }, { @@ -583,11 +989,24 @@ }, { 
"cell_type": "code", - "execution_count": null, + "execution_count": 58, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.8091633121354688" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "y_pred_train09 = auto_model09.predict(X_train09)\n", + "r2_score(y_train09,y_pred_train09)" ] }, { @@ -599,11 +1018,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 59, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.8060733385715728" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here:\n" + "# Your code here:\n", + "y_pred_test09 = auto_model09.predict(X_test09)\n", + "r2_score(y_test09,y_pred_test09)" ] }, { @@ -712,9 +1144,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python [conda env:dataenv]", "language": "python", - "name": "python3" + "name": "conda-env-dataenv-py" }, "language_info": { "codemirror_mode": { @@ -726,7 +1158,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.6" + "version": "3.7.5" } }, "nbformat": 4,