From ca859ddc14a3bb03aa1d848a9f85e968869ba00d Mon Sep 17 00:00:00 2001 From: Burak Yildiz <145865827+burakkyildiz@users.noreply.github.com> Date: Sun, 19 Oct 2025 16:49:20 +0300 Subject: [PATCH] fix variable name mismatch in remove_outliers_from_column Replaced undefined variable 'col' with 'target_col' to prevent NameError. --- .../21-XGBoostRegressor-checkpoint.ipynb | 3023 +++++++++++++++++ 21-XGBoostRegressor.ipynb | 10 +- 2 files changed, 3028 insertions(+), 5 deletions(-) create mode 100644 .ipynb_checkpoints/21-XGBoostRegressor-checkpoint.ipynb diff --git a/.ipynb_checkpoints/21-XGBoostRegressor-checkpoint.ipynb b/.ipynb_checkpoints/21-XGBoostRegressor-checkpoint.ipynb new file mode 100644 index 0000000..52163c9 --- /dev/null +++ b/.ipynb_checkpoints/21-XGBoostRegressor-checkpoint.ipynb @@ -0,0 +1,3023 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "0f15ab0a-d8c0-4ab7-a788-05ae26f8a949", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2ccf395c-7bb0-420a-8ada-d89fbcb81c03", + "metadata": {}, + "outputs": [], + "source": [ + "#https://www.kaggle.com/datasets/camnugent/california-housing-prices/data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "9f487aff-ee9a-4d8a-b842-6ab8aa27d0bd", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(\"21-housing.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "6e520191-b1ac-468b-8b5f-7694a34a729b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
| \n", + " | longitude | \n", + "latitude | \n", + "housing_median_age | \n", + "total_rooms | \n", + "total_bedrooms | \n", + "population | \n", + "households | \n", + "median_income | \n", + "median_house_value | \n", + "ocean_proximity | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "-122.23 | \n", + "37.88 | \n", + "41.0 | \n", + "880.0 | \n", + "129.0 | \n", + "322.0 | \n", + "126.0 | \n", + "8.3252 | \n", + "452600.0 | \n", + "NEAR BAY | \n", + "
| 1 | \n", + "-122.22 | \n", + "37.86 | \n", + "21.0 | \n", + "7099.0 | \n", + "1106.0 | \n", + "2401.0 | \n", + "1138.0 | \n", + "8.3014 | \n", + "358500.0 | \n", + "NEAR BAY | \n", + "
| 2 | \n", + "-122.24 | \n", + "37.85 | \n", + "52.0 | \n", + "1467.0 | \n", + "190.0 | \n", + "496.0 | \n", + "177.0 | \n", + "7.2574 | \n", + "352100.0 | \n", + "NEAR BAY | \n", + "
| 3 | \n", + "-122.25 | \n", + "37.85 | \n", + "52.0 | \n", + "1274.0 | \n", + "235.0 | \n", + "558.0 | \n", + "219.0 | \n", + "5.6431 | \n", + "341300.0 | \n", + "NEAR BAY | \n", + "
| 4 | \n", + "-122.25 | \n", + "37.85 | \n", + "52.0 | \n", + "1627.0 | \n", + "280.0 | \n", + "565.0 | \n", + "259.0 | \n", + "3.8462 | \n", + "342200.0 | \n", + "NEAR BAY | \n", + "
| \n", + " | longitude | \n", + "latitude | \n", + "housing_median_age | \n", + "total_rooms | \n", + "total_bedrooms | \n", + "population | \n", + "households | \n", + "median_income | \n", + "median_house_value | \n", + "
|---|---|---|---|---|---|---|---|---|---|
| count | \n", + "20640.000000 | \n", + "20640.000000 | \n", + "20640.000000 | \n", + "20640.000000 | \n", + "20433.000000 | \n", + "20640.000000 | \n", + "20640.000000 | \n", + "20640.000000 | \n", + "20640.000000 | \n", + "
| mean | \n", + "-119.569704 | \n", + "35.631861 | \n", + "28.639486 | \n", + "2635.763081 | \n", + "537.870553 | \n", + "1425.476744 | \n", + "499.539680 | \n", + "3.870671 | \n", + "206855.816909 | \n", + "
| std | \n", + "2.003532 | \n", + "2.135952 | \n", + "12.585558 | \n", + "2181.615252 | \n", + "421.385070 | \n", + "1132.462122 | \n", + "382.329753 | \n", + "1.899822 | \n", + "115395.615874 | \n", + "
| min | \n", + "-124.350000 | \n", + "32.540000 | \n", + "1.000000 | \n", + "2.000000 | \n", + "1.000000 | \n", + "3.000000 | \n", + "1.000000 | \n", + "0.499900 | \n", + "14999.000000 | \n", + "
| 25% | \n", + "-121.800000 | \n", + "33.930000 | \n", + "18.000000 | \n", + "1447.750000 | \n", + "296.000000 | \n", + "787.000000 | \n", + "280.000000 | \n", + "2.563400 | \n", + "119600.000000 | \n", + "
| 50% | \n", + "-118.490000 | \n", + "34.260000 | \n", + "29.000000 | \n", + "2127.000000 | \n", + "435.000000 | \n", + "1166.000000 | \n", + "409.000000 | \n", + "3.534800 | \n", + "179700.000000 | \n", + "
| 75% | \n", + "-118.010000 | \n", + "37.710000 | \n", + "37.000000 | \n", + "3148.000000 | \n", + "647.000000 | \n", + "1725.000000 | \n", + "605.000000 | \n", + "4.743250 | \n", + "264725.000000 | \n", + "
| max | \n", + "-114.310000 | \n", + "41.950000 | \n", + "52.000000 | \n", + "39320.000000 | \n", + "6445.000000 | \n", + "35682.000000 | \n", + "6082.000000 | \n", + "15.000100 | \n", + "500001.000000 | \n", + "
| \n", + " | longitude | \n", + "latitude | \n", + "housing_median_age | \n", + "total_rooms | \n", + "total_bedrooms | \n", + "population | \n", + "households | \n", + "median_income | \n", + "median_house_value | \n", + "
|---|---|---|---|---|---|---|---|---|---|
| longitude | \n", + "1.000000 | \n", + "-0.924664 | \n", + "-0.108197 | \n", + "0.044568 | \n", + "0.069608 | \n", + "0.099773 | \n", + "0.055310 | \n", + "-0.015176 | \n", + "-0.045967 | \n", + "
| latitude | \n", + "-0.924664 | \n", + "1.000000 | \n", + "0.011173 | \n", + "-0.036100 | \n", + "-0.066983 | \n", + "-0.108785 | \n", + "-0.071035 | \n", + "-0.079809 | \n", + "-0.144160 | \n", + "
| housing_median_age | \n", + "-0.108197 | \n", + "0.011173 | \n", + "1.000000 | \n", + "-0.361262 | \n", + "-0.320451 | \n", + "-0.296244 | \n", + "-0.302916 | \n", + "-0.119034 | \n", + "0.105623 | \n", + "
| total_rooms | \n", + "0.044568 | \n", + "-0.036100 | \n", + "-0.361262 | \n", + "1.000000 | \n", + "0.930380 | \n", + "0.857126 | \n", + "0.918484 | \n", + "0.198050 | \n", + "0.134153 | \n", + "
| total_bedrooms | \n", + "0.069608 | \n", + "-0.066983 | \n", + "-0.320451 | \n", + "0.930380 | \n", + "1.000000 | \n", + "0.877747 | \n", + "0.979728 | \n", + "-0.007723 | \n", + "0.049686 | \n", + "
| population | \n", + "0.099773 | \n", + "-0.108785 | \n", + "-0.296244 | \n", + "0.857126 | \n", + "0.877747 | \n", + "1.000000 | \n", + "0.907222 | \n", + "0.004834 | \n", + "-0.024650 | \n", + "
| households | \n", + "0.055310 | \n", + "-0.071035 | \n", + "-0.302916 | \n", + "0.918484 | \n", + "0.979728 | \n", + "0.907222 | \n", + "1.000000 | \n", + "0.013033 | \n", + "0.065843 | \n", + "
| median_income | \n", + "-0.015176 | \n", + "-0.079809 | \n", + "-0.119034 | \n", + "0.198050 | \n", + "-0.007723 | \n", + "0.004834 | \n", + "0.013033 | \n", + "1.000000 | \n", + "0.688075 | \n", + "
| median_house_value | \n", + "-0.045967 | \n", + "-0.144160 | \n", + "0.105623 | \n", + "0.134153 | \n", + "0.049686 | \n", + "-0.024650 | \n", + "0.065843 | \n", + "0.688075 | \n", + "1.000000 | \n", + "
| \n", + " | longitude | \n", + "latitude | \n", + "housing_median_age | \n", + "total_rooms | \n", + "total_bedrooms | \n", + "population | \n", + "households | \n", + "median_income | \n", + "median_house_value | \n", + "
|---|---|---|---|---|---|---|---|---|---|
| outlier_count | \n", + "0.000 | \n", + "0.00 | \n", + "0.0 | \n", + "1287.000000 | \n", + "1271.000000 | \n", + "1196.000000 | \n", + "1220.000000 | \n", + "681.000000 | \n", + "1071.000000 | \n", + "
| outlier_percentage | \n", + "0.000 | \n", + "0.00 | \n", + "0.0 | \n", + "6.235465 | \n", + "6.157946 | \n", + "5.794574 | \n", + "5.910853 | \n", + "3.299419 | \n", + "5.188953 | \n", + "
| lower_bound | \n", + "-127.485 | \n", + "28.26 | \n", + "-10.5 | \n", + "-1102.625000 | \n", + "-230.500000 | \n", + "-620.000000 | \n", + "-207.500000 | \n", + "-0.706375 | \n", + "-98087.500000 | \n", + "
| upper_bound | \n", + "-112.325 | \n", + "43.38 | \n", + "65.5 | \n", + "5698.375000 | \n", + "1173.500000 | \n", + "3132.000000 | \n", + "1092.500000 | \n", + "8.013025 | \n", + "482412.500000 | \n", + "
| \n", + " | longitude | \n", + "latitude | \n", + "housing_median_age | \n", + "total_rooms | \n", + "total_bedrooms | \n", + "population | \n", + "households | \n", + "median_income | \n", + "median_house_value | \n", + "
|---|---|---|---|---|---|---|---|---|---|
| count | \n", + "19569.000000 | \n", + "19569.000000 | \n", + "19569.000000 | \n", + "19569.000000 | \n", + "19369.000000 | \n", + "19569.000000 | \n", + "19569.000000 | \n", + "19569.000000 | \n", + "19569.000000 | \n", + "
| mean | \n", + "-119.562786 | \n", + "35.654159 | \n", + "28.352752 | \n", + "2619.977260 | \n", + "539.893335 | \n", + "1442.788952 | \n", + "501.394859 | \n", + "3.665568 | \n", + "190852.301906 | \n", + "
| std | \n", + "2.005764 | \n", + "2.151007 | \n", + "12.497772 | \n", + "2183.419302 | \n", + "422.650225 | \n", + "1145.011369 | \n", + "383.396308 | \n", + "1.557927 | \n", + "95438.555669 | \n", + "
| min | \n", + "-124.350000 | \n", + "32.540000 | \n", + "1.000000 | \n", + "2.000000 | \n", + "2.000000 | \n", + "3.000000 | \n", + "2.000000 | \n", + "0.499900 | \n", + "14999.000000 | \n", + "
| 25% | \n", + "-121.760000 | \n", + "33.930000 | \n", + "18.000000 | \n", + "1438.000000 | \n", + "297.000000 | \n", + "797.000000 | \n", + "282.000000 | \n", + "2.522700 | \n", + "116200.000000 | \n", + "
| 50% | \n", + "-118.510000 | \n", + "34.270000 | \n", + "28.000000 | \n", + "2110.000000 | \n", + "437.000000 | \n", + "1181.000000 | \n", + "411.000000 | \n", + "3.441200 | \n", + "173200.000000 | \n", + "
| 75% | \n", + "-117.990000 | \n", + "37.730000 | \n", + "37.000000 | \n", + "3123.000000 | \n", + "648.000000 | \n", + "1749.000000 | \n", + "606.000000 | \n", + "4.572100 | \n", + "246700.000000 | \n", + "
| max | \n", + "-114.310000 | \n", + "41.950000 | \n", + "52.000000 | \n", + "39320.000000 | \n", + "6445.000000 | \n", + "35682.000000 | \n", + "6082.000000 | \n", + "15.000100 | \n", + "482200.000000 | \n", + "
| \n", + " | longitude | \n", + "latitude | \n", + "housing_median_age | \n", + "total_rooms | \n", + "total_bedrooms | \n", + "population | \n", + "households | \n", + "median_income | \n", + "median_house_value | \n", + "
|---|---|---|---|---|---|---|---|---|---|
| count | \n", + "19569.000000 | \n", + "19569.000000 | \n", + "19569.000000 | \n", + "19569.000000 | \n", + "19569.000000 | \n", + "19569.000000 | \n", + "19569.000000 | \n", + "19569.000000 | \n", + "19569.000000 | \n", + "
| mean | \n", + "-119.562786 | \n", + "35.654159 | \n", + "28.352752 | \n", + "2619.977260 | \n", + "538.841739 | \n", + "1442.788952 | \n", + "501.394859 | \n", + "3.665568 | \n", + "190852.301906 | \n", + "
| std | \n", + "2.005764 | \n", + "2.151007 | \n", + "12.497772 | \n", + "2183.419302 | \n", + "420.612109 | \n", + "1145.011369 | \n", + "383.396308 | \n", + "1.557927 | \n", + "95438.555669 | \n", + "
| min | \n", + "-124.350000 | \n", + "32.540000 | \n", + "1.000000 | \n", + "2.000000 | \n", + "2.000000 | \n", + "3.000000 | \n", + "2.000000 | \n", + "0.499900 | \n", + "14999.000000 | \n", + "
| 25% | \n", + "-121.760000 | \n", + "33.930000 | \n", + "18.000000 | \n", + "1438.000000 | \n", + "299.000000 | \n", + "797.000000 | \n", + "282.000000 | \n", + "2.522700 | \n", + "116200.000000 | \n", + "
| 50% | \n", + "-118.510000 | \n", + "34.270000 | \n", + "28.000000 | \n", + "2110.000000 | \n", + "437.000000 | \n", + "1181.000000 | \n", + "411.000000 | \n", + "3.441200 | \n", + "173200.000000 | \n", + "
| 75% | \n", + "-117.990000 | \n", + "37.730000 | \n", + "37.000000 | \n", + "3123.000000 | \n", + "645.000000 | \n", + "1749.000000 | \n", + "606.000000 | \n", + "4.572100 | \n", + "246700.000000 | \n", + "
| max | \n", + "-114.310000 | \n", + "41.950000 | \n", + "52.000000 | \n", + "39320.000000 | \n", + "6445.000000 | \n", + "35682.000000 | \n", + "6082.000000 | \n", + "15.000100 | \n", + "482200.000000 | \n", + "
| \n", + " | longitude | \n", + "latitude | \n", + "housing_median_age | \n", + "total_rooms | \n", + "total_bedrooms | \n", + "population | \n", + "households | \n", + "median_income | \n", + "median_house_value | \n", + "ocean_proximity_INLAND | \n", + "ocean_proximity_ISLAND | \n", + "ocean_proximity_NEAR BAY | \n", + "ocean_proximity_NEAR OCEAN | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "-122.23 | \n", + "37.88 | \n", + "41.0 | \n", + "880.0 | \n", + "129.0 | \n", + "322.0 | \n", + "126.0 | \n", + "8.3252 | \n", + "452600.0 | \n", + "False | \n", + "False | \n", + "True | \n", + "False | \n", + "
| 1 | \n", + "-122.22 | \n", + "37.86 | \n", + "21.0 | \n", + "7099.0 | \n", + "1106.0 | \n", + "2401.0 | \n", + "1138.0 | \n", + "8.3014 | \n", + "358500.0 | \n", + "False | \n", + "False | \n", + "True | \n", + "False | \n", + "
| 2 | \n", + "-122.24 | \n", + "37.85 | \n", + "52.0 | \n", + "1467.0 | \n", + "190.0 | \n", + "496.0 | \n", + "177.0 | \n", + "7.2574 | \n", + "352100.0 | \n", + "False | \n", + "False | \n", + "True | \n", + "False | \n", + "
| 3 | \n", + "-122.25 | \n", + "37.85 | \n", + "52.0 | \n", + "1274.0 | \n", + "235.0 | \n", + "558.0 | \n", + "219.0 | \n", + "5.6431 | \n", + "341300.0 | \n", + "False | \n", + "False | \n", + "True | \n", + "False | \n", + "
| 4 | \n", + "-122.25 | \n", + "37.85 | \n", + "52.0 | \n", + "1627.0 | \n", + "280.0 | \n", + "565.0 | \n", + "259.0 | \n", + "3.8462 | \n", + "342200.0 | \n", + "False | \n", + "False | \n", + "True | \n", + "False | \n", + "
RandomizedSearchCV(cv=5,\n",
+ " estimator=XGBRegressor(base_score=None, booster=None,\n",
+ " callbacks=None,\n",
+ " colsample_bylevel=None,\n",
+ " colsample_bynode=None,\n",
+ " colsample_bytree=None, device=None,\n",
+ " early_stopping_rounds=None,\n",
+ " enable_categorical=False,\n",
+ " eval_metric=None, feature_types=None,\n",
+ " feature_weights=None, gamma=None,\n",
+ " grow_policy=None,\n",
+ " importance_type=None,\n",
+ " interaction_constraint...\n",
+ " max_cat_to_onehot=None,\n",
+ " max_delta_step=None, max_depth=None,\n",
+ " max_leaves=None,\n",
+ " min_child_weight=None, missing=nan,\n",
+ " monotone_constraints=None,\n",
+ " multi_strategy=None,\n",
+ " n_estimators=None, n_jobs=None,\n",
+ " num_parallel_tree=None, ...),\n",
+ " n_jobs=-1,\n",
+ " param_distributions={'colsample_bytree': [0.3, 0.4, 0.5, 0.7,\n",
+ " 1],\n",
+ " 'learning_rate': [0.1, 0.01],\n",
+ " 'max_depth': [5, 8, 12, 20, 30],\n",
+ " 'n_estimators': [100, 200, 300, 500]})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. RandomizedSearchCV(cv=5,\n",
+ " estimator=XGBRegressor(base_score=None, booster=None,\n",
+ " callbacks=None,\n",
+ " colsample_bylevel=None,\n",
+ " colsample_bynode=None,\n",
+ " colsample_bytree=None, device=None,\n",
+ " early_stopping_rounds=None,\n",
+ " enable_categorical=False,\n",
+ " eval_metric=None, feature_types=None,\n",
+ " feature_weights=None, gamma=None,\n",
+ " grow_policy=None,\n",
+ " importance_type=None,\n",
+ " interaction_constraint...\n",
+ " max_cat_to_onehot=None,\n",
+ " max_delta_step=None, max_depth=None,\n",
+ " max_leaves=None,\n",
+ " min_child_weight=None, missing=nan,\n",
+ " monotone_constraints=None,\n",
+ " multi_strategy=None,\n",
+ " n_estimators=None, n_jobs=None,\n",
+ " num_parallel_tree=None, ...),\n",
+ " n_jobs=-1,\n",
+ " param_distributions={'colsample_bytree': [0.3, 0.4, 0.5, 0.7,\n",
+ " 1],\n",
+ " 'learning_rate': [0.1, 0.01],\n",
+ " 'max_depth': [5, 8, 12, 20, 30],\n",
+ " 'n_estimators': [100, 200, 300, 500]})XGBRegressor(base_score=None, booster=None, callbacks=None,\n", + " colsample_bylevel=None, colsample_bynode=None,\n", + " colsample_bytree=0.7, device=None, early_stopping_rounds=None,\n", + " enable_categorical=False, eval_metric=None, feature_types=None,\n", + " feature_weights=None, gamma=None, grow_policy=None,\n", + " importance_type=None, interaction_constraints=None,\n", + " learning_rate=0.1, max_bin=None, max_cat_threshold=None,\n", + " max_cat_to_onehot=None, max_delta_step=None, max_depth=20,\n", + " max_leaves=None, min_child_weight=None, missing=nan,\n", + " monotone_constraints=None, multi_strategy=None, n_estimators=300,\n", + " n_jobs=None, num_parallel_tree=None, ...)
XGBRegressor(base_score=None, booster=None, callbacks=None,\n", + " colsample_bylevel=None, colsample_bynode=None,\n", + " colsample_bytree=0.7, device=None, early_stopping_rounds=None,\n", + " enable_categorical=False, eval_metric=None, feature_types=None,\n", + " feature_weights=None, gamma=None, grow_policy=None,\n", + " importance_type=None, interaction_constraints=None,\n", + " learning_rate=0.1, max_bin=None, max_cat_threshold=None,\n", + " max_cat_to_onehot=None, max_delta_step=None, max_depth=20,\n", + " max_leaves=None, min_child_weight=None, missing=nan,\n", + " monotone_constraints=None, multi_strategy=None, n_estimators=300,\n", + " n_jobs=None, num_parallel_tree=None, ...)
XGBRegressor(base_score=None, booster=None, callbacks=None,\n", + " colsample_bylevel=None, colsample_bynode=None,\n", + " colsample_bytree=0.7, device=None, early_stopping_rounds=None,\n", + " enable_categorical=False, eval_metric=None, feature_types=None,\n", + " feature_weights=None, gamma=None, grow_policy=None,\n", + " importance_type=None, interaction_constraints=None,\n", + " learning_rate=0.1, max_bin=None, max_cat_threshold=None,\n", + " max_cat_to_onehot=None, max_delta_step=None, max_depth=6,\n", + " max_leaves=None, min_child_weight=None, missing=nan,\n", + " monotone_constraints=None, multi_strategy=None, n_estimators=300,\n", + " n_jobs=None, num_parallel_tree=None, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
XGBRegressor(base_score=None, booster=None, callbacks=None,\n", + " colsample_bylevel=None, colsample_bynode=None,\n", + " colsample_bytree=0.7, device=None, early_stopping_rounds=None,\n", + " enable_categorical=False, eval_metric=None, feature_types=None,\n", + " feature_weights=None, gamma=None, grow_policy=None,\n", + " importance_type=None, interaction_constraints=None,\n", + " learning_rate=0.1, max_bin=None, max_cat_threshold=None,\n", + " max_cat_to_onehot=None, max_delta_step=None, max_depth=6,\n", + " max_leaves=None, min_child_weight=None, missing=nan,\n", + " monotone_constraints=None, multi_strategy=None, n_estimators=300,\n", + " n_jobs=None, num_parallel_tree=None, ...)