Machine-Learning/RF at main · Bala123a/Machine-Learning · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
import pandas as pd
import numpy as np
import rasterio
!pip install rioxarray
import rioxarray
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import xarray as xr # Import xarray with the alias xr

# -------------------------
# 1. Load individual raster predictors (Landsat bands)
# -------------------------
# Define paths to individual band files (adjust these paths/names as needed)
raster_files = [

]

# Everything below indexes rasters[0]; fail with an actionable message instead
# of a bare IndexError when the list above has not been filled in.
if not raster_files:
    raise ValueError("raster_files is empty: list at least one band raster to load")

# Band names for interpretation (adjust to match your bands, e.g., ["Blue", "Green", "Red", "NIR", "SWIR"])
band_names = [f"Band_{number}" for number, _ in enumerate(raster_files, start=1)]

# Load every raster; the first file's rasterio metadata is kept as the
# template for the output GeoTIFF written at the end of the script.
rasters = []
meta = None
for file in raster_files:
    with rasterio.open(file) as src:
        if meta is None:
            meta = src.meta.copy()  # Use first raster's metadata for output
        rasters.append(rioxarray.open_rasterio(file))

# Verify all rasters share shape, CRS and pixel grid with the first one.
first_raster = rasters[0]
for path, raster in zip(raster_files[1:], rasters[1:]):
    if raster.shape != first_raster.shape or raster.rio.crs != first_raster.rio.crs:
        raise ValueError(f"Raster {path} does not match shape or CRS of {raster_files[0]}")
    # Same shape is not enough: xr.concat aligns on coordinate labels, so two
    # rasters with shifted x/y coordinates would be outer-joined into a larger,
    # NaN-padded array that no longer matches `meta`.
    same_grid = (
        np.array_equal(raster.x.values, first_raster.x.values)
        and np.array_equal(raster.y.values, first_raster.y.values)
    )
    if not same_grid:
        raise ValueError(
            f"Raster {path} is not on the same pixel grid (x/y coordinates) as {raster_files[0]}"
        )

# Stack rasters into a single xarray (rows, cols, bands)
stacked = xr.concat(rasters, dim="band").transpose("y", "x", "band")

# -------------------------
# 2. Load training points
# -------------------------
points = pd.read_csv("")

# Convert to GeoDataFrame (the CSV is expected to carry WGS84 longitude/latitude columns)
points = gpd.GeoDataFrame(
    points,
    geometry=gpd.points_from_xy(points.longitude, points.latitude),
    crs="EPSG:4326",
)
points = points.to_crs(stacked.rio.crs)  # match CRS

# method="nearest" never fails: a point lying outside the image would silently
# receive the value of the closest edge pixel. Drop such points so labels are
# only paired with pixels that actually contain them.
min_x, min_y, max_x, max_y = stacked.rio.bounds()
inside = points.geometry.x.between(min_x, max_x) & points.geometry.y.between(min_y, max_y)
if not inside.all():
    print(f"Dropping {int((~inside).sum())} training point(s) outside the raster extent")
    points = points[inside]

# Extract raster values at training points in one vectorised (pointwise)
# selection instead of one .sel() call per point. Indexers that share the
# "points" dimension select (x[i], y[i]) pairs rather than the outer product.
xs = xr.DataArray(points.geometry.x.to_numpy(), dims="points")
ys = xr.DataArray(points.geometry.y.to_numpy(), dims="points")
samples = stacked.sel(x=xs, y=ys, method="nearest").transpose("points", "band")

X = samples.values  # features, shape (n_points, n_bands)
y = points["Forest_fir"].values  # labels (1=fire, 0=no fire)

# Remove samples with a NaN in any band
mask_valid = ~np.isnan(X).any(axis=1)
X, y = X[mask_valid], y[mask_valid]

print(f"Valid samples: {X.shape[0]}")

# -------------------------
# 3. Split data into training (70%) and testing (30%)
# -------------------------
# Stratify on the label so both partitions keep the same class ratio;
# the fixed seed makes the partition reproducible.
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

n_train, n_test = len(X_train), len(X_test)
print(f"Training samples: {n_train} (70%)")
print(f"Testing samples: {n_test} (30%)")

# -------------------------
# 4. LASSO for feature selection
# -------------------------
# Standardise with statistics learned from the training split only, then
# apply the same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# L1-penalised logistic regression: bands whose coefficient is exactly zero
# are treated as not selected.
lasso = LogisticRegression(penalty='l1', solver='liblinear', C=1.0, random_state=42)
lasso = lasso.fit(X_train_scaled, y_train)

selected_features = np.flatnonzero(lasso.coef_[0])
print(f"Selected features (indices): {selected_features}")

# Fall back to every band when the penalty zeroed all coefficients.
if selected_features.size == 0:
    print("No features selected by LASSO. Using all features.")
    selected_features = np.arange(X_train.shape[1])

# The unscaled values of the retained bands are what the later steps use.
X_train_selected = X_train[:, selected_features]
X_test_selected = X_test[:, selected_features]
selected_band_names = [band_names[band_idx] for band_idx in selected_features]

# -------------------------
# 5. SMOTE for oversampling
# -------------------------
# Only the training split is resampled; the test split is left as observed.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train_selected, y_train)
X_train_res, y_train_res = resampled

print(f"Resampled training samples: {len(X_train_res)}")

# -------------------------
# 6. Pearson correlation matrix
# -------------------------
# One column per selected band plus the label, so the matrix shows both
# band-to-band and band-to-label correlation.
df_corr = pd.DataFrame(X_train_res, columns=selected_band_names).assign(
    Forest_fir=y_train_res
)
corr_matrix = df_corr.corr(method="pearson")

print("\nPearson Correlation Matrix (Resampled Training Data):")
print(corr_matrix.round(3))

corr_csv_path = "pearson_correlation_matrix.csv"
corr_matrix.to_csv(corr_csv_path)
print("Correlation matrix saved to 'pearson_correlation_matrix.csv'")

# Annotated heatmap of the same matrix.
corr_fig, corr_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap="coolwarm",
    fmt=".2f",
    linewidths=0.5,
    ax=corr_ax,
)
corr_ax.set_title("Pearson Correlation (Resampled Training Data)")
corr_fig.tight_layout()
plt.show()

# -------------------------
# 7. Hyperparameter tuning for Random Forest
# -------------------------
# Imported here because only this step needs it; imblearn's Pipeline (unlike
# sklearn's) accepts samplers such as SMOTE as intermediate steps.
from imblearn.pipeline import Pipeline as ImbPipeline

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

rf = RandomForestClassifier(random_state=42, n_jobs=-1)

# Cross-validating on data that was oversampled beforehand leaks information:
# synthetic points interpolated from a validation-fold sample end up in the
# training folds, which inflates the CV AUC and biases the parameter choice.
# Putting SMOTE inside the pipeline makes it run on the training part of each
# fold only, while validation folds keep the original, unresampled samples.
tuning_pipeline = ImbPipeline([
    ("smote", SMOTE(random_state=42)),
    ("rf", rf),
])
pipeline_grid = {f"rf__{name}": values for name, values in param_grid.items()}

grid_search = GridSearchCV(tuning_pipeline, pipeline_grid, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train_selected, y_train)

# GridSearchCV refits the winning pipeline on the whole training split, so its
# forest step is a Random Forest trained on SMOTE-resampled training data.
best_rf = grid_search.best_estimator_.named_steps["rf"]

# Strip the "rf__" step prefix so the report shows plain parameter names.
best_params = {
    name.split("__", 1)[1]: value
    for name, value in grid_search.best_params_.items()
}
print(f"\nBest hyperparameters: {best_params}")

# -------------------------
# 8. Feature importance
# -------------------------
# Importances come straight from the tuned forest, in the same order as the
# selected band names.
importances = best_rf.feature_importances_
print("\nFeature Importances:")
for band_label, score in zip(selected_band_names, importances):
    print(f"{band_label}: {score:.4f}")

# Bar chart of the same values, one bar per selected band.
imp_fig, imp_ax = plt.subplots(figsize=(8, 5))
imp_ax.bar(selected_band_names, importances, color="skyblue")
imp_ax.set_title("Random Forest Feature Importance")
imp_ax.set_ylabel("Importance")
plt.setp(imp_ax.get_xticklabels(), rotation=45)
imp_fig.tight_layout()
plt.show()

# -------------------------
# 9. Evaluate on test data
# -------------------------
# Hard labels feed the threshold metrics; the probability of the second class
# column feeds the ROC curve.
y_pred = best_rf.predict(X_test_selected)
y_prob = best_rf.predict_proba(X_test_selected)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

print("\nTest Set Metrics:")
metric_report = (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-Score", f1),
    ("AUC-ROC", roc_auc),
)
for metric_label, metric_value in metric_report:
    print(f"{metric_label}: {metric_value:.3f}")

# ROC curve with the chance diagonal for reference.
roc_fig, roc_ax = plt.subplots(figsize=(6, 6))
roc_ax.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (AUC = {roc_auc:.3f})")
roc_ax.plot([0, 1], [0, 1], color="gray", linestyle="--")
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title("Random Forest ROC Curve (Test Set)")
roc_ax.legend(loc="lower right")
roc_ax.grid(True)
plt.show()

# -------------------------
# 10. Apply to raster stack
# -------------------------
rows, cols, bands = stacked.shape
X_all = stacked.values.reshape(-1, bands)  # one row per pixel, one column per band
X_all_selected = X_all[:, selected_features]  # Apply feature selection

# Training samples had NaN rows removed, so the model must not be asked to
# score pixels with NaN/inf in any selected band. Those pixels stay NaN in
# the output and are declared as nodata below.
valid_pixel_idx = np.flatnonzero(np.isfinite(X_all_selected).all(axis=1))
probs = np.full(X_all_selected.shape[0], np.nan, dtype="float32")

# Score in chunks so the temporary arrays created by predict_proba stay
# bounded for large scenes; an image with no valid pixel simply skips the loop.
chunk_size = 1_000_000
for start in range(0, valid_pixel_idx.size, chunk_size):
    chunk_idx = valid_pixel_idx[start:start + chunk_size]
    probs[chunk_idx] = best_rf.predict_proba(X_all_selected[chunk_idx])[:, 1]

susceptibility = probs.reshape(rows, cols)

# -------------------------
# 11. Save susceptibility map
# -------------------------
out_meta = meta.copy()
out_meta.update({
    "driver": "GTiff",
    "height": rows,
    "width": cols,
    "count": 1,
    "dtype": "float32",
    # Do not inherit the input band's nodata value: for a probability raster
    # an inherited value such as 0 would mark genuine 0.0 predictions as
    # missing. NaN matches the fill used for unscored pixels above.
    "nodata": np.nan,
    "compress": "lzw"
})

with rasterio.open("forest_fire_susceptibility.tif", "w", **out_meta) as dst:
    dst.write(susceptibility.astype("float32"), 1)

print("Susceptibility map saved as 'forest_fire_susceptibility.tif'")