-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmodel_training.py
More file actions
76 lines (61 loc) · 2.38 KB
/
model_training.py
File metadata and controls
76 lines (61 loc) · 2.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pandas as pd
import pickle
def load_and_clean_data(fake_path='Fake.csv', true_path='True.csv'):
    """Load the fake and true news CSV files into one labelled DataFrame.

    Each file is read with ``latin1`` encoding and tagged with a ``class``
    column (0 = Fake, 1 = True). The two frames are then concatenated and
    every row with a missing value in any column is dropped.

    Args:
        fake_path: Path of the CSV file holding the fake articles.
        true_path: Path of the CSV file holding the true articles.

    Returns:
        The combined DataFrame, re-indexed 0..n-1 after the rows with
        missing values have been removed.

    Raises:
        ValueError: If either input file has no ``text`` column.
    """
    frames = []
    for path, label in ((fake_path, 0), (true_path, 1)):
        frame = pd.read_csv(path, encoding='latin1')
        # Validate each file on its own. Checking only the combined frame
        # would miss a file without 'text': concat fills that column with
        # NaN and dropna() would then silently discard all of its rows.
        if 'text' not in frame.columns:
            raise ValueError("The dataset must contain a 'text' column.")
        frame['class'] = label
        frames.append(frame)

    data = pd.concat(frames, axis=0, ignore_index=True)
    # Re-index after dropping so the result has no gaps in its index.
    return data.dropna().reset_index(drop=True)
def preprocess_text(data):
    """Normalise the ``text`` column of ``data`` in place and return it.

    The text is lower-cased and leading/trailing whitespace is stripped.
    The passed-in DataFrame itself is modified; the same object is returned.
    """
    lowered = data['text'].str.lower()
    data['text'] = lowered.str.strip()
    return data
def train_models(data):
    """Train three text classifiers on ``data`` and pickle them to disk.

    The ``text`` column is used as the input and the ``class`` column as
    the target. 23% of the rows are held out for evaluation, the text is
    converted to TF-IDF features, and a logistic regression, a decision
    tree and a random forest are each fitted, evaluated and saved.

    For every model a classification report on the held-out split is
    printed and the fitted estimator is written to
    ``<model_name>_model.pkl``. The fitted vectorizer is written to
    ``vectorizer.pkl``. All paths are relative, so the files land in the
    current working directory.

    Args:
        data: DataFrame with a ``text`` column and a ``class`` column.
    """
    x = data['text']
    y = data['class']
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.23, random_state=42
    )

    # Fit the vectorizer on the training split only; the test split is
    # transformed with the already-fitted vocabulary.
    vectorization = TfidfVectorizer()
    xv_train = vectorization.fit_transform(x_train)
    xv_test = vectorization.transform(x_test)

    models = {
        'logistic_regression': LogisticRegression(),
        # Seeded like the split and the forest: an unseeded tree can pick
        # different splits between runs, so the saved model and its report
        # would not be reproducible.
        'decision_tree': DecisionTreeClassifier(random_state=0),
        'random_forest': RandomForestClassifier(random_state=0),
    }

    for model_name, model in models.items():
        model.fit(xv_train, y_train)
        pred = model.predict(xv_test)

        print(f"Model: {model_name.replace('_', ' ').title()}")
        print(classification_report(y_test, pred))

        # Persist the fitted estimator next to the script's working dir.
        with open(f'{model_name}_model.pkl', 'wb') as f:
            pickle.dump(model, f)
        print(f"File created: {model_name}_model.pkl")

    # The vectorizer must be saved too: the pickled models can only score
    # text that has been transformed with this exact fitted vocabulary.
    with open('vectorizer.pkl', 'wb') as f:
        pickle.dump(vectorization, f)
    print("File created: vectorizer.pkl")
if __name__ == "__main__":
    # Script entry point: load the CSVs, normalise the text column, then
    # train the classifiers and write the pickled artefacts.
    data = preprocess_text(load_and_clean_data())
    train_models(data)