# Python_ML_Algorithms/bayesian_classifier.py at main · GeethikaMaddi/Python_ML_Algorithms · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score

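# Disabled alternative example (kept for reference): the same
# CountVectorizer -> TfidfTransformer -> MultinomialNB steps applied to four
# categories of the 20 Newsgroups dataset.
# # Load dataset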
# categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
# newsgroups = fetch_20newsgroups(subset='all', categories=categories, remove=('headers',
# 'footers', 'quotes'))
# # Convert text data into feature vectors
# count_vect = CountVectorizer()
# X_counts = count_vect.fit_transform(newsgroups.data)
# tfidf_transformer = TfidfTransformer()
# X_tfidf = tfidf_transformer.fit_transform(X_counts)
# # Split data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X_tfidf, newsgroups.target, test_size=0.2,
# random_state=42)
# # Train Naïve Bayes Classifier
# nb_classifier = MultinomialNB()
# nb_classifier.fit(X_train, y_train)
# # Make predictions
# y_pred = nb_classifier.predict(X_test)
# # Calculate metrics
# accuracy = accuracy_score(y_test, y_pred)
# precision = precision_score(y_test, y_pred, average='weighted')
# recall = recall_score(y_test, y_pred, average='weighted')
# # Print results
# print(f"Accuracy: {accuracy:.4f}")
# print(f"Precision: {precision:.4f}")
# print(f"Recall: {recall:.4f}")


# Toy spam-detection demo: word counts -> TF-IDF weights -> multinomial
# Naive Bayes, evaluated on a held-out split of a five-message dataset.

# Sample dataset
data = {
    'text': [
        "Free entry in 2 a wkly comp to win FA Cup final tkts",
        "Nah I don't think he goes to usf, he lives around here",
        "WINNER!! As a valued network customer you have won a prize",
        "I'll call you later, don't worry",
        "URGENT! You have won a 1 week FREE membership"
    ],
    'label': [1, 0, 1, 0, 1]  # 1 = Spam, 0 = Not spam
}
df = pd.DataFrame(data)

# Split data. With five rows, test_size=0.2 holds out a single message, so
# every metric below is computed from one prediction only.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42
)

# Text vectorization: the vocabulary and the IDF weights are learned from the
# training split only, so nothing from the held-out message leaks in.
cv = CountVectorizer()
X_train_cv = cv.fit_transform(X_train)

tfidf = TfidfTransformer()
X_train_tfidf = tfidf.fit_transform(X_train_cv)

# Train model
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)

# Test data transform and predict: reuse the already-fitted transformers
# (transform, not fit_transform) so test features match the training columns.
X_test_tfidf = tfidf.transform(cv.transform(X_test))
y_pred = nb.predict(X_test_tfidf)

# Evaluation
acc = accuracy_score(y_test, y_pred)
# On a one-sample test split precision and/or recall can be 0/0 (no predicted
# or no actual positives). zero_division=0 reports 0.0 for that case, the same
# value the default produces, without emitting UndefinedMetricWarning.
prec = precision_score(y_test, y_pred, zero_division=0)
rec = recall_score(y_test, y_pred, zero_division=0)

print(f"Accuracy: {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall: {rec:.4f}")