# Source: UnsupervisedLearningPortfolio/principalComponentAnalysis_IrishDataset.py (Quantx73/UnsupervisedLearningPortfolio, main branch)
#loading libraries
import numpy as np
import matplotlib.pyplot as plt
from numpy.linalg import eig
from sklearn import datasets

#load the Iris dataset via sklearn.datasets; the returned object's .data,
#.target and .target_names attributes are read further down in this script
iris = datasets.load_iris()

#building PCA Algorithms
class PrincipalComponentAnalysis:
    """Principal component analysis on standardised (z-scored) features.

    ``fit`` standardises every feature, eigendecomposes the covariance matrix
    of the standardised data and stores the components sorted by decreasing
    eigenvalue. ``transform`` standardises new data with the statistics
    learned in ``fit`` and projects it onto the leading components.

    Parameters
    ----------
    num_components : int or None, optional
        Default number of components ``transform`` keeps. ``None`` keeps all
        of them.

    Attributes
    ----------
    mean, std : ndarray of shape (n_features,) or None
        Per-feature mean and standard deviation learned in ``fit``.
    cov_matrix : ndarray of shape (n_features, n_features) or None
        Covariance matrix of the standardised training data.
    eigenvalues : ndarray of shape (n_features,) or None
        Eigenvalues of ``cov_matrix`` in decreasing order.
    eigenvectors : ndarray of shape (n_features, n_features) or None
        Matching eigenvectors, one per column. The sign of each column is
        arbitrary.
    explained_variance_ratio : ndarray of shape (n_features,) or None
        ``eigenvalues`` divided by their sum.

    All attributes except ``num_components`` are ``None`` until ``fit``
    succeeds.
    """

    def __init__(self, num_components=None):
        self.num_components = num_components
        self.mean = None
        self.std = None
        self.cov_matrix = None
        self.eigenvalues = None
        self.eigenvectors = None
        self.explained_variance_ratio = None

    @staticmethod
    def _as_2d_array(X):
        """Coerce X to a float ndarray and require a 2-D (samples, features) shape."""
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features).")
        return X

    def fit(self, X):
        """Learn standardisation statistics and principal components from X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.

        Returns
        -------
        PrincipalComponentAnalysis
            ``self``, so calls can be chained.

        Raises
        ------
        ValueError
            If X is not 2-D, has fewer than two samples, or contains a
            feature with zero variance.
        """
        X = self._as_2d_array(X)
        if X.shape[0] < 2:
            raise ValueError("At least two samples are required to estimate the covariance.")

        # Work on locals and publish to self only at the end, so a failed fit
        # never leaves the model half-populated.
        mean = np.mean(X, axis=0)
        std = np.std(X, axis=0)
        if np.any(std == 0):
            raise ValueError("One or more features have zero variance.")
        X_std = (X - mean) / std

        # np.std uses the population formula (ddof=0) while np.cov uses the
        # sample formula (ddof=1), so the diagonal is n/(n-1) rather than 1.
        # This uniform scale cancels out in the explained-variance ratios.
        # atleast_2d keeps the single-feature case a (1, 1) matrix.
        cov_matrix = np.atleast_2d(np.cov(X_std.T))

        # The covariance matrix is symmetric, so eigh is the right routine:
        # it guarantees real eigenvalues and orthonormal eigenvectors, whereas
        # the general eig solver may return complex round-off noise.
        eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

        # Order components by decreasing eigenvalue.
        order = np.argsort(eigenvalues)[::-1]
        eigenvalues = eigenvalues[order]
        eigenvectors = eigenvectors[:, order]

        self.mean = mean
        self.std = std
        self.cov_matrix = cov_matrix
        self.eigenvalues = eigenvalues
        self.eigenvectors = eigenvectors
        self.explained_variance_ratio = eigenvalues / np.sum(eigenvalues)
        return self

    def transform(self, X, n_components=None):
        """Project X onto the leading principal components.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to project; must have the same number of features as the
            data passed to ``fit``.
        n_components : int or None, optional
            Number of leading components to keep. Defaults to
            ``num_components`` from the constructor, or to all components if
            that is ``None`` as well.

        Returns
        -------
        ndarray of shape (n_samples, n_components)
            Standardised data expressed in the principal-component basis.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        ValueError
            If X is not 2-D, its feature count differs from the fitted data,
            or ``n_components`` is negative or exceeds the feature count.
        """
        if self.eigenvectors is None:
            raise RuntimeError(
                "PrincipalComponentAnalysis must be fitted before calling transform()."
            )
        X = self._as_2d_array(X)
        n_features = self.eigenvectors.shape[0]
        if X.shape[1] != n_features:
            raise ValueError(
                f"X has {X.shape[1]} features, but the model was fitted with {n_features}."
            )

        if n_components is None:
            n_components = self.num_components if self.num_components is not None else X.shape[1]
        if n_components > X.shape[1]:
            raise ValueError("Number of components cannot be greater than the number of features.")
        # A negative count would silently slice off trailing components.
        if n_components < 0:
            raise ValueError("Number of components cannot be negative.")

        # Standardise with the statistics learned during fit, not those of X.
        X_std = (X - self.mean) / self.std
        return X_std @ self.eigenvectors[:, :n_components]

    def get_explained_variance_ratio(self):
        """Return the fraction of total variance carried by each component."""
        return self.explained_variance_ratio

    def get_covariance(self):
        """Return the covariance matrix of the standardised training data."""
        return self.cov_matrix

#feature matrix and class labels taken from the loaded dataset
X, y = iris.data, iris.target

#fit a two-component PCA on the features, then project the same samples
#onto the fitted components
pca = PrincipalComponentAnalysis(num_components=2)
pca.fit(X)
X_pca = pca.transform(X)

#side-by-side scatter plots: first two columns of the raw features (left)
#and of the PCA projection (right), one colour per class
plt.figure(figsize=(10, 4))
colors = ['navy', 'turquoise', 'darkorange']
panels = [
    (X, "Original Iris Data", "Feature 1", "Feature 2"),
    (X_pca, "Iris Data After PCA", "Principal Component 1", "Principal Component 2"),
]
for position, (points, title, x_label, y_label) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    #the loop index doubles as the class label compared against y and as
    #the index into iris.target_names
    for class_index, color in zip(range(np.unique(y).size), colors):
        members = points[y == class_index]
        plt.scatter(
            members[:, 0],
            members[:, 1],
            label=iris.target_names[class_index],
            color=color,
            s=20,
        )
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.grid(True, ls='--', alpha=0.5)
    plt.legend()

plt.tight_layout()
plt.show()

#plot cumulative explained variance against the number of components kept
cumulative_variance = np.cumsum(pca.get_explained_variance_ratio())
#x runs 1..k so each point lines up with "number of components"; plotting
#the array on its own would place the first component at x = 0
component_counts = np.arange(1, len(cumulative_variance) + 1)
plt.figure(figsize=(8, 4))
plt.plot(component_counts, cumulative_variance, marker='o')
#component counts are integers, so tick only at whole numbers
plt.xticks(component_counts)
plt.title('Cumulative Explained Variance')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.grid(True, ls='--', alpha=0.5)
plt.show()