Data-visualizer/Modified_LLE.py at main · iremozturk/Data-visualizer · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#!/usr/bin/env python
# coding: utf-8

# In[5]:


import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import offsetbox
import numpy as np
from time import time
from itertools import zip_longest

def _split_features_target(data_set):
    """Split a DataFrame into a feature frame and a target column.

    With two or more numeric columns, the features are every numeric column
    except the last and the target is the last numeric column. Otherwise the
    split is positional: every column except the last is a feature and the
    last column is the target.

    Args:
        data_set: The loaded pandas DataFrame.

    Returns:
        A ``(X, y)`` tuple: the feature DataFrame and the target Series.
    """
    numeric_cols = data_set.select_dtypes(include=['number']).columns.tolist()
    if len(numeric_cols) >= 2:
        return data_set[numeric_cols[:-1]], data_set[numeric_cols[-1]]
    return data_set.iloc[:, :-1], data_set.iloc[:, -1]


def _plot_embedding(embedding, target, title, ax):
    """Draw a min-max scaled 2-D embedding on ``ax``, coloured by ``target``.

    Args:
        embedding: Array with one row per sample and two columns.
        target: Numpy array with one value per sample, used for colouring.
        title: Title shown above the axes.
        ax: Matplotlib axes to draw on.
    """
    from sklearn.preprocessing import MinMaxScaler

    points = MinMaxScaler().fit_transform(embedding)

    def scatter_plain():
        # Uncoloured fallback used when the target cannot drive the colours.
        ax.scatter(points[:, 0], points[:, 1], s=20, alpha=0.6, c='blue')

    if len(target) > 0:
        try:
            classes = np.unique(target)
            if len(classes) <= 10:
                # Few distinct values: treat the target as categorical and
                # give every class its own colour.
                colors = plt.cm.tab10(np.linspace(0, 1, len(classes)))
                for color, cls in zip(colors, classes):
                    mask = target == cls
                    ax.scatter(points[mask, 0], points[mask, 1], c=[color],
                               s=20, alpha=0.6, label=f'Class {cls}')
                # The labels set above are only displayed through a legend.
                ax.legend(fontsize=8, loc='best')
            else:
                # Many distinct values: treat the target as continuous.
                scatter = ax.scatter(points[:, 0], points[:, 1], c=target,
                                     cmap='viridis', s=20, alpha=0.6)
                plt.colorbar(scatter, ax=ax)
        except (IndexError, TypeError, ValueError):
            # Best effort: a target that cannot be compared or mapped to
            # colours must not prevent the embedding from being shown.
            scatter_plain()
    else:
        scatter_plain()

    # Annotate at most 20 evenly spaced points with their row index so the
    # plot stays readable.
    n_labels = min(20, points.shape[0])
    if n_labels > 0:
        label_indices = np.linspace(0, points.shape[0] - 1, n_labels, dtype=int)
        for i in label_indices:
            ax.text(points[i, 0], points[i, 1], str(i), fontsize=7, alpha=0.7,
                    bbox=dict(boxstyle='round,pad=0.2', facecolor='white', alpha=0.7))

    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.grid(True, alpha=0.3)
    ax.set_xlabel('Component 1', fontsize=10)
    ax.set_ylabel('Component 2', fontsize=10)


def runModifiedLLE(csv_path='data_as_csv.csv', n_neighbors=30):
    """Embed a CSV dataset in 2-D with Modified LLE and plot the result.

    The CSV is loaded with pandas and split into features and a target (see
    ``_split_features_target``). The feature matrix is embedded with
    scikit-learn's ``LocallyLinearEmbedding`` using ``method="modified"``,
    the fit is timed, and the embedding is shown as a scatter plot coloured
    by the target.

    Args:
        csv_path: Path of the CSV file to load.
        n_neighbors: Number of neighbours considered for each point. It is
            reduced to ``n_rows - 1`` when the dataset has fewer rows than
            requested neighbours.

    Raises:
        ValueError: If the dataset has fewer than two feature columns, or too
            few rows for a two-component embedding.
    """
    from sklearn.manifold import LocallyLinearEmbedding

    n_components = 2

    data_set = pd.read_csv(csv_path)
    data_set.info()  # print a general summary of the dataset

    X, y = _split_features_target(data_set)

    # Convert once up front; the estimator and the plotting masks both work
    # on plain numpy arrays.
    features = X.to_numpy()
    target = y.to_numpy()

    n_rows, n_columns = features.shape
    if n_columns < n_components:
        raise ValueError(
            f"need at least {n_components} feature columns for a "
            f"{n_components}-component embedding, got {n_columns}"
        )

    # The neighbour count must stay strictly below the number of samples and
    # must not drop below the number of output components.
    effective_neighbors = min(n_neighbors, n_rows - 1)
    if effective_neighbors < n_components:
        raise ValueError(
            f"need at least {n_components + 1} rows for a "
            f"{n_components}-component embedding, got {n_rows}"
        )

    embeddings = {
        "Modified LLE": LocallyLinearEmbedding(
            n_neighbors=effective_neighbors,
            n_components=n_components,
            method="modified",
        ),
    }

    projections, timing = {}, {}
    for name, transformer in embeddings.items():
        print(f"{name}...")
        start_time = time()
        # Embed the feature matrix itself so every embedded row lines up
        # with its target value when the plot is coloured.
        projections[name] = transformer.fit_transform(features)
        timing[name] = time() - start_time

    fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(17, 24))

    # Pair each embedding with an axes; axes left without one are hidden.
    for name, ax in zip_longest(timing, axs.ravel()):
        if name is None:
            ax.axis("off")
            continue
        title = f"{name} (time {timing[name]:.3f}s)"
        _plot_embedding(projections[name], target, title, ax)

    plt.show()


    # In[ ]: