Datamining_Project/dataprep.py at master · jcg1183/Datamining_Project · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import math
import time

from objects import dataset
import settings

import numpy as np
import pandas as pd
from sklearn import datasets


# ***************************************************************
# Function:         ready_datasets
# Variables/input:  argparse.arguments object
# Output:           python list containing objests.dataset
# Usage/Purpose:    Function loads a dataset from a csv file
#                   or generates datasets using sklearn.dataset.
#                   CSV datasets are specified by command line
#                   argument and sklearn datasets are specified
#                   in settings.py.
# ***************************************************************
def ready_datasets(args):
    datasetReturn = []

    # load dataset from csv.  this has not been tested
    # the csv part could be abstracted into another function
    if args.dataset:
        dfCSV = pd.read_csv(args.dataset)
        datasetReturn.append(dataset(args.dataset[2:-4], dfCSV))

        settings.datasetTypes.insert(0, args.dataset[2:-4])

        print("dataset read from {0}".format(args.dataset[2:-4]))
        print(dfCSV.head(5))

    # loop through all sklearn dataset types and add a new
    # dataset object to the return list
    if args.generate:
        for name in settings.datasetTypes:
            datasetReturn.append(dataset(name, build_dataset(name)))

    return datasetReturn


# ***************************************************************
# Function:         build_dataset
# Variables/input:  string: name
# Output:           pandas dataframe
# Usage/Purpose:    Function builds and returns specified
#                   dataset.
# ***************************************************************
def build_dataset(name):
    # build all the dataset types here
    df = pd.DataFrame()

    # generate sklearn datasets
    if name == "circles":
        new_dataset = datasets.make_circles(
            n_samples=settings.maxSamples, factor=0.5, noise=0.05
        )

    elif name == "moons":
        new_dataset = datasets.make_moons(n_samples=settings.maxSamples, noise=0.05)

    elif name == "blobs":
        new_dataset = datasets.make_blobs(n_samples=settings.maxSamples, random_state=1)

    # random needs updating
    elif name == "random":
        # Fitting a pentagon in a square hole, needs updating asap
        random = np.random.uniform(low=0.0, high=15.0, size=(200, 2))
        df = pd.DataFrame(columns=["x1", "x2"])
        df.x1 = random[:, 0]
        df.x2 = random[:, 1]
        return df

    # convert to dataframe
    df = pd.DataFrame(new_dataset[0], columns=["x1", "x2"])

    # add cluster labels to dataframe
    df["y"] = new_dataset[1]

    return df


# ***************************************************************
# Function:         calculate_distances
# Variables/input:  objects.exp
# Output:           appends distance matrix to dataset
# Usage/Purpose:    Function takes a dataset and creates a
#                   distance matrix for that dataset.
# ***************************************************************
def calculate_distances(exp):
    print("calculate_distances")

    for ds in exp.datasets:
        df = ds.df

        getDistancesTimeStart = time.perf_counter()

        ds.distanceArray = np.zeros(
            [settings.maxSamples, settings.maxSamples], dtype=float
        )

        for i in range(settings.maxSamples):
            for j in range(i, settings.maxSamples):
                if i == j:
                    continue

                distance = math.sqrt(
                    math.pow(df.iloc[i]["x1"] - df.iloc[j]["x1"], 2)
                    + math.pow(df.iloc[i]["x2"] - df.iloc[j]["x2"], 2)
                )

                ds.distanceArray[i, j] = distance
                ds.distanceArray[j, i] = distance

        getDistancesTimeStop = time.perf_counter()

        print(
            "{0} get_distances time: {1:5.4}".format(
                ds.name, (getDistancesTimeStop - getDistancesTimeStart) * 100
            )
        )