# ProbClustDataGeneration.R
###########################################################################################################
#
# Cite this code:
#
# Ernst, A. F. (2017). Probabilistic Time Series Clustering by Vector Autoregressive Metric - R code.
# https://github.com/AnieBee/MasterThesis. GitHub.
#
#
########################################################################################################
library(portes) #varima.sim()
library(MASS) #mvrnorm()
set.seed(705)
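# Note: this script assumes the simulation design objects are already defined in the
# workspace (e.g. by a setup script sourced beforehand): nPersons, nClusters,
# ClusterSize, Observations, Distance, Replications, numberVaribales, numbPhiOffDiag
# and VariancePhi_kqs. None of them are defined below.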
nPhikqGenerated <- 0
nPhikqUsed <- 0
Phi_ks <- array(1, dim = c(numberVaribales, numberVaribales,
length(nPersons), # index:i
length(nClusters), # index:j
length(ClusterSize), # index: k
length(Observations), #index: l
length(Distance), # index: m
max(nClusters), #index n
Replications #index r
))
dims = (Observations[1] + 1)*nPersons[1]
ILcounter <- matrix(1:(length(nPersons)*length(Observations)),
nrow= length(nPersons), ncol= length(Observations), byrow= T)
# ILcounter[i, l] gives the list index into DataSets for the design cell with the i-th nPersons value and the l-th Observations value
# This allows array-like indexing of DataSets even though it is a list of arrays with unequal dimensions
for(i in 1:length(nPersons)){
for(l in 1:length(Observations)){
if(! (i == 1 & l == 1)){
dims2 = (Observations[l] + 1)*nPersons[i]
dims = c(dims, dims2)
}
}
}
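# Illustration (these design values are assumptions, not the actual design): with
# nPersons = c(30, 60) and Observations = c(25, 50), dims becomes
# c(26*30, 51*30, 26*60, 51*60) and ILcounter is matrix(1:4, nrow = 2, byrow = TRUE),
# so ILcounter[2, 1] == 3 and DataSets[[3]] will hold the nPersons = 60,
# Observations = 25 cell with 26*60 = 1560 rows (Observations[l] + 1 rows per person).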
DataSets <- vector("list", length(dims))
for(i in 1:length(dims)){
DataSets[[i]] <- array(i, dim = c(dims[i],
numberVaribales + 1 , # numberVariables + 1 col for true cluster membership
# length(nPersons), # index:i
length(nClusters), # index:j
length(ClusterSize), # index: k
# length(Observations), #index: l
length(Distance), # index: m
Replications #index r
))
}
# Main data generation loop ----
for(i in 1:length(nPersons)){
for(j in 1:length(nClusters)){
for(k in 1:length(ClusterSize)){ #Determine # of persons in cluster = nPerClust
nPerClust <- matrix((1-ClusterSize[k])/(nClusters[j]-as.numeric(ClusterSize[k]!=0)), nrow=nClusters[j], ncol=1)
#nPerClust is vector of length nClusters[j]
#contains the remaining percentages distributed equally
if(ClusterSize[k]){ # if the first cluster has a different percentage, replace the equal percentage with it
nPerClust[1] = ClusterSize[k]
}
nPerClust = nPerClust*nPersons[i]
if(nPerClust[1] == 7.5){
nPerClust[sample(1:4, 2, replace = FALSE)] = 8
nPerClust = floor(nPerClust)
}
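# Worked example (assuming a design cell with nPersons = 30, nClusters = 4 and
# ClusterSize = 0, i.e. equal cluster sizes): nPerClust starts as rep(.25, 4), is
# scaled to rep(7.5, 4), and the special case above then rounds two randomly chosen
# clusters up to 8 and floors the remaining two to 7, so the sizes again sum to 30.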
cat("Pers", nPersons[i], "nClust", nClusters[j], "percentage", ClusterSize[k], nPerClust,"\n")
for(l in 1:length(Observations)){ # Observations
for(m in 1:length(Distance)){ # Distance between clusters
for(r in 1:Replications){
# SunnySideUpData holds the data for all individuals of one cluster after another.
# So that individuals of the same cluster do not appear consecutively in the final
# data set, persons are shuffled into random order. The result is ScrambledData,
# the final data set for this condition, which is saved in DataSets.
SunnySideUpData <- array(NA,
dim= c((Observations[l] + 1), numberVaribales + 1, nPersons[i]
# numberVaribales + 1 so you can save which cluster an individual came from
# Observations[l] + 1 because the first Y and the last X are not used
))
ScrambledData <- array(NA, dim= c((Observations[l] + 1)*nPersons[i],
numberVaribales + 1 # +1 so you can save which cluster an individual came from
))
PeopleAlreadyInThisDataSet <- 0
for(n in 1:nClusters[j]){ # For every cluster determine Phi_k --------
cat(i, j , k, l, m, r, n , nClusters[j], Distance[m], Observations[l], "\n")
Phi_k <- matrix(NA, ncol = numberVaribales, nrow = numberVaribales)
diag(Phi_k) <- runif(numberVaribales, min= .7, max= .9)
if(Distance[m] == 2){ # similar-clusters condition
Phi_k[upper.tri(Phi_k)] <- sample(c(runif(numbPhiOffDiag, min= .3, max= .5),
runif(numbPhiOffDiag, min= 0, max= .2)),
numbPhiOffDiag)
Phi_k[lower.tri(Phi_k)] <- sample(c(runif(numbPhiOffDiag, min= .3, max= .5),
runif(numbPhiOffDiag, min= 0, max= .2)),
numbPhiOffDiag)
# rescaling to ensure stationarity
Phi_k = Phi_k*(.99/max(abs(eigen(Phi_k)$values)))
}else{ # highly similar or dissimilar clusters
Phi_k[upper.tri(Phi_k)] <- runif(numbPhiOffDiag, min= .3, max= .5)
Phi_k[lower.tri(Phi_k)] <- runif(numbPhiOffDiag, min= .3, max= .5)
# rescaling to ensure stationarity
Phi_k = Phi_k*(.99/max(abs(eigen(Phi_k)$values)))
if(Distance[m] == 3){ # highly dissimilar: half of the cross-regressive coefficients are assigned negative signs
indexNegNumbs <- c(sample(x = c(which(upper.tri(Phi_k)), which(lower.tri(Phi_k))), size = numbPhiOffDiag, replace = FALSE))
Phi_k[indexNegNumbs] <- Phi_k[indexNegNumbs]*(-1)
}
} #------
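# Why the rescaling above ensures stationarity (a brief note, with a made-up number):
# multiplying Phi_k by a scalar scales all of its eigenvalues by the same factor, so
# if the largest eigenvalue modulus were e.g. 1.3, multiplying by .99/1.3 brings the
# spectral radius down to .99 < 1, which makes the VAR(1) process implied by Phi_k
# stationary. In the Distance == 3 condition the sign flips are applied after the
# rescaling; stationarity of each person's Phi_kq is enforced again in the loop below.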
Phi_ks[ , , i, j, k, l, m, n, r] <- Phi_k # save Phi_k
Sigma <- diag(numberVaribales)
#Sigma is the variance/covariance of the white noise series: set to identity
for(q in 1:nPerClust[n]){# n gives the number of current evaluated cluster
cat(q, nPerClust[n], "\n") # q starts at 1 for new cluster, q is used as counter for people in the current cluster
Phi_kq <- array(1, dim = c(numberVaribales, numberVaribales, 1))
while( sum(abs(Phi_kq) >= 1) || max(Mod(eigen(Phi_kq[ , ,1], symmetric = FALSE, only.values = TRUE)$values)) >= 1 ){ # create within-cluster variation: sample each participant's individual Phi from a multivariate normal centered at Phi_k
# Do not leave the loop until all elements of Phi_kq have an absolute value smaller than 1
# and the largest modulus of the eigenvalues of Phi_kq is smaller than 1
Phi_kq <- array(mvrnorm(n = 1, mu = as.vector(Phi_k),
Sigma = (diag(numberVaribales*numberVaribales)*VariancePhi_kqs),
empirical = FALSE),
dim = c(numberVaribales, numberVaribales, 1))
nPhikqGenerated = nPhikqGenerated + 1
}
nPhikqUsed = nPhikqUsed + 1
# the assertions below are always met because of the while loop above:
# stopifnot(max(Mod(eigen(Phi_kq[ , ,1], symmetric = FALSE, only.values = TRUE)$values)) < 1 )
# stopifnot( sum(abs(Phi_kq)>= 1) == 0)
# varima.sim() version 2.1-4 https://github.com/cran/portes/blob/master/R/varima.sim.R
# intercepts are not specified, default value of zero is used
Data <- varima.sim(list(ar = Phi_kq, ma = NULL), n = (Observations[l] +1), k = numberVaribales,
sigma = Sigma)
# Data is a matrix: rows hold the (Observations[l] + 1) time points, columns hold the numberVaribales variables
Data <- cbind(Data, rep(n, (Observations[l] +1)))
SunnySideUpData[ , , PeopleAlreadyInThisDataSet + q] <- Data
}
PeopleAlreadyInThisDataSet <- PeopleAlreadyInThisDataSet + q
}
# Scramble SunnySideUpData and save it in DataSets so persons appear in random order in the final data set.
# apply(..., 2, cbind) stacks the (Observations[l] + 1)-row blocks of the permuted persons on top of each other, one variable column at a time.
IndexDataScramble <- c(sample(1:nPersons[i], nPersons[i], replace = FALSE))
ScrambledData <- apply(SunnySideUpData[ , , IndexDataScramble], 2, cbind)
DataSets[[ILcounter[i, l]]][ , , j, k, m, r] <- ScrambledData
cat(i, j, k, l, m, n , r, "\n")
}
}
}
}
}
}
save(DataSets, file= "data/ProbClustDataGenerationDataSets.RData")
save(Phi_ks, file= "data/ProbClustDataGenerationPhi_ks.RData")
PhikqProportion <- list("nPhikqGenerated" = nPhikqGenerated, "nPhikqUsed" = nPhikqUsed)
save(PhikqProportion, file= "data/PhikqProportion.RData")
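# Minimal usage sketch (the object names and indices below are illustrative only and
# assume this is run right after the script, while ILcounter, numberVaribales and the
# design vectors are still in the workspace):
# exampleCell <- DataSets[[ILcounter[1, 1]]][ , , 1, 1, 1, 1]
# # columns 1:numberVaribales hold the simulated series, the last column the true
# # cluster membership; rows come in blocks of (Observations[1] + 1) per person
# examplePhi <- Phi_ks[ , , 1, 1, 1, 1, 1, 1, 1] # Phi_k of cluster 1 in that design cell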