# Predictive-Modeling-and-Bayesian-Inference-in-R: code.R
# Author: Akbar Latif
#' neg_log_lik
#'
#' @description Evaluate the negated Gaussian log-likelihood of
#'   `Actual_Weight` for model A and B. Both models use the mean
#'   `beta[1] + beta[2] * CAD_Weight`; they differ only in the variance.
#' @param beta A vector with the four beta parameters
#' @param data A `data.frame` with the same variables as the `filament1`
#'   data set. Must have columns `CAD_Weight` and `Actual_Weight`
#' @param model `"A"` for the log-linear variance model,
#'   `exp(beta[3] + beta[4] * CAD_Weight)`. Any other value selects the
#'   proportional scaling error model B,
#'   `exp(beta[3]) + exp(beta[4]) * CAD_Weight^2`
#' @return The negated log-likelihood summed over all rows of `data`

neg_log_lik <- function(beta, data, model) {
  cad <- data[["CAD_Weight"]]
  expected <- beta[1] + beta[2] * cad

  # The two models share the mean and differ only in the variance.
  if (model == "A") {
    variance <- exp(beta[3] + beta[4] * cad)
  } else {
    variance <- exp(beta[3]) + exp(beta[4]) * (cad^2)
  }

  log_dens <- dnorm(
    data[["Actual_Weight"]],
    mean = expected,
    sd = sqrt(variance),
    log = TRUE
  )
  -sum(log_dens)
}

#' filament1_estimate
#'
#' @description Estimate filament models with different variance structure
#'   by minimising `neg_log_lik()` numerically with `optim()`.
#' @param data A `data.frame` with the same variables as the `filament1`
#'   data set. Must have columns `CAD_Weight` and `Actual_Weight`
#' @param model Either "A" for a log-linear variance model, or "B" for a
#'   proportional scaling error model
#' @return A list of class `"filament1_estimate"` with elements `model`
#'   (the matched model name), `par` (the parameter vector returned by
#'   `optim()`) and `hessian` (the Hessian returned by `optim()`)

filament1_estimate <- function(data, model) {
  model <- match.arg(model, c("A", "B"))

  # Model-specific starting values for the optimiser.
  beta_start <- switch(model,
    A = c(-0.1, 1.07, -2, 0.05),
    B = c(-0.15, 1.07, -13.5, -6.5)
  )

  opt <- optim(
    par = beta_start,
    fn = neg_log_lik,
    data = data,
    model = model,
    method = "Nelder-Mead",
    control = list(maxit = 5000),
    hessian = TRUE
  )

  structure(
    list(model = model, par = opt$par, hessian = opt$hessian),
    class = c("filament1_estimate", "list")
  )
}

#' filament1_aux_EV
#'
#' @description Evaluate the expectation and variance for model A and B
#' @param beta A vector with the beta parameters
#' @param data A `data.frame` containing the required predictors,
#'   including `CAD_Weight`
#' @param model Either "A" for a log-linear variance model, or "B" for a
#'   proportional scaling error model
#' @param Sigma_beta If not NULL, an estimate of the covariance matrix for
#'   the uncertainty of estimated betas
#' @return A list with four elements:
#'   E : E(y|beta,x)
#'   V : Var(y|beta,x)
#'   VE : Var(E(y|beta,x)|x) or NULL
#'   EV : E(Var(y|beta,x)|x) or NULL

filament1_aux_EV <- function(beta, data, model = c("A", "B"),
                             Sigma_beta = NULL) {
  model <- match.arg(model)
  has_cov <- !is.null(Sigma_beta)

  # Design matrix for the expectation: the same in both models.
  mean_design <- model.matrix(~ 1 + CAD_Weight, data = data)
  # Design matrix for the variance: this is where the models differ.
  if (model == "A") {
    var_design <- model.matrix(~ 1 + CAD_Weight, data = data)
  } else {
    var_design <- model.matrix(~ 1 + I(CAD_Weight^2), data = data)
  }

  # Zero-pad both designs so each can multiply the full beta vector.
  ZE <- cbind(mean_design, var_design * 0)
  ZV <- cbind(mean_design * 0, var_design)

  if (model == "A") {
    V <- exp(ZV %*% beta)
    # E(Var(y|beta,x)|x)
    EV <- if (has_cov) {
      exp(ZV %*% beta + rowSums(ZV * (ZV %*% Sigma_beta)) / 2)
    } else {
      NULL
    }
  } else {
    V <- ZV %*% exp(beta)
    # E(Var(y|beta,x)|x)
    # (pmin caps each diagonal entry of Sigma_beta at 0.5^2, so large
    # Sigma_beta values are ignored)
    EV <- if (has_cov) {
      ZV %*% exp(beta + pmin(0.5^2, diag(Sigma_beta)) / 2)
    } else {
      NULL
    }
  }

  # Var(E(y|beta,x)|x): identical expression for both models.
  VE <- if (has_cov) rowSums(ZE * (ZE %*% Sigma_beta)) else NULL

  list(
    E = ZE %*% beta,
    V = V,
    VE = VE,
    EV = EV
  )
}

#' filament1_predict
#'
#' @description Fit the chosen model to `data` with `filament1_estimate()`
#'   and compute predictive means, standard deviations and 95% prediction
#'   intervals for `newdata`.
#' @param data the data the model is estimated from
#' @param model which model is being used, "A" or "B"
#' @param newdata the data for which predictions are computed
#'
#' @return A `data.frame` with columns `mean`, `sd`, `lwr` and `upr`

filament1_predict <- function(data, model = c("A", "B"), newdata) {
  fit <- filament1_estimate(data, model)

  # The inverse of the estimated Hessian is passed on as the covariance
  # estimate for the betas.
  beta_cov <- solve(fit$hessian)
  moments <- filament1_aux_EV(fit$par, newdata, model, beta_cov)

  # Predictive variance: E(Var(y|beta,x)|x) + Var(E(y|beta,x)|x)
  pred_sd <- sqrt(moments$EV + moments$VE)

  # Compute 95% prediction intervals
  alpha <- 0.05
  z <- qnorm(1 - (alpha / 2))
  lower <- moments$E - z * pred_sd
  upper <- moments$E + z * pred_sd

  data.frame(
    mean = moments$E,
    sd = pred_sd,
    lwr = c(lower),
    upr = c(upper)
  )
}

#' square_error_score
#'
#' @description Add the squared error score, `(Actual_Weight - mean)^2`,
#'   to every row of a prediction data frame.
#' @param prediction a data frame with columns `Actual_Weight` and `mean`
#' @return the prediction data frame with a new column `se`
square_error_score <- function(prediction) {
  # Return the pipeline result directly: ending the function on an
  # assignment would hand the value back invisibly, so nothing would
  # print when the function is called at the console.
  prediction %>%
    mutate(se = (Actual_Weight - mean)^2)
}


#' ds_score
#'
#' @description Add the Dawid-Sebastiani score,
#'   `(Actual_Weight - mean)^2 / sd^2 + 2 * log(sd)`, to every row of a
#'   prediction data frame.
#' @param prediction a data frame with columns `Actual_Weight`, `mean`
#'   and `sd`
#' @return the prediction data frame with a new column `ds`
ds_score <- function(prediction) {
  # Return the pipeline result directly: ending the function on an
  # assignment would hand the value back invisibly, so nothing would
  # print when the function is called at the console.
  prediction %>%
    mutate(ds = (Actual_Weight - mean)^2 / sd^2 + 2 * log(sd))
}

#' leave1out
#'
#' @description Leave-one-out prediction: for every row, the model is
#'   estimated from all other rows with `filament1_predict()` and used to
#'   predict the held-out row. The squared error and Dawid-Sebastiani
#'   scores of those predictions are then added.
#' @param data the data; must have columns `CAD_Weight` and `Actual_Weight`
#' @param model which model is being used, "A" or "B"
#' @return the data frame with the new columns `mean`, `sd`, `se` and `ds`

leave1out <- function(data, model = c("A", "B")) {
  # Placeholder columns fix the column order: mean, sd, se, ds.
  scored <- data %>%
    mutate(mean = NA_real_, sd = NA_real_, se = NA_real_, ds = NA_real_)

  for (held_out in seq_len(nrow(scored))) {
    loo_pred <- filament1_predict(
      data = scored[-held_out, , drop = FALSE],
      model = model,
      newdata = scored[held_out, ]
    )
    scored[held_out, "mean"] <- loo_pred$mean
    scored[held_out, "sd"] <- loo_pred$sd
  }

  scored %>%
    mutate(
      se = (Actual_Weight - mean)^2,
      ds = (Actual_Weight - mean)^2 / sd^2 + 2 * log(sd)
    )
}

#' Monte_p
#'
#' @description Compute p values using Monte Carlo methods. The pairwise
#'   score differences `dataA - dataB` are given random signs `N` times;
#'   each p value is the proportion of randomised mean differences that
#'   are strictly greater than the observed mean difference.
#' @param dataA the first set of scores, with columns `se` and `ds`
#' @param dataB the second set of scores, with columns `se` and `ds`
#' @param N the number of random sign assignments to be used
#' @return a one-row data frame with the computed `se` and `ds` p values

Monte_p <- function(dataA, dataB, N) {
  # Pairwise score differences between the two sets of scores
  score_diff <- data.frame(
    se = dataA$se - dataB$se,
    ds = dataA$ds - dataB$ds
  )
  n_pairs <- nrow(score_diff)

  # Observed test statistics: the mean differences
  observed <- score_diff %>%
    summarise(se = mean(se), ds = mean(ds))

  # Test statistics under random sign flips of the differences
  randomised <- data.frame(se = numeric(N), ds = numeric(N))
  for (j in seq_len(N)) {
    flip <- sample(c(-1, 1), size = n_pairs, replace = TRUE)
    randomised[j, ] <- score_diff %>%
      summarise(se = mean(flip * se), ds = mean(flip * ds))
  }

  randomised %>%
    summarise(
      se = mean(se > observed$se),
      ds = mean(ds > observed$ds)
    )
}


#' arch_loglike
#'
#' @description Compute the combined log-likelihood of the observations
#'   `y`, each treated as a Binomial(N, phi) count, for every row of
#'   `data`. Any number of observations is supported.
#'
#' @param data a data frame with columns `N` and `phi`; each row is one
#'   (N, phi) pair at which the log-likelihood is evaluated
#' @param y the vector of observations
#' @return a numeric vector with one log-likelihood value per row of
#'   `data`

arch_loglike <- function(data, y) {
  N <- data$N
  # `$` is kept on purpose: on a plain data.frame it partially matches,
  # so a column named e.g. `phi_samples` is still picked up.
  phi <- data$phi

  n_obs <- length(y)
  y_total <- sum(y)

  # Sum over the observations of lgamma(N - y_i + 1), accumulated one
  # observation at a time so the result stays vectorised over the rows
  # of `data`.
  log_fact_remaining <- 0
  for (y_i in y) {
    log_fact_remaining <- log_fact_remaining + lgamma(N - y_i + 1)
  }

  # Sum of the log binomial coefficients plus the phi terms
  log_likelihood <- n_obs * lgamma(N + 1) - sum(lgamma(y + 1)) -
    log_fact_remaining +
    y_total * log(phi) + (n_obs * N - y_total) * log(1 - phi)

  return(log_likelihood)
}


#' estimate
#'
#' @description Monte Carlo estimates of p(y), E(N|y) and E(phi|y). `K`
#'   pairs (N, phi) are sampled from the prior distributions and weighted
#'   by their likelihood from `arch_loglike()`.
#' @param y the observations
#' @param xi the probability parameter of the Geometric prior for N
#'   (passed to `rgeom()`)
#' @param a first shape parameter of the Beta prior for phi
#' @param b second shape parameter of the Beta prior for phi
#' @param K the number of samples
#' @return a list with elements `p_y`, `E_N_given_y` and `E_phi_given_y`

estimate <- function(y, xi, a, b, K) {
  # Sample from the prior distributions
  N_samples <- rgeom(K, xi)
  phi_samples <- rbeta(K, a, b)

  # Name the column `phi` explicitly. An unnamed `phi_samples` column is
  # only found by `data$phi` through partial matching of `$`, which a
  # tibble does not do.
  prior_draws <- data.frame(N = N_samples, phi = phi_samples)

  # Likelihood of y at every sampled (N, phi) pair
  likelihood <- exp(arch_loglike(prior_draws, y))

  # Normalise the likelihoods into weights that sum to one
  total_likelihood <- sum(likelihood)
  weights <- likelihood / total_likelihood

  list(
    # Monte Carlo estimate of p(y): the average likelihood over the draws
    p_y = total_likelihood / K,
    # Weighted averages of the sampled values
    E_N_given_y = sum(N_samples * weights),
    E_phi_given_y = sum(phi_samples * weights)
  )
}