diff --git a/NEWS b/NEWS index 5ee56f26..f5fa3cff 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,10 @@ +# 5.8.2 +- Bugfix: `ldiversity()` now computes distinct l-diversity measure correctly in case of `NAs` in Keyvars + * improved/simplified uderlying C++ code + * added Unit-Tests for `ldiversity()` +- Fixing some Header-Definitions for CRAN-Compliance +- Updated Unit-Tests for `pram()` + # 5.8.1 - New AI-assisted anonymization features: + `AI_createSdcObj()`: LLM-assisted variable classification into SDC roles diff --git a/R/measure_risk.R b/R/measure_risk.R index 2ea565ab..29cd29b8 100644 --- a/R/measure_risk.R +++ b/R/measure_risk.R @@ -306,8 +306,18 @@ measure_riskWORK <- function(data, keyVars, w=NULL, missing=-999, hid=NULL, max_ #' @param missing a integer value to be used as missing value in the C++ routine #' @param ldiv_index indices (or names) of the variables used for l-diversity #' @param l_recurs_c l-Diversity Constant -ldiversity <- function(obj, ldiv_index=NULL, l_recurs_c=2, missing=-999, ...) { - ldiversityX(obj=obj, ldiv_index=ldiv_index, l_recurs_c=l_recurs_c, missing=missing, ...) +ldiversity <- function(obj, + ldiv_index = NULL, + l_recurs_c = 2, + missing = -999, + ...) { + ldiversityX( + obj = obj, + ldiv_index = ldiv_index, + l_recurs_c = l_recurs_c, + missing = missing, + ... + ) } setGeneric("ldiversityX", function(obj, ldiv_index=NULL, l_recurs_c=2, missing=-999, ...) { @@ -321,23 +331,27 @@ definition=function(obj, ldiv_index=NULL, l_recurs_c=2, missing=-999) { n <- obj@manipNumVars s <- obj@manipStrataVar ldiv_index <- ldiv_index - if ( is.null(ldiv_index) ) { + if (is.null(ldiv_index)) { sensVar <- get.sdcMicroObj(obj, "sensibleVar") - if ( is.null(sensVar) ) { + if (is.null(sensVar)) { err <- paste0("You need to specify argument 'sensibleVar' in 'createSdcObj()'") - err <- paste0(err, " or specify it directly (argument 'ldiv_index') so that the") + err <- paste0(err, + " or specify it directly (argument 'ldiv_index') so that the") err <- paste0(err, " ldiversity risk-measure can be calculated!\n") stop(err) } else{ ldiv_index <- sensVar } } - if (!is.null(k)) + if (!is.null(k)) { o[, colnames(k)] <- k - if (!is.null(n)) + } + if (!is.null(n)) { o[, colnames(n)] <- n - if (!is.null(s)) + } + if (!is.null(s)) { o$sdcGUI_strataVar <- s + } kV <- colnames(obj@origData)[get.sdcMicroObj(obj, "keyVars")] obj@risk$ldiversity <- ldiversityWORK( data = o, @@ -371,48 +385,83 @@ ldiversityWORK <- function(data, keyVars, ldiv_index, missing=-999, l_recurs_c=2 stop("Please define valid key variables", call. = FALSE) } } + + # Index of sensitive variable(s) if (!is.null(ldiv_index)) { if (is.numeric(ldiv_index)) { ldiv_var <- colnames(data)[ldiv_index] - ldiv_index <- length(variables) + 1:length(ldiv_index) } else if (is.character(ldiv_index)) { ldiv_var <- ldiv_index - ldiv_index <- length(variables) + 1:length(ldiv_index) } - if (any(ldiv_var %in% variables)) + + # Calculate the 1-based index for the C++ matrix (KeyVars + SensVars) + ldiv_index_cpp <- length(variables) + 1:length(ldiv_index) + + if (any(ldiv_var %in% variables)) { stop("Sensitivity variable should not be a keyVariable") - } else ldiv_var <- character(0) + } + } else { + ldiv_var <- character(0) + ldiv_index_cpp <- -99 + } + # Prep data (factors/strings -> numeric) n_key_vars <- length(variables) dataX <- data[, c(variables, ldiv_var), drop=FALSE] for (i in 1:ncol(dataX)) { - if (!is.numeric(dataX[, i])) - dataX[, i] <- as.numeric(unlist(dataX[, i])) + if (!is.numeric(dataX[, i])) { + dataX[, i] <- as.numeric(as.factor(dataX[, i])) + } } dataX <- as.matrix(dataX) - ind <- do.call(order, data.frame(dataX)) - dataX <- dataX[ind, , drop=FALSE] - ind <- order(c(1:nrow(dataX))[ind]) - if (is.null(ldiv_index)) - ldiv_index=-99 - if (length(ldiv_index) > 5) + + # Order data for C++ Function + # Matrix is ordered in a way so that NAs are grouped together for the C++ group-matching + # na.last = TRUE ensures that NAs appear at the end of their respective groups + ind <- do.call(order, c(as.data.frame(dataX), list(na.last = TRUE))) + dataX_sorted <- dataX[ind, , drop = FALSE] + + # We need an index to be able to restore original order after + # calling the c++ function + back_ind <- order(ind) + + # Call C++ function + if (length(ldiv_index_cpp) > 5) { stop("Maximal number of sensitivity variables is 5") - res <- measure_risk_cpp(dataX, 0, n_key_vars, l_recurs_c, ldiv_index, missing) - res$Fk <- res$Res[, 3] - res$Res <- res$Res[ind, ] - if (all(ldiv_index != -99)) { - res$Mat_Risk <- res$Mat_Risk[ind, ] - names(res)[names(res) == "Mat_Risk"] <- "ldiversity" - colnames(res$ldiversity) <- c(paste(rep(ldiv_var, each=3), rep(c("Distinct_Ldiversity", - "Entropy_Ldiversity", "Recursive_Ldiversity"), length(ldiv_index)), sep="_"), - "MultiEntropy_Ldiversity", "MultiRecursive_Ldiversity") + } + + res <- measure_risk_cpp( + data = dataX_sorted, + weighted_R = 0, + n_key_vars_R = n_key_vars, + l_recurs_c_R = l_recurs_c, + ldiv_index_R = ldiv_index_cpp, + missing_value_R = missing + ) + + # Re-order results back to original order + res$Fk <- res$Res[back_ind, 3] + + if (all(ldiv_index_cpp != -99)) { + # Reorder the risk matrix to match original data input + ldiv_mat <- res$Mat_Risk[back_ind, , drop = FALSE] + + # Specifiy column names + col_names <- c(paste(rep(ldiv_var, each = 3), rep( + c( + "Distinct_Ldiversity", + "Entropy_Ldiversity", + "Recursive_Ldiversity" + ), length(ldiv_var)), sep = "_"), + "MultiEntropy_Ldiversity", + "MultiRecursive_Ldiversity") + colnames(ldiv_mat) <- col_names + res_final <- ldiv_mat } else { - res <- res[names(res) != "Mat_Risk"] + res_final <- res$Res[back_ind, ] } - ind <- order(res$Res[, 1], decreasing=TRUE) - res <- res$ldiversity - class(res) <- "ldiversity" - invisible(res) + class(res_final) <- "ldiversity" + invisible(res_final) } #' Print method for objects of class measure_risk diff --git a/src/Framework.h b/src/Framework.h index 6cd1bcc5..3b7a7112 100644 --- a/src/Framework.h +++ b/src/Framework.h @@ -48,7 +48,7 @@ typedef int BOOL; // ============================= Display Messages ================================= inline extern char g_TxtBuffer[1024]; // character TxtBufferfer to display messages -static int OS_Printf(const char *Str, ...); +inline int OS_Printf(const char *Str, ...); // ============================= Assert ================================= #ifdef _DEBUG @@ -252,16 +252,16 @@ inline extern int g_NbNew; #endif #endif // _MSC_VER -static char *Strncpy(char *Dst, const char *Src, int Max, BOOL Warn = TRUE); -static char *ReplaceChar(char *Str, char OldChar, char NewChar); -static char *Stristr(char *Ptr, char *SubString, BOOL LeaveAfter = FALSE, BOOL ReturnNULL = TRUE); +inline static char *Strncpy(char *Dst, const char *Src, int Max, BOOL Warn = TRUE); +inline static char *ReplaceChar(char *Str, char OldChar, char NewChar); +inline static char *Stristr(char *Ptr, char *SubString, BOOL LeaveAfter = FALSE, BOOL ReturnNULL = TRUE); - //=== Parsing -static char *RemoveComment(char *Ptr, int Size = -1); // remove text between /* & */ -static char *GoToNextLine(char *Ptr); // renvoie Ptr avanc� jusqu'apr�s le '\n' suivant -static char *GoTo1stChar(char *Ptr); -static char *ParseString(char *Ptr, char *Str, int Size, BOOL AdvanceTo1stChar = TRUE); -static char *ParseLine(char *Ptr, char *Str, int Size, BOOL AdvanceTo1stChar = TRUE); +//=== Parsing +inline static char *RemoveComment(char *Ptr, int Size = -1); // remove text between /* & */ +inline static char *GoToNextLine(char *Ptr); // returns the ptr moved forward to the character right after the next newline +inline static char *GoTo1stChar(char *Ptr); +inline static char *ParseString(char *Ptr, char *Str, int Size, BOOL AdvanceTo1stChar = TRUE); +inline static char *ParseLine(char *Ptr, char *Str, int Size, BOOL AdvanceTo1stChar = TRUE); //============================================= Time function @@ -284,7 +284,7 @@ int gettimeofday(struct timeval *tv, struct timezone *tz); #endif // _MSC_VER -static uint TimeGetMilliSecond(void); +inline static uint TimeGetMilliSecond(void); // ============================= CTooFile ============================= class CTooFile @@ -618,7 +618,7 @@ inline int SubMain(int argc, char *argv[]) #include #endif -char g_TxtBuffer[1024]; // character TxtBufferfer to display messages +inline char g_TxtBuffer[1024]; // character TxtBufferfer to display messages int OS_Printf(const char *Str, ...) { @@ -703,10 +703,7 @@ int stricmp(char *str1, char *str2) } #endif // _MSC_VER -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-function" - -char *Strncpy(char *Dst, const char *Src, int Max, BOOL Warn) +inline char *Strncpy(char *Dst, const char *Src, int Max, BOOL Warn) { if (Max > 0) { @@ -716,7 +713,7 @@ char *Strncpy(char *Dst, const char *Src, int Max, BOOL Warn) return Dst; } -char *ReplaceChar(char *Str, char OldChar, char NewChar) +inline char *ReplaceChar(char *Str, char OldChar, char NewChar) { char *Ret = Str; @@ -731,7 +728,7 @@ char *ReplaceChar(char *Str, char OldChar, char NewChar) return Ret; } -char *Stristr(char *Ptr, char *SubString, BOOL LeaveAfter, BOOL ReturnNULL) +inline char *Stristr(char *Ptr, char *SubString, BOOL LeaveAfter, BOOL ReturnNULL) { int l = (int) strlen(SubString); @@ -753,7 +750,7 @@ char *Stristr(char *Ptr, char *SubString, BOOL LeaveAfter, BOOL ReturnNULL) } ///============================================= Parsing -char *RemoveComment(char *Ptr, int Size) +inline char *RemoveComment(char *Ptr, int Size) { if (Size < 0) Size = (int) strlen(Ptr) + 1; @@ -804,7 +801,7 @@ char *RemoveComment(char *Ptr, int Size) } -char *GoToNextLine(char *Ptr) +inline char *GoToNextLine(char *Ptr) { ASSERT(Ptr != NULL); @@ -829,7 +826,7 @@ char *GoToNextLine(char *Ptr) } -char *GoTo1stChar(char *Ptr) +inline char *GoTo1stChar(char *Ptr) { while ((*Ptr == ' ' || *Ptr == '\t') && *Ptr != 0 && *Ptr != '\r' && *Ptr != '\n') ++Ptr; @@ -838,7 +835,7 @@ char *GoTo1stChar(char *Ptr) } -char *ParseString(char *Ptr, char *Str, int Size, BOOL AdvanceTo1stChar) +inline char *ParseString(char *Ptr, char *Str, int Size, BOOL AdvanceTo1stChar) { //BOOL Warn = FALSE; int i = 0; @@ -892,7 +889,7 @@ char *ParseString(char *Ptr, char *Str, int Size, BOOL AdvanceTo1stChar) } -char *ParseLine(char *Ptr, char *Str, int Size, BOOL AdvanceTo1stChar) +inline char *ParseLine(char *Ptr, char *Str, int Size, BOOL AdvanceTo1stChar) { //BOOL Warn = FALSE; int i = 0; @@ -987,7 +984,7 @@ int gettimeofday(struct timeval *tv, struct timezone *tz) #endif // _MSC_VER -uint TimeGetMilliSecond(void) +inline uint TimeGetMilliSecond(void) { struct timeval tv; @@ -996,8 +993,6 @@ uint TimeGetMilliSecond(void) return (tv.tv_sec & 0x000FFFFFF) * 1000 + tv.tv_usec / 1000; } -#pragma GCC diagnostic pop - // =============================================================================== // // CTooFile diff --git a/src/Measure_Risk.h b/src/Measure_Risk.h index bd8c04d6..a537a4c3 100644 --- a/src/Measure_Risk.h +++ b/src/Measure_Risk.h @@ -1,4 +1,3 @@ - /** * Stata plugin for computing risk of data disclosure * @@ -40,6 +39,24 @@ * */ +#include +#include + +/* Helper for NA-aware key comparison to match R's order(..., na.last=TRUE!) */ +bool items_match_safe(double a, double b) { + if (Rcpp::NumericVector::is_na(a) && Rcpp::NumericVector::is_na(b)) return true; + if (Rcpp::NumericVector::is_na(a) || Rcpp::NumericVector::is_na(b)) return false; + return (std::abs(a - b) < 1e-9); +} + +bool is_same_key_Safe(double *key1, double *key2, int n) { + for (int i = 0; i < n; i++) { + if (!items_match_safe(key1[i], key2[i])) { + return false; + } + } + return true; +} /*=============*/ /* L-DIVERSITY */ @@ -47,113 +64,85 @@ /** * Computes the l-diversity values for the current group */ -void compute_group_ldiversity(int group_size, SVariable *pSensitive_Var, int NbVar = MAX_SENSITIVE_VAR) -{ - int i, j, k; - SCategory *pCat; - int count; - double entropy_ratio; - double entropy; - - for (i = 0; i < NbVar; i++) - { - SVariable &Var = pSensitive_Var[i]; - - if (!Var.Require_Ldiversity) - continue; - - count = 0; - entropy = 0.0; - pCat = Var.pFirstCategory; - - while (pCat != NULL) - { - if (pCat->group_freq > 0) - { - // distinct diversity count - count++; - - // entropy l-diversity - entropy_ratio = pCat->group_freq / (double)group_size; - entropy += entropy_ratio * log(entropy_ratio); - } - - pCat = pCat->pNext; - } - - // include missing values (unless no other value exists) - if (Var.Nb_Missing_Value_In_Group > 0 && count > 0) - { - count++; - entropy_ratio = Var.Nb_Missing_Value_In_Group / (double)group_size; - entropy += entropy_ratio * log(entropy_ratio); - } - - // update variable - Var.Group_Distinct_Ldiversity = count; - Var.Group_Entropy_Ldiversity = exp(-entropy); - - //===TOO: recursive l-diversity - //=== Put all unused Categories at the end of the list - int NbUsedCategory = 0; - //SCategory *pLastCategoryBak = Var.pLastCategory, - // *pPrevCategory = NULL; - - pCat = Var.pFirstCategory; - - while (pCat) - { - if (pCat->group_freq) - ++NbUsedCategory; - - pCat = pCat->pNext; - } - - //=== Bubble sort the used categories by the frequency - if (!NbUsedCategory) - continue; - - SCategory **ppUsedCat = (SCategory **) malloc(sizeof(SCategory *) * NbUsedCategory); - pCat = Var.pFirstCategory; - - for (j = 0; pCat; pCat = pCat->pNext) - { - if (pCat->group_freq) - ppUsedCat[j++] = pCat; - } - - for (j = 0; j < NbUsedCategory; ++j) - { - for (k = j + 1; k < NbUsedCategory; ++k) - { - if (ppUsedCat[j]->group_freq < ppUsedCat[k]->group_freq) - Swap(ppUsedCat[k], ppUsedCat[j]); - } - } - - //=== recursive l-diversity - ASSERT(NbUsedCategory >= 1); - - if (NbUsedCategory > 1) - { - float c = g_Config.Ldiversity_Recursivity_Constant; - float Sum = 0.0f; - for (j = 1; j < NbUsedCategory; ++j) - Sum += ppUsedCat[j]->group_freq; - - Sum *= c; - - for (j = 1; j < NbUsedCategory && Sum > ppUsedCat[0]->group_freq; ++j) - Sum -= c * ppUsedCat[j]->group_freq; - - Var.Group_Recursive_Ldiversity = j; - } - else - Var.Group_Recursive_Ldiversity = 1; - - free(ppUsedCat); - - } +inline void compute_group_ldiversity(int group_size, SVariable *pSensitive_Var, int NbVar = MAX_SENSITIVE_VAR) { + int i, j, k; + SCategory *pCat; + int count; + double entropy_ratio; + double entropy; + + for (i = 0; i < MAX_SENSITIVE_VAR; i++) { + SVariable &Var = pSensitive_Var[i]; + if (!Var.Require_Ldiversity) continue; + + count = 0; + entropy = 0.0; + pCat = Var.pFirstCategory; + + while (pCat != NULL) { + if (pCat->group_freq > 0) { + count++; + entropy_ratio = (double)pCat->group_freq / (double)group_size; + entropy += entropy_ratio * log(entropy_ratio); + } + pCat = pCat->pNext; + } + + if (Var.Nb_Missing_Value_In_Group > 0 && count > 0) { + count++; + entropy_ratio = (double)Var.Nb_Missing_Value_In_Group / (double)group_size; + entropy += entropy_ratio * log(entropy_ratio); + } + + Var.Group_Distinct_Ldiversity = (double)count; + Var.Group_Entropy_Ldiversity = exp(-entropy); + + int NbUsedCategory = 0; + pCat = Var.pFirstCategory; + while (pCat) { + if (pCat->group_freq > 0) NbUsedCategory++; + pCat = pCat->pNext; + } + + if (NbUsedCategory == 0) continue; + + SCategory **ppUsedCat = (SCategory **)malloc(sizeof(SCategory *) * NbUsedCategory); + pCat = Var.pFirstCategory; + j = 0; + while (pCat) { + if (pCat->group_freq > 0) { + ppUsedCat[j++] = pCat; + } + pCat = pCat->pNext; + } + + for (j = 0; j < NbUsedCategory; ++j) { + for (k = j + 1; k < NbUsedCategory; ++k) { + if (ppUsedCat[j]->group_freq < ppUsedCat[k]->group_freq) { + Swap(ppUsedCat[k], ppUsedCat[j]); + } + } + } + + if (NbUsedCategory > 1) { + float c = g_Config.Ldiversity_Recursivity_Constant; + float Sum = 0.0f; + for (j = 1; j < NbUsedCategory; ++j) { + Sum += (float)ppUsedCat[j]->group_freq; + } + + Sum *= c; + int rank = 1; + for (j = 1; j < NbUsedCategory && Sum > (float)ppUsedCat[0]->group_freq; ++j) { + Sum -= c * (float)ppUsedCat[j]->group_freq; + rank++; + } + Var.Group_Recursive_Ldiversity = rank; + } else { + Var.Group_Recursive_Ldiversity = 1; + } + free(ppUsedCat); + } } /*===================*/ @@ -162,676 +151,230 @@ void compute_group_ldiversity(int group_size, SVariable *pSensitive_Var, int NbV /** * Computes the multi l-diversity values for the current group */ -void Compute_Multi_LDiversity(int Obs, int GroupSize,Rcpp::NumericMatrix Mat,Rcpp::NumericVector indexSensVar) -{ - int i, j, k, l; - const int NbVar = g_Config.Nb_Sensitive_Var; - double *pSet = new double[GroupSize * NbVar]; - int var_pos; - int *pSetIndex = new int[GroupSize]; - BOOL First = TRUE; - //=== Load the SubSet - ForLoop (i, GroupSize) - { - ForLoop (j, NbVar){ - var_pos=indexSensVar(j)-1; - pSet[i * NbVar + j] = Mat(Obs + i, var_pos); - } - //g_pDataset->GetValue(g_Config.Sensitive_Var[j].position, Obs + i, pSet + i * NbVar + j); - - pSetIndex[i] = i; - } - - ForLoop (i, NbVar) - { - SVariable &VarI = g_Config.Sensitive_Var[i]; - - if (!VarI.Require_Ldiversity) - continue; - - //=== Bubble sort by all Vars except VarI - ForLoop (j, GroupSize - 1) - { - for (k = j + 1; k < GroupSize; ++k) - { - int &IndexJ = pSetIndex[j], - &IndexK = pSetIndex[k]; - - BOOL DoSwap = FALSE; - - ForLoop (l, NbVar) - { - if (l == i) // ignore current VarI - continue; - - if (pSet[IndexK * NbVar + l] == pSet[IndexJ * NbVar + l]) - continue; - - if (pSet[IndexK * NbVar + l] > pSet[IndexJ * NbVar + l]) - DoSwap = TRUE; - - break; - } - - if (DoSwap) - Swap(IndexJ, IndexK); - } - } - - //=== Count How many SubGroups - int NbSubGroup = 1; - int Index1 = pSetIndex[0]; - - for (j = 1; j < GroupSize; ++j) - { - int Index2 = pSetIndex[j]; - - ForLoop (l, NbVar) - { - if (l == i) // ignore current VarI - continue; - - if (pSet[Index2 * NbVar + l] != pSet[Index1 * NbVar + l]) - break; - } - - if (l < NbVar) // Different ? - { - Index1 = Index2; - ++NbSubGroup; - } - } - - - //=== Compute Entropy & Recursive for each SubGroup - SVariable Var; - - init_var(&Var); - reset_var_cat_group_freq(&Var); - - Var.Require_Ldiversity = VarI.Require_Ldiversity; - - float *pSubEntropy = new float[NbSubGroup]; - int *pSubRecursive = new int[NbSubGroup]; - - int i1 = 0; -// int NbSubGroupBak = NbSubGroup; - NbSubGroup = 0; - - for (j = 1; j < GroupSize + 1; ++j) - { - int Index2 = pSetIndex[j]; - Index1 = pSetIndex[i1]; - - - if (j < GroupSize) - { - ForLoop (l, NbVar) - { - if (l == i) // ignore current VarI - continue; - - if (pSet[Index2 * NbVar + l] != pSet[Index1 * NbVar + l]) - break; - } - } - else - l = 0; - - add_var_cat_value(&Var, pSet[pSetIndex[j-1] * NbVar + i]); - - if (l < NbVar) // End of Group ? - { - int SubGroupSize = j - i1; - compute_group_ldiversity(SubGroupSize, &Var, 1); - - pSubEntropy[NbSubGroup] = (float) Var.Group_Entropy_Ldiversity; - pSubRecursive[NbSubGroup] = Var.Group_Recursive_Ldiversity; - - i1 = j; - reset_var_cat_group_freq(&Var); - ++NbSubGroup; - } - } - - - float MultiEntropy = pSubEntropy[0]; - int MultiRecursive = pSubRecursive[0]; - - for (j = 1; j < NbSubGroup; ++j) - { - MultiEntropy = Min(pSubEntropy[j], MultiEntropy); - MultiRecursive = Min(pSubRecursive[j], MultiRecursive); - } - - - if (First) - { - g_Config.Group_MultiEntropy_Ldiversity = MultiEntropy; - g_Config.Group_MultiRecursive_Ldiversity = MultiRecursive; - First = FALSE; - } - else - { - g_Config.Group_MultiEntropy_Ldiversity = Min(MultiEntropy, g_Config.Group_MultiEntropy_Ldiversity); - g_Config.Group_MultiRecursive_Ldiversity = Min(MultiRecursive, g_Config.Group_MultiRecursive_Ldiversity); - } - - delete[] pSubEntropy; - delete[] pSubRecursive; - - free_var(&Var); - } - - delete[] pSet; - delete[] pSetIndex; +void Compute_Multi_LDiversity(int Obs, int GroupSize, Rcpp::NumericMatrix Mat, Rcpp::NumericVector indexSensVar) { + int i, j, k, l; + const int NbVar = g_Config.Nb_Sensitive_Var; + double *pSet = new double[GroupSize * NbVar]; + int *pSetIndex = new int[GroupSize]; + BOOL First = TRUE; + + for (i = 0; i < GroupSize; i++) { + for (j = 0; j < NbVar; j++) { + pSet[i * NbVar + j] = Mat(Obs + i, (int)indexSensVar(j) - 1); + } + pSetIndex[i] = i; + } + + for (i = 0; i < NbVar; i++) { + SVariable &VarI = g_Config.Sensitive_Var[i]; + if (!VarI.Require_Ldiversity) continue; + + for (j = 0; j < GroupSize - 1; j++) { + for (k = j + 1; k < GroupSize; k++) { + BOOL DoSwap = FALSE; + for (l = 0; l < NbVar; l++) { + if (l == i) continue; + if (pSet[pSetIndex[k] * NbVar + l] > pSet[pSetIndex[j] * NbVar + l]) { + DoSwap = TRUE; + break; + } + if (pSet[pSetIndex[k] * NbVar + l] < pSet[pSetIndex[j] * NbVar + l]) break; + } + if (DoSwap) { + Swap(pSetIndex[j], pSetIndex[k]); + } + } + } + + int i1 = 0; + SVariable Var; + init_var(&Var); + + std::vector subEntropies; + std::vector subRecursives; + + for (j = 1; j <= GroupSize; j++) { + add_var_cat_value(&Var, pSet[pSetIndex[j - 1] * NbVar + i]); + + bool endOfSub = (j == GroupSize); + if (!endOfSub) { + for (l = 0; l < NbVar; l++) { + if (l == i) continue; + if (pSet[pSetIndex[j] * NbVar + l] != pSet[pSetIndex[i1] * NbVar + l]) { + endOfSub = true; + break; + } + } + } + + if (endOfSub) { + compute_group_ldiversity(j - i1, &Var, 1); + subEntropies.push_back((float)Var.Group_Entropy_Ldiversity); + subRecursives.push_back((int)Var.Group_Recursive_Ldiversity); + i1 = j; + free_var(&Var); + init_var(&Var); + } + } + + float minEnt = subEntropies[0]; + int minRec = subRecursives[0]; + for (size_t s = 1; s < subEntropies.size(); s++) { + minEnt = Min(minEnt, subEntropies[s]); + minRec = Min(minRec, subRecursives[s]); + } + + if (First) { + g_Config.Group_MultiEntropy_Ldiversity = minEnt; + g_Config.Group_MultiRecursive_Ldiversity = (float)minRec; + First = FALSE; + } else { + g_Config.Group_MultiEntropy_Ldiversity = Min(minEnt, (float)g_Config.Group_MultiEntropy_Ldiversity); + g_Config.Group_MultiRecursive_Ldiversity = Min((float)minRec, (float)g_Config.Group_MultiRecursive_Ldiversity); + } + free_var(&Var); + } + delete[] pSet; + delete[] pSetIndex; } /*======*/ /* RISK */ /*======*/ -/** - * Computes the risk based on frequency and weight - */ -double compute_risk(int freq, double weight) -{ - //int i; - int tmp; - double risk; - double pk = freq / weight; - double qk = 1 - pk; - if(1){//freq>weight){ - pk=pk-0.0001; - risk=0; - if( freq > 2 ){ - risk = pk / (freq - (1-pk)); - } - if( freq == 2 ){ - risk = (pk/(1-pk)) - pow(pk/(1-pk),2) * log(1/pk); - } - if( freq == 1 ){ - risk = (pk/(1-pk)) * log(1/pk); - } - }else{ - // compute risk (see micro-Argus 4.1 manual, p21+) - switch (freq) - { - case 1: - risk = -log(pk); - risk *= (pk / qk); - break; - case 2: - risk = (pk * log(pk)) + qk; - risk *= (pk) / (qk * qk); - break; - case 3: - risk = qk * ((3 * qk) - 2); - risk -= 2 * pk * pk * log(pk); - risk *= (pk / (2 * pow(qk, 3))); - break; - default: - risk = 1; - tmp = freq + 1; - risk += qk / tmp; - tmp *= freq + 2; - risk += (2 * pow(qk, 2)) / tmp; - tmp *= freq + 3; - risk += (6 * pow(qk, 3)) / tmp; - tmp *= freq + 4; - risk += (24 * pow(qk, 4)) / tmp; - tmp *= freq + 5; - risk += (120 * pow(qk, 5)) / tmp; - tmp *= freq + 6; - risk += (720 * pow(qk, 6)) / tmp; - tmp *= freq + 7; - risk += (5040 * pow(qk, 7)) / tmp; - risk *= pk / freq; - break; - } - } - return risk; - +/* simplified; removing never-used argus implementation */ +inline double compute_risk(int freq, double weight) { + // Calculate base pk + double pk = (double)freq / weight; + + pk = pk - 0.0001; + if (freq > 2) { + return pk / (freq - (1.0 - pk)); + } + + if (freq == 2) { + double ratio = pk / (1.0 - pk); + return ratio - pow(ratio, 2.0) * log(1.0 / pk); + } + + if (freq == 1) { + return (pk / (1.0 - pk)) * log(1.0 / pk); + } + // Default fallback if freq is 0 or negative + return 0.0; } - - /*======*/ /* MAIN */ /*======*/ -// int argc, char *argv[] // [[Rcpp::export]] -RcppExport SEXP measure_risk_cpp(SEXP data, SEXP weighted_R, SEXP n_key_vars_R, SEXP l_recurs_c_R, SEXP ldiv_index_R, SEXP missing_value_R) -{ - int i;//, j, k; - /*========*/ - /* CONFIG */ - /*========*/ - - Rcpp::NumericMatrix Mat(data); - // Result matrix for group_count and group_risk; - Rcpp::NumericMatrix Res(Mat.rows(), 3); - int NbRow = Mat.rows(); - int NbCol = Mat.cols(); - //GetDataSet(); - //int NbRow = g_pDataset->GetNbRow(); - //g_Config.is_weighted = 0; - //g_Config.Nb_QuasiId_Var = 0; - g_Config.Nb_QuasiId_Var = Rcpp::as(n_key_vars_R); - g_Config.Nb_Sensitive_Var = 0; - g_Config.weight_var_pos = 0; - g_Config.missing_value = Rcpp::as(missing_value_R); - - for (i = 0; i < MAX_SENSITIVE_VAR; i++) - init_var(&g_Config.Sensitive_Var[i]); - - // g_Config.Ldiversity_Recursivity_Constant = 1.0f; - - bool weighted = Rcpp::as(weighted_R); - g_Config.is_weighted = (weighted) ? 1 : 0; - - g_Config.Ldiversity_Recursivity_Constant = Rcpp::as(l_recurs_c_R); - - if (!g_Config.Ldiversity_Recursivity_Constant) - { - g_Config.Ldiversity_Recursivity_Constant = 1.0f; - } - // look for l-diversity arguments ldiv? - Rcpp::NumericVector ldiv_index_RR(ldiv_index_R); - int length_ldiv_index=ldiv_index_RR.length(); - if(ldiv_index_RR(0)>0){ - ForLoopD(ll, length_ldiv_index){ - int index_run = ldiv_index_RR(ll)-1; - g_Config.Sensitive_Var[index_run].Require_Ldiversity = TRUE; - } - } - Rcpp::NumericMatrix Mat_Risk(Mat.rows(), length_ldiv_index*3+2); - - // count number of l-diversity variables - // and initialize ldiv variable position - for (i = 0; i < MAX_SENSITIVE_VAR; i++) - { - if (g_Config.Sensitive_Var[i].Require_Ldiversity) - ++g_Config.Nb_Sensitive_Var; - } - // setup g_Config - if (g_Config.is_weighted) - { - // weighted - g_Config.Nb_QuasiId_Var = NbCol-1; // all variables except weight, group and risk - g_Config.weight_var_pos = g_Config.Nb_QuasiId_Var + 0; - - // if (g_pDataset->GetNbVar() < 4) - if (NbCol < 2) - { - return Rcpp::wrap(-1); - } - } - else - { - // unweighted - g_Config.weight_var_pos = 0; - - // init ldiv variable positions - int llll=0; - int var_pos; - for (i = 0; i < MAX_SENSITIVE_VAR; i++) - { - if (g_Config.Sensitive_Var[i].Require_Ldiversity) - { - var_pos=ldiv_index_RR(llll)-1; - g_Config.Sensitive_Var[i].position = var_pos; - llll++; - } - } - } - /*=========*/ - /* PROCESS */ - /*=========*/ - //if (g_pDataset->GetNbVar() < 2) - if (NbCol < 2) - { - } - else - { - //double group_key[g_Config.Nb_QuasiId_Var]; //< the current group identification key - //double obs_key[g_Config.Nb_QuasiId_Var]; //< the current observation identification key - - double *group_key = new double[g_Config.Nb_QuasiId_Var]; //< the current group identification key - double *obs_key = new double[g_Config.Nb_QuasiId_Var]; - - long current_obs; - long obs_count = 0; // total number of observations - double obs_weight; //< weight of the current observation - double obs_value; //< generic value - long group_count = 0; //< total number of groups - long group_missing; //< used to detect missing values in group key - int group_size; - double group_weight; //< the sum of the weights for this group - double group_risk; - - double global_risk_ER = 0.0; //< The expected number of re-identification - double global_risk = 0.0; //< The re-identification rate or global risk - int i;//, j; - // get first observation - current_obs = 0;//SF_GetRowStart(); - do - { - - // Init group - group_count++; - - group_size = 0; - group_weight = 0.0; - group_missing = 0; - - // reset ldiv counts - if (!g_Config.is_weighted && g_Config.Nb_Sensitive_Var > 0) - { - for (i = 0; i < MAX_SENSITIVE_VAR; i++) - { - if (g_Config.Sensitive_Var[i].Require_Ldiversity) - reset_var_cat_group_freq(&g_Config.Sensitive_Var[i]); - } - } - - // read the group key - for (i = 0; i < g_Config.Nb_QuasiId_Var; i++) - { - group_key[i] = Mat(current_obs, i); - //g_pDataset->GetValue(i, current_obs, &group_key[i]); - if (SF_IsMissing(group_key[i])) - group_missing++; - } - if (group_missing == g_Config.Nb_QuasiId_Var) - { - // - // CASE 1: ALL MISSING VALUES IN KEY (risk is zero) - // - // all the key variables are missing, the risk is zero - group_risk = 0; - - // UPDATE STATA - do - { - Res(current_obs, 0) = group_count; - Res(current_obs, 1) = group_risk; - Res(current_obs, 2) = group_size; - - // l-diversity --> do not compute - obs_count++; - current_obs++; - - if (current_obs >= NbRow) - break; - - // read next obs key - for (i = 0; i < g_Config.Nb_QuasiId_Var; i++) - obs_key[i] = Mat(current_obs, i); - //g_pDataset->GetValue(i, current_obs, &obs_key[i]); - } - while (is_same_key_Risk(group_key, obs_key, g_Config.Nb_QuasiId_Var)); - } - else if (group_missing > 0) - { - // - // CASE 2: SOME MISSING VALUES IN KEY - // - // display_key(group_key, g_Config.Nb_QuasiId_Var); - - // The weights and count of the observations with the same - // non-missing key values need to be used for this group - // This requires a full scan of the dataset - int i, j; // local definitions to prevent conflicts - double value; - - // for (i = SF_GetRowStart(); i <= SF_GetRowEnd(); i++) - ForLoop (i, NbRow) - { - // if (g_pDataset->IsRowSelected(i)) - Always TRUE - // { - // compare partial keys - for (j = 0; j < g_Config.Nb_QuasiId_Var; j++) - { - // if this variable is a missing component of the current key, ignore it - if (SF_IsMissing(group_key[j])) - continue; - - // read this variable value - value = Mat(i, j); - // g_pDataset->GetValue(j, i, &value); - - // if not equal to the current key, this is not a match - if (value != group_key[j]) - break; - } - - if (j == g_Config.Nb_QuasiId_Var) - { - // we didn't hit the break, this is a match - group_size++; - if (g_Config.is_weighted) - { - value = Mat(i, g_Config.weight_var_pos); - //g_pDataset->GetValue(g_Config.weight_var_pos, i, &value); - group_weight += value; - } - - // if unweighted data and sensitive variables exist - if (!g_Config.is_weighted && g_Config.Nb_QuasiId_Var > 0) - { - //add l-diversity category frequency - for (i = 0; i < MAX_SENSITIVE_VAR; i++) - { - // for active variable only - if (g_Config.Sensitive_Var[i].Require_Ldiversity) - { - // read value for this variable and add to category count - obs_value = Mat(current_obs, g_Config.Sensitive_Var[i].position); - add_var_cat_value(&g_Config.Sensitive_Var[i], obs_value); - } - } - } - } - // } - } - - // compute risk - if (g_Config.is_weighted) - { - group_risk = compute_risk(group_size, group_weight); - } - else - { - group_risk = 1 / group_size; - - // compute group l-diversity - if (g_Config.Nb_Sensitive_Var > 0) - { - compute_group_ldiversity(group_size, g_Config.Sensitive_Var); - - } - } - - - // UPDATE STATA - do - { - // add this observation contribution to the global risk - global_risk_ER += group_size * group_risk; - - Res(current_obs, 0) = group_count; - Res(current_obs, 1) = group_risk; - Res(current_obs, 2) = group_size; - - // if unweighted data and sensitive variables exist - if (!g_Config.is_weighted && g_Config.Nb_Sensitive_Var > 0) - { - int ii; - int ind_sens=0; - for (ii = 0; ii < MAX_SENSITIVE_VAR; ii++) - { - if (g_Config.Sensitive_Var[ii].Require_Ldiversity) - { - Mat_Risk(i, (ind_sens*3)) = g_Config.Sensitive_Var[ii].Group_Distinct_Ldiversity; - Mat_Risk(i, (ind_sens*3)+1) = g_Config.Sensitive_Var[ii].Group_Entropy_Ldiversity; - Mat_Risk(i, (ind_sens*3)+2) = g_Config.Sensitive_Var[ii].Group_Recursive_Ldiversity; - ind_sens++; - } - } - - if (g_Config.Nb_Sensitive_Var >= 2) - { - Mat_Risk(i, Mat_Risk.cols()-2) = g_Config.Group_MultiEntropy_Ldiversity; - Mat_Risk(i, Mat_Risk.cols()-1) = g_Config.Group_MultiRecursive_Ldiversity; - } - } - - obs_count++; - current_obs++; - - if (current_obs >= NbRow) - break; - - // read next obs key - for (i = 0; i < g_Config.Nb_QuasiId_Var; i++) - obs_key[i] = Mat(current_obs, i); - //g_pDataset->GetValue(i, current_obs, &obs_key[i]); - - } - while (is_same_key_Risk(group_key, obs_key, g_Config.Nb_QuasiId_Var)); - } - else - { - // - // CASE 3: FULL KEY (NORMAL CASE) - // - // read all observations for this group - do - { - obs_count++; - if (g_Config.is_weighted) - { - // add to group weight - obs_weight = Mat(current_obs, g_Config.weight_var_pos); - //g_pDataset->GetValue(g_Config.weight_var_pos, current_obs, &obs_weight); - group_weight += obs_weight; - } - else - { - if (g_Config.Nb_Sensitive_Var > 0) - { - // add l-diversity category frequency - for (i = 0; i < MAX_SENSITIVE_VAR; i++) - { - if (g_Config.Sensitive_Var[i].Require_Ldiversity) - { - // read value for this variable and add to category count - obs_value = Mat(current_obs, g_Config.Sensitive_Var[i].position); - //g_pDataset->GetValue(g_Config.Sensitive_Var[i].position, current_obs, &obs_value); - add_var_cat_value(&g_Config.Sensitive_Var[i], obs_value); - } - } - } - } - - // add to group frequency - group_size++; - - // next - current_obs++; - - if (current_obs >= NbRow) - break; - - // read next obs key - for (i = 0; i < g_Config.Nb_QuasiId_Var; i++) - obs_key[i] = Mat(current_obs, i); - //g_pDataset->GetValue(i, current_obs, &obs_key[i]); - - } - while (is_same_key_Risk(group_key, obs_key, g_Config.Nb_QuasiId_Var)); - // compute risk for this group - if (g_Config.is_weighted) - { - group_risk = compute_risk(group_size, group_weight); - } - else - { - group_risk = 1.0 / group_size; - - // compute group l-diversity - if (g_Config.Nb_Sensitive_Var > 0) - { - compute_group_ldiversity(group_size, g_Config.Sensitive_Var); - - - if (g_Config.Nb_Sensitive_Var >= 2){ - Compute_Multi_LDiversity(current_obs - group_size, group_size,Mat,ldiv_index_RR); - - } - } - } - for (i = current_obs - group_size; i < current_obs; i++) - { - // Write value back to stata file for all observations in the group - Res(i, 0) = group_count; - Res(i, 1) = group_risk; - Res(i, 2) = group_size; - - - // l-diversity - if (!g_Config.is_weighted && g_Config.Nb_Sensitive_Var > 0){ - int ind_sens2=0; - for (int ii = 0; ii < MAX_SENSITIVE_VAR; ii++) - { - if (g_Config.Sensitive_Var[ii].Require_Ldiversity) - { - Mat_Risk(i, (ind_sens2*3)) = g_Config.Sensitive_Var[ii].Group_Distinct_Ldiversity; - Mat_Risk(i, (ind_sens2*3)+1) = g_Config.Sensitive_Var[ii].Group_Entropy_Ldiversity; - Mat_Risk(i, (ind_sens2*3)+2) = g_Config.Sensitive_Var[ii].Group_Recursive_Ldiversity; - ind_sens2++; - } - } - } - - if (g_Config.Nb_Sensitive_Var >= 2) - { - Mat_Risk(i, Mat_Risk.cols()-2) = g_Config.Group_MultiEntropy_Ldiversity; - Mat_Risk(i, Mat_Risk.cols()-1) = g_Config.Group_MultiRecursive_Ldiversity; - } - // stata_update_ldiversity(i); - } - - // add this group contribution to the global risk - global_risk_ER += group_size * group_risk; - } - } - while (current_obs < NbRow); - // compute global risk and store in stata scalars - global_risk = global_risk_ER / obs_count; - // SF_ScalarSave("global_risk_ER", global_risk_ER); - // SF_ScalarSave("global_risk", global_risk); - // SF_ScalarSave("global_risk_pct", global_risk * 100); - //Rcpp::NumericVector global_risk_ER_rcpp(global_risk_ER); - //Rcpp::NumericVector global_risk_rcpp(global_risk); - //Rcpp::NumericVector global_risk_pct_rcpp(global_risk * 100); - // clean memory - for (i = 0; i < MAX_SENSITIVE_VAR; i++) - { - if (g_Config.Sensitive_Var[i].Require_Ldiversity) - free_var(&g_Config.Sensitive_Var[i]); - } - - delete[] group_key; - delete[] obs_key; - - return Rcpp::List::create( - Rcpp::Named( "global_risk_ER" ) = global_risk_ER, - Rcpp::Named( "global_risk" ) = global_risk, - Rcpp::Named( "global_risk_pct" ) = global_risk*100, - Rcpp::Named( "Res") = Res, - Rcpp::Named( "Mat_Risk") = Mat_Risk - ); - } - return 0; +RcppExport SEXP measure_risk_cpp(SEXP data, SEXP weighted_R, SEXP n_key_vars_R, SEXP l_recurs_c_R, SEXP ldiv_index_R, SEXP missing_value_R) { + Rcpp::NumericMatrix Mat(data); + Rcpp::NumericMatrix Res(Mat.rows(), 3); + int NbRow = Mat.rows(); + int NbCol = Mat.cols(); + + g_Config.Nb_QuasiId_Var = Rcpp::as(n_key_vars_R); + g_Config.missing_value = Rcpp::as(missing_value_R); + g_Config.is_weighted = Rcpp::as(weighted_R) ? 1 : 0; + g_Config.Ldiversity_Recursivity_Constant = Rcpp::as(l_recurs_c_R); + if (g_Config.Ldiversity_Recursivity_Constant <= 0) { + g_Config.Ldiversity_Recursivity_Constant = 1.0f; + } + + Rcpp::NumericVector ldiv_index_RR(ldiv_index_R); + int n_ldiv = (ldiv_index_RR(0) > 0) ? ldiv_index_RR.length() : 0; + Rcpp::NumericMatrix Mat_Risk(NbRow, n_ldiv * 3 + 2); + + for (int i = 0; i < MAX_SENSITIVE_VAR; i++) { + init_var(&g_Config.Sensitive_Var[i]); + g_Config.Sensitive_Var[i].Require_Ldiversity = FALSE; + } + + if (n_ldiv > 0) { + for (int i = 0; i < n_ldiv; i++) { + g_Config.Sensitive_Var[i].Require_Ldiversity = TRUE; + g_Config.Sensitive_Var[i].position = (int)ldiv_index_RR(i) - 1; + } + } + + double *group_key = new double[g_Config.Nb_QuasiId_Var]; + double *obs_key = new double[g_Config.Nb_QuasiId_Var]; + int current_obs = 0; + int group_count = 0; + double global_risk_ER = 0.0; + + while (current_obs < NbRow) { + group_count++; + int block_start = current_obs; + int group_size = 0; + double group_weight = 0.0; + + for (int i = 0; i < g_Config.Nb_QuasiId_Var; i++) { + group_key[i] = Mat(current_obs, i); + } + + // Reset sensitive vars and missing counters for each group + for (int i = 0; i < n_ldiv; i++) { + free_var(&g_Config.Sensitive_Var[i]); + init_var(&g_Config.Sensitive_Var[i]); + g_Config.Sensitive_Var[i].Nb_Missing_Value_In_Group = 0; + g_Config.Sensitive_Var[i].Require_Ldiversity = TRUE; + g_Config.Sensitive_Var[i].position = (int)ldiv_index_RR(i) - 1; + } + + do { + if (g_Config.is_weighted) { + group_weight += Mat(current_obs, NbCol - 1); + } + + for (int i = 0; i < n_ldiv; i++) { + add_var_cat_value(&g_Config.Sensitive_Var[i], Mat(current_obs, g_Config.Sensitive_Var[i].position)); + } + group_size++; + current_obs++; + if (current_obs >= NbRow) break; + for (int i = 0; i < g_Config.Nb_QuasiId_Var; i++) { + obs_key[i] = Mat(current_obs, i); + } + } while (is_same_key_Safe(group_key, obs_key, g_Config.Nb_QuasiId_Var)); + + double group_risk = g_Config.is_weighted ? compute_risk(group_size, group_weight) : 1.0 / group_size; + global_risk_ER += group_size * group_risk; + + if (n_ldiv > 0) { + compute_group_ldiversity(group_size, g_Config.Sensitive_Var, n_ldiv); + if (n_ldiv >= 2) { + Compute_Multi_LDiversity(block_start, group_size, Mat, ldiv_index_RR); + } + } + + for (int i = block_start; i < current_obs; i++) { + Res(i, 0) = group_count; + Res(i, 1) = group_risk; + Res(i, 2) = group_size; + for (int v = 0; v < n_ldiv; v++) { + Mat_Risk(i, v * 3) = g_Config.Sensitive_Var[v].Group_Distinct_Ldiversity; + Mat_Risk(i, v * 3 + 1) = g_Config.Sensitive_Var[v].Group_Entropy_Ldiversity; + Mat_Risk(i, v * 3 + 2) = g_Config.Sensitive_Var[v].Group_Recursive_Ldiversity; + } + if (n_ldiv >= 2) { + Mat_Risk(i, Mat_Risk.cols() - 2) = (double)g_Config.Group_MultiEntropy_Ldiversity; + Mat_Risk(i, Mat_Risk.cols() - 1) = (double)g_Config.Group_MultiRecursive_Ldiversity; + } + } + } + + delete[] group_key; + delete[] obs_key; + for (int i = 0; i < MAX_SENSITIVE_VAR; i++) free_var(&g_Config.Sensitive_Var[i]); + + double obs_count_final = (double)NbRow; + double final_global_risk = global_risk_ER / obs_count_final; + return Rcpp::List::create( + Rcpp::Named("global_risk_ER") = global_risk_ER, + Rcpp::Named("global_risk") = final_global_risk, + Rcpp::Named("global_risk_pct") = final_global_risk * 100.0, + Rcpp::Named("Res") = Res, + Rcpp::Named("Mat_Risk") = Mat_Risk + ); } diff --git a/tests/testthat/test_ldiversity.R b/tests/testthat/test_ldiversity.R new file mode 100644 index 00000000..8a1bb798 --- /dev/null +++ b/tests/testthat/test_ldiversity.R @@ -0,0 +1,76 @@ +context("test ldiversity()") +library(sdcMicro) + +test_that("ldiversityWORK handles NAs and distinct counts correctly", { + # 1. Setup the specific data frame that previously failed + test_data <- data.frame( + sex = c("female", "female", "female", "male", "male", "female", "female"), + occupation = c(NA, NA, NA, "teacher", "teacher", "nurse", "nurse"), + ethnicity = c("other", "other", "other", "other", "other", "majority", "majority"), + sensitive = c(1, 0, 2, 1, 0, 0, 1) + ) + + # 2. Run the function + # keyVars: sex, occupation, ethnicity (col 1:3) + # ldiv_index: sensitive (col 4) + res <- ldiversity( + obj = test_data, + keyVars = 1:3, + ldiv_index = 4, + missing = -999 + ) + + # Convert to matrix/df for easier checking if not already + res <- as.data.frame(res[]) + + # Test Case: Group 1 (Rows 1-3) + # Values: 1, 0, 2 -> Distinct should be 3 + expect_equal(res$sensitive_Distinct_Ldiversity[1], 3) + expect_equal(res$sensitive_Distinct_Ldiversity[2], 3) + expect_equal(res$sensitive_Distinct_Ldiversity[3], 3) + + # Test Case: Group 2 (Rows 4-5) + # Values: 1, 0 -> Distinct should be 2 + expect_equal(res$sensitive_Distinct_Ldiversity[4], 2) + expect_equal(res$sensitive_Distinct_Ldiversity[5], 2) + + # Test Case: Group 3 (Rows 6-7) + # Values: 0, 1 -> Distinct should be 2 + expect_equal(res$sensitive_Distinct_Ldiversity[6], 2) + expect_equal(res$sensitive_Distinct_Ldiversity[7], 2) +}) + +test_that("ldiversityWORK maintains original row order", { + # Create a dataset where the sorted order is different from input order + unordered_data <- data.frame( + id = c(3, 1, 2), # If we sort by 'id', order changes + sens = c(10, 20, 10) + ) + + # Group 1: id=3, sens=10 (Size 1, Distinct 1) + # Group 2: id=1, sens=20 (Size 1, Distinct 1) + # Group 3: id=2, sens=10 (Size 1, Distinct 1) + res <- ldiversity( + obj = unordered_data, + keyVars = "id", + ldiv_index = "sens" + ) + res <- as.data.frame(res[]) + + # The result should have 3 rows and match the input rows + expect_equal(nrow(res), 3) + expect_true(all(res[, 1] == 1)) # All groups size 1 in this specific setup +}) + +test_that("ldiversity handles multiple sensitive variables", { + multi_data <- data.frame( + key = c(1, 1, 1), + sens1 = c("A", "B", "A"), # 2 distinct + sens2 = c("X", "Y", "Z") # 3 distinct + ) + + res <- ldiversity(obj = multi_data, keyVars = "key", ldiv_index = c("sens1", "sens2")) + res <- as.data.frame(res[]) + expect_equal(res[1, "sens1_Distinct_Ldiversity"], 2) + expect_equal(res[1, "sens2_Distinct_Ldiversity"], 3) +}) diff --git a/tests/testthat/test_pram.R b/tests/testthat/test_pram.R index cc469f10..d01d34e9 100644 --- a/tests/testthat/test_pram.R +++ b/tests/testthat/test_pram.R @@ -1,395 +1,101 @@ library(sdcMicro) data(testdata) -# test_that("pram on a factor", { -# ## application on a factor-variable -# res <- pram(as.factor(testdata$roof)) -# print(res) -# summary(res) -# expect_false(identical(res$x,res$x_pram)) -# }) -# testdata$roof <- factor(testdata$roof) -# testdata$walls <- factor(testdata$walls) -# testdata$water <- factor(testdata$water) -# test_that("pram on a data.frame", { -# ## application on a data.frame -# ## pram can only be applied to factors, thus we have to recode -# ## to factors before the method can be applied -# -# -# ## pram() is applied within subgroups defined by -# ## variables "urbrur" and "sex" -# res <- pram(testdata, variables="roof", -# strata_variables=c("urbrur","sex")) -# expect_false(identical(res$roof,res$roof_pram)) -# }) -# -# test_that("pram on a data.table", { -# library(data.table) -# testdataDT <- as.data.table(testdata) -# set.seed(1) -# res <- pram(testdataDT, variables="roof", -# strata_variables=c("urbrur","sex")) -# set.seed(1) -# res2 <- pram(testdata, variables="roof", -# strata_variables=c("urbrur","sex")) -# expect_identical(res2$roof_pram,res$roof_pram) -# }) -# -# test_that("pram on a data.frame 2", { -# ## default parameters (pd=0.8 and alpha=0.5) for the generation -# ## of the invariant transition matrix will be used for all variables -# res1 <- pram(testdata, variables=c("roof","walls","water")) -# expect_false(all(res1$roof_pram==testdata$roof&res1$walls_pram==testdata$walls&res1$water_pram==testdata$water)) -# }) -# -# test_that("pram on a data.frame, non default pd", { -# ## specific parameters for each variable -# res1 <- pram(testdata,variables=c("roof","walls","water"), -# pd=c(0.95,0.8,0.9), alpha=0.5) -# expect_false(all(res1$roof_pram==testdata$roof&res1$walls_pram==testdata$walls&res1$water_pram==testdata$water)) -# }) -# -# test_that("pram on a data.frame, full matrix for pd", { -# ## we can also specify a custom transition-matrix directly -# # for variable roof; matrix must have rownames and colnames that match -# # the levels of the variable that should be post-randomized -# # rowSums() and colSums() must equal 1 too! -# mat <- diag(length(levels(testdata$roof))) -# rownames(mat) <- colnames(mat) <- levels(testdata$roof) -# res1 <- pram(testdata,variables="roof", pd=mat) -# expect_true(all(res1$roof_pram==testdata$roof)) -# ## it is possible use a transistion matrix for a variable and use the 'traditional' way -# ## of specifying a number for the minimal diagonal entries of the transision matrix -# ## for other variables. In this case we must supply \\code{pd} as list. -# res1 <- pram(testdata,variables=c("roof","walls"), pd=list(mat,0.5), alpha=c(NA, 0.5)) -# expect_false(all(res1$roof_pram==testdata$roof&res1$walls_pram==testdata$walls)) -# }) -# -# data(testdata2) -# testdata2$urbrur <- factor(testdata2$urbrur) -# test_that("pram on a sdcObj", { -# ## application to objects of class sdcMicro with default parameters -# -# sdc <- createSdcObj(testdata2, -# keyVars=c('roof','walls','water','electcon','relat','sex'), -# numVars=c('expend','income','savings'), w='sampling_weight') -# sdc <- pram(sdc, variables=c("urbrur")) -# expect_true(is.data.table(sdc@pram$comparison$urbrur)) -# }) -# -# -# test_that("pram on a sdcObj 2", { -# ## this is equal to the previous application. If argument 'variables' is NULL, -# ## all variables from slot 'pramVars' will be used if possible. -# sdc <- createSdcObj(testdata2, -# keyVars=c('roof','walls','water','electcon','relat','sex'), -# numVars=c('expend','income','savings'), w='sampling_weight', -# pramVars="urbrur") -# sdc <- pram(sdc) -# expect_true(is.data.table(sdc@pram$comparison$urbrur)) -# }) -# -# test_that("pram on a sdcObj, matrix", { -# ## we can specify transition matrices for sdcMicroObj-objects too -# testdata2$roof <- factor(testdata2$roof) -# sdc <- createSdcObj(testdata2, -# keyVars=c('roof','walls','water','electcon','relat','sex'), -# numVars=c('expend','income','savings'), w='sampling_weight') -# mat <- diag(length(levels(testdata2$roof))) -# rownames(mat) <- colnames(mat) <- levels(testdata2$roof) -# mat[1,] <- c(0.9,0,0,0.05,0.05) -# # we can also have a look at the transitions -# gsm <- get.sdcMicroObj(sdc, "pram")$transitions -# expect_warning(sdc <- pram(sdc, variables="roof", pd=mat)) -# -# }) -# ################################################ -# # data frame -# # data frame without missing -# set.seed(3) -# sample_Data <- runif(20, 1, 100) -# sample_Data <- array(sample_Data, dim = c(10,2)) -# sample_Data <- cbind(sample_Data, round(runif(10, 1, 2), digits = 0)) -# sample_Data <- cbind(sample_Data, round(runif(10, 1, 4), digits = 0)) -# sample_Data <- cbind(sample_Data, round(runif(10, 1, 6), digits = 0)) -# sample_Data <- cbind(sample_Data, round(runif(10, 1, 3), digits = 0)) -# sample_Data <- as.data.frame(sample_Data) -# colnames(sample_Data) <- c("Var1", "Var2", "Var3", "Var4", "Var5", "Var6") -# sample_Data[,3] <- as.factor(as.numeric(sample_Data[,3])) -# sample_Data[,4] <- as.factor(as.numeric(sample_Data[,4])) -# sample_Data[,5] <- as.factor(as.numeric(sample_Data[,5])) -# sample_Data[,6] <- as.factor(as.numeric(sample_Data[,6])) -# -# test_that("data frame without missing, variables=1, strata_variables=1, pd = default, alpha = default)", { -# set.seed(123) -# fr <- pram(sample_Data, variables = c("Var3"), strata_variables=c("Var5")) -# expect_equal(fr[[7]], structure(c(1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("1", "2"), class = "factor")) -# }) -# -# test_that("data frame without missing, variables=1, strata_variables=1, pd = default, alpha = different)", { -# set.seed(123) -# fr <- pram(sample_Data, variables = c("Var3"), strata_variables=c("Var5"), alpha = 0.1) -# expect_equal(fr[[7]], structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("1", "2"), class = "factor")) -# }) -# test_that("data frame without missing, variables=1, strata_variables=1, pd = different, alpha = default)", { -# set.seed(123) -# fr <- pram(sample_Data, variables = c("Var3"), strata_variables=c("Var5"), pd = 0.1) -# expect_equal(fr[[7]], structure(c(2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L), .Label = c("1", "2"), class = "factor")) -# }) -# -# test_that("data frame without missing, variables=1, strata_variables=1, pd = different, alpha = different)", { -# set.seed(123) -# fr <- pram(sample_Data, variables = c("Var3"), strata_variables=c("Var5"), pd = 0.1, alpha = 0.1) -# expect_equal(fr[[7]], structure(c(1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("1","2"), class = "factor")) -# }) -# -# -# test_that("data frame without missing, variables=1, strata_variables=2, pd = default, alpha = default)", { -# set.seed(123) -# fr <- pram(sample_Data, variables = c("Var3"), strata_variables=c("Var5", "Var6")) -# expect_equal(fr[[7]], structure(c(1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L), .Label = c("1", "2"), class = "factor") -# ) -# }) -# -# test_that("data frame without missing, variables=1, strata_variables=2, pd = default, alpha = different)", { -# set.seed(123) -# fr <- pram(sample_Data, variables = c("Var3"), strata_variables=c("Var5", "Var6"), alpha = 0.1) -# expect_equal(fr[[7]], structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("1", "2"), class = "factor")) -# }) -# -# test_that("data frame without missing, variables=1, strata_variables=2, pd = different, alpha = default)", { -# set.seed(123) -# fr <- pram(sample_Data, variables = c("Var3"), strata_variables=c("Var5", "Var6"), pd = 0.1) -# expect_equal(fr[[7]], structure(c(1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("1", "2"), class = "factor")) -# }) -# -# test_that("data frame without missing, variables=1, strata_variables=2, pd = different, alpha = different)", { -# set.seed(123) -# fr <- pram(sample_Data, variables = c("Var3"), strata_variables=c("Var5", "Var6"), pd = 0.1, alpha = 0.1) -# expect_equal(fr[[7]], structure(c(1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("1", "2"), class = "factor")) -# }) -# test_that("data frame without missing, variables=2, strata_variables=1, pd = default, alpha = default)", { -# set.seed(123) -# fr <- pram(sample_Data, variables = c("Var3", "Var4"), strata_variables=c("Var5")) -# expect_equal(fr[[7]], structure(c(1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("1", "2"), class = "factor")) -# }) -# -# test_that("data frame without missing, variables=2, strata_variables=1, pd = default, alpha = different)", { -# set.seed(123) -# fr <- pram(sample_Data, variables = c("Var3", "Var4"), strata_variables=c("Var5"), alpha = 0.1) -# expect_equal(fr[[7]], structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("1", "2"), class = "factor")) -# }) -# test_that("data frame without missing, variables=2, strata_variables=1, pd = different, alpha = default)", { -# set.seed(123) -# fr <- pram(sample_Data, variables = c("Var3", "Var4"), strata_variables=c("Var5"), pd = 0.1) -# expect_equal(fr[[7]], structure(c(2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L), .Label = c("1", "2"), class = "factor")) -# }) -# -# test_that("data frame without missing, variables=2, strata_variables=1, pd = different, alpha = different)", { -# set.seed(123) -# fr <- pram(sample_Data, variables = c("Var3", "Var4"), strata_variables=c("Var5"), pd = 0.1, alpha = 0.1) -# expect_equal(fr[[7]], structure(c(1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("1", "2"), class = "factor")) -# }) -# test_that("data frame without missing, variables=2, strata_variables=2, pd = default, alpha = default)", { -# set.seed(123) -# fr <- pram(sample_Data, variables = c("Var3", "Var4"), strata_variables=c("Var5", "Var6")) -# expect_equal(fr[[7]], structure(c(1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L), .Label = c("1", "2"), class = "factor")) -# }) -# -# test_that("data frame without missing, variables=2, strata_variables=2, pd = default, alpha = different)", { -# set.seed(123) -# fr <- pram(sample_Data, variables = c("Var3", "Var4"), strata_variables=c("Var5", "Var6"), alpha = 0.1) -# expect_equal(fr[[7]], structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("1", "2"), class = "factor")) -# }) -# test_that("data frame without missing, variables=2, strata_variables=2, pd = different, alpha = default)", { -# set.seed(123) -# fr <- pram(sample_Data, variables = c("Var3", "Var4"), strata_variables=c("Var5", "Var6"), pd = 0.1) -# expect_equal(fr[[7]], structure(c(1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("1", "2"), class = "factor")) -# }) -# -# test_that("data frame without missing, variables=2, strata_variables=2, pd = different, alpha = different)", { -# set.seed(123) -# fr <- pram(sample_Data, variables = c("Var3", "Var4"), strata_variables=c("Var5", "Var6"), pd = 0.1, alpha = 0.1) -# expect_equal(fr[[7]], structure(c(1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("1", "2"), class = "factor")) -# }) -# -# # data frame with missing -# set.seed(3) -# sample_Data <- runif(20, 1, 100) -# sample_Data <- array(sample_Data, dim = c(10,2)) -# sample_Data <- cbind(sample_Data, round(runif(10, 1, 2), digits = 0)) -# sample_Data <- cbind(sample_Data, round(runif(10, 1, 4), digits = 0)) -# sample_Data <- cbind(sample_Data, round(runif(10, 1, 6), digits = 0)) -# sample_Data <- cbind(sample_Data, round(runif(10, 1, 3), digits = 0)) -# sample_Data <- as.data.frame(sample_Data) -# colnames(sample_Data) <- c("Var1", "Var2", "Var3", "Var4", "Var5", "Var6") -# sample_Data[,3] <- as.factor(as.numeric(sample_Data[,3])) -# sample_Data[,4] <- as.factor(as.numeric(sample_Data[,4])) -# sample_Data[,5] <- as.factor(as.numeric(sample_Data[,5])) -# sample_Data[,6] <- as.factor(as.numeric(sample_Data[,6])) -# sample_Data[c(1,3,4,9),1] <- NA -# sample_Data[c(3,10),2] <- NA -# sample_Data[c(3),3] <- NA -# sample_Data[c(1,3,6),4] <- NA -# sample_Data[c(2,3,7,10),5] <- NA -# sample_Data[c(3,8),6] <- NA -# -# test_that("data frame without missing, variables=1, strata_variables=1, pd = default, alpha = default)", { -# set.seed(123) -# fr <- pram(sample_Data, variables = c("Var3"), strata_variables=c("Var5")) -# expect_equal(fr[[7]], structure(c(1L, 1L, NA, 1L, 1L, 2L, 2L, 1L, 2L, 1L), .Label = c("1", "2"), class = "factor")) -# }) -# -# test_that("data frame without missing, variables=1, strata_variables=1, pd = default, alpha = different)", { -# set.seed(123) -# fr <- pram(sample_Data, variables = c("Var3"), strata_variables=c("Var5"), alpha = 0.1) -# expect_equal(fr[[7]], structure(c(1L, 1L, NA, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("1", "2"), class = "factor")) -# }) -# test_that("data frame without missing, variables=1, strata_variables=1, pd = different, alpha = default)", { -# set.seed(123) -# fr <- pram(sample_Data, variables = c("Var3"), strata_variables=c("Var5"), pd = 0.1) -# expect_equal(fr[[7]], structure(c(1L, 1L, NA, 2L, 1L, 1L, 1L, 1L, 2L, 2L), .Label = c("1", "2"), class = "factor")) -# }) -# -# test_that("data frame without missing, variables=1, strata_variables=1, pd = different, alpha = different)", { -# set.seed(123) -# fr <- pram(sample_Data, variables = c("Var3"), strata_variables=c("Var5"), pd = 0.1, alpha = 0.1) -# expect_equal(fr[[7]], structure(c(1L, 1L, NA, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("1", "2"), class = "factor")) -# }) -# test_that("data frame without missing, variables=1, strata_variables=2, pd = default, alpha = default)", { -# set.seed(123) -# fr <- pram(sample_Data, variables = c("Var3"), strata_variables=c("Var5", "Var6")) -# expect_equal(fr[[7]], structure(c(1L, 1L, NA, 1L, 1L, 2L, 2L, 1L, 2L, 1L), .Label = c("1", "2"), class = "factor")) -# }) -# -# test_that("data frame without missing, variables=1, strata_variables=2, pd = default, alpha = different)", { -# set.seed(123) -# fr <- pram(sample_Data, variables = c("Var3"), strata_variables=c("Var5", "Var6"), alpha = 0.1) -# expect_equal(fr[[7]], structure(c(1L, 1L, NA, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("1", "2"), class = "factor")) -# }) -# -# -# test_that("data frame without missing, variables=1, strata_variables=2, pd = different, alpha = default)", { -# set.seed(123) -# fr <- pram(sample_Data, variables = c("Var3"), strata_variables=c("Var5", "Var6"), pd = 0.1) -# expect_equal(fr[[7]], structure(c(1L, 1L, NA, 2L, 1L, 1L, 1L, 1L, 2L, 2L), .Label = c("1", "2"), class = "factor")) -# }) -# -# test_that("data frame without missing, variables=1, strata_variables=2, pd = different, alpha = different)", { -# set.seed(123) -# fr <- pram(sample_Data, variables = c("Var3"), strata_variables=c("Var5", "Var6"), pd = 0.1, alpha = 0.1) -# expect_equal(fr[[7]], structure(c(1L, 1L, NA, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("1", "2"), class = "factor")) -# }) -# -# -# test_that("data frame without missing, variables=2, strata_variables=1, pd = default, alpha = default)", { -# set.seed(123) -# fr <- pram(sample_Data, variables = c("Var3", "Var4"), strata_variables=c("Var5")) -# expect_equal(fr[[7]], structure(c(1L, 1L, NA, 1L, 1L, 2L, 2L, 1L, 2L, 1L), .Label = c("1", "2"), class = "factor")) -# }) -# -# test_that("data frame without missing, variables=2, strata_variables=1, pd = default, alpha = different)", { -# set.seed(123) -# fr <- pram(sample_Data, variables = c("Var3", "Var4"), strata_variables=c("Var5"), alpha = 0.1) -# expect_equal(fr[[7]], structure(c(1L, 1L, NA, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("1", "2"), class = "factor")) -# }) -# -# -# test_that("data frame without missing, variables=2, strata_variables=1, pd = different, alpha = default)", { -# set.seed(123) -# fr <- pram(sample_Data, variables = c("Var3", "Var4"), strata_variables=c("Var5"), pd = 0.1) -# expect_equal(fr[[7]], structure(c(1L, 1L, NA, 2L, 1L, 1L, 1L, 1L, 2L, 2L), .Label = c("1", "2"), class = "factor")) -# }) -# -# test_that("data frame without missing, variables=2, strata_variables=1, pd = different, alpha = different)", { -# set.seed(123) -# fr <- pram(sample_Data, variables = c("Var3", "Var4"), strata_variables=c("Var5"), pd = 0.1, alpha = 0.1) -# expect_equal(fr[[7]], structure(c(1L, 1L, NA, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("1", "2"), class = "factor")) -# }) -# -# test_that("data frame without missing, variables=2, strata_variables=2, pd = default, alpha = default)", { -# set.seed(123) -# fr <- pram(sample_Data, variables = c("Var3", "Var4"), strata_variables=c("Var5", "Var6")) -# expect_equal(fr[[7]], structure(c(1L, 1L, NA, 1L, 1L, 2L, 2L, 1L, 2L, 1L), .Label = c("1", "2"), class = "factor")) -# }) -# -# test_that("data frame without missing, variables=2, strata_variables=2, pd = default, alpha = different)", { -# set.seed(123) -# fr <- pram(sample_Data, variables = c("Var3", "Var4"), strata_variables=c("Var5", "Var6"), alpha = 0.1) -# expect_equal(fr[[7]], structure(c(1L, 1L, NA, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("1", "2"), class = "factor")) -# }) -# -# -# test_that("data frame without missing, variables=2, strata_variables=2, pd = different, alpha = default)", { -# set.seed(123) -# fr <- pram(sample_Data, variables = c("Var3", "Var4"), strata_variables=c("Var5", "Var6"), pd = 0.1) -# expect_equal(fr[[7]], structure(c(1L, 1L, NA, 2L, 1L, 1L, 1L, 1L, 2L, 2L), .Label = c("1", "2"), class = "factor")) -# }) -# -# test_that("data frame without missing, variables=2, strata_variables=2, pd = different, alpha = different)", { -# set.seed(123) -# fr <- pram(sample_Data, variables = c("Var3", "Var4"), strata_variables=c("Var5", "Var6"), pd = 0.1, alpha = 0.1) -# expect_equal(fr[[7]], structure(c(1L, 1L, NA, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("1", "2"), class = "factor")) -# }) -# -# -# -# # data frame with more columns then rows with missing -# set.seed(3) -# sample_Data <- runif(20, 1, 100) -# sample_Data <- array(sample_Data, dim = c(4,5)) -# sample_Data <- cbind(sample_Data, round(runif(4, 1, 2), digits = 0)) -# sample_Data <- cbind(sample_Data, round(runif(4, 1, 4), digits = 0)) -# sample_Data <- cbind(sample_Data, round(runif(4, 1, 6), digits = 0)) -# sample_Data <- cbind(sample_Data, round(runif(4, 1, 3), digits = 0)) -# sample_Data <- cbind(sample_Data, round(runif(4, 1, 5), digits = 0)) -# sample_Data <- cbind(sample_Data, round(runif(4, 1, 2), digits = 0)) -# sample_Data <- as.data.frame(sample_Data) -# colnames(sample_Data) <- c("Var1", "Var2", "Var3", "Var4", "Var5", "Var6", "Var7", "Var8", "Var9", "Var10", "Var11") -# sample_Data[,6] <- as.factor(as.numeric(sample_Data[,6])) -# sample_Data[,7] <- as.factor(as.numeric(sample_Data[,7])) -# sample_Data[,8] <- as.factor(as.numeric(sample_Data[,8])) -# sample_Data[,9] <- as.factor(as.numeric(sample_Data[,9])) -# sample_Data[,10] <- as.factor(as.numeric(sample_Data[,10])) -# sample_Data[,11] <- as.factor(as.numeric(sample_Data[,11])) -# sample_Data[c(1,3,4),6] <- NA -# sample_Data[c(2,3),7] <- NA -# sample_Data[c(3),8] <- NA -# sample_Data[c(1,3),9] <- NA -# sample_Data[c(2,3),10] <- NA -# sample_Data[c(3),11] <- NA -# -# test_that("data frame without missing, variables=3, strata_variables=3, pd = default, alpha = default)", { -# set.seed(123) -# fr <- pram(sample_Data, variables = c("Var6", "Var7", "Var8"), strata_variables=c("Var9", "Var10", "Var11")) -# expect_equal(fr[[12]], structure(c(NA, 1L, NA, NA), .Label = "1", class = "factor")) -# expect_equal(fr[[13]], structure(c(1L, NA, NA, 3L), .Label = c("2", "3", "4"), class = "factor")) -# expect_equal(fr[[14]], structure(c(2L, 3L, NA, 3L), .Label = c("3", "4", "5"), class = "factor")) -# }) -# -# -# -# # sdc -# # sdc without missing values -# set.seed(3) -# sample_Data <- runif(20, 1, 100) -# sample_Data <- array(sample_Data, dim = c(10,2)) -# sample_Data <- cbind(sample_Data, round(runif(10, 1, 2), digits = 0)) -# sample_Data <- cbind(sample_Data, round(runif(10, 1, 4), digits = 0)) -# sample_Data <- cbind(sample_Data, round(runif(10, 1, 6), digits = 0)) -# sample_Data <- cbind(sample_Data, round(runif(10, 1, 3), digits = 0)) -# sample_Data <- as.data.frame(sample_Data) -# colnames(sample_Data) <- c("Var1", "Var2", "Var3", "Var4", "Var5", "Var6") -# sample_Data[,3] <- as.factor(as.numeric(sample_Data[,3])) -# sample_Data[,4] <- as.factor(as.numeric(sample_Data[,4])) -# sample_Data[,5] <- as.factor(as.numeric(sample_Data[,5])) -# sample_Data[,6] <- as.factor(as.numeric(sample_Data[,6])) -# sample_Data <- createSdcObj(sample_Data, keyVars=c("Var1", "Var2")) -# -# test_that("data frame without missing, variables=1, strata_variables=1, pd = default, alpha = default)", { -# set.seed(123) -# sample_Data <- pram(sample_Data, variables = c("Var3"), strata_variables=c("Var5")) -# expect_equal(sample_Data@manipPramVars[,1], structure(c(1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("1", "2"), class = "factor")) -# }) -# +testdata_pram <- function(n = 100, missing = FALSE) { + set.seed(3) + df <- data.frame( + Var1 = runif(n, 1, 100), + Var2 = runif(n, 1, 100), + Var3 = as.factor(round(runif(n, 1, 2))), + Var4 = as.factor(round(runif(n, 1, 4))), + Var5 = as.factor(round(runif(n, 1, 6))), + Var6 = as.factor(round(runif(n, 1, 3))) + ) + if (missing) { + df[c(1, 3), 3] <- NA + df[c(2, 5), 5] <- NA + } + return(df) +} + +test_that("pram handles various input classes consistently", { + data(testdata) + testdata$roof <- factor(testdata$roof) + v <- "roof" + strata <- c("urbrur", "sex") + + # Factor input + expect_is(testdata[[v]], "factor") + res_f <- pram(testdata[[v]]) + expect_s3_class(res_f, "pram") + expect_false(identical(res_f$x, res_f$x_pram)) + + # data.table vs data.frame consistency + library(data.table) + dt <- as.data.table(testdata) + + set.seed(1) + res_dt <- pram(dt, variables = v, strata_variables = strata) + set.seed(1) + res_df <- pram(testdata, variables = v, strata_variables = strata) + + expect_identical(res_dt$roof_pram, res_df$roof_pram) +}) + + +test_that("pram responds correctly to pd and alpha changes", { + sample_Data <- testdata_pram() + + # Test combinations + set.seed(123) + def <- pram(sample_Data, variables = "Var3", strata_variables = "Var5") + + set.seed(123) + low_alpha <- pram(sample_Data, variables = "Var3", strata_variables = "Var5", alpha = 0.1) + + set.seed(123) + low_pd <- pram(sample_Data, variables = "Var3", strata_variables = "Var5", pd = 0.1) + + # Assertions: results should differ when parameters change + expect_false(identical(def$Var3_pram, low_alpha$Var3_pram)) + expect_false(identical(def$Var3_pram, low_pd$Var3_pram)) +}) + +test_that("pram respects identity and custom transition matrices", { + data(testdata) + testdata$roof <- as.factor(testdata$roof) + testdata$walls <- as.factor(testdata$walls) + lvl <- levels(testdata$roof) + + # Identity matrix should result in NO change + mat_id <- diag(length(lvl)) + rownames(mat_id) <- colnames(mat_id) <- lvl + + res <- pram(testdata, variables = "roof", pd = mat_id) + expect_identical(res$roof, res$roof_pram) + + # List of matrices for multiple variables + res_multi <- pram(testdata, variables = c("roof", "walls"), + pd = list(mat_id, 0.5), alpha = c(NA, 0.5)) + expect_identical(res_multi$roof, res_multi$roof_pram) # Roof stays same + expect_false(all(res_multi$walls == res_multi$walls_pram)) # Walls change +}) + +test_that("pram integrates with sdcMicro objects", { + data(testdata2) + testdata2$urbrur <- as.factor(testdata2$urbrur) + + sdc <- createSdcObj( + dat = testdata2, + keyVars = c('roof', 'walls'), + pramVars = "urbrur", + w = 'sampling_weight' + ) + + sdc <- pram(sdc) + + # Check if the pram slot was populated correctly + pram_info <- get.sdcMicroObj(sdc, "pram") + expect_true("urbrur" %in% names(pram_info$comparison)) + expect_s3_class(pram_info$comparison$urbrur, "data.table") +})