1#!/usr/bin/Rscript 2# Copyright 2014 Google Inc. All rights reserved. 3# 4# Licensed under the Apache License, Version 2.0 (the "License"); 5# you may not use this file except in compliance with the License. 6# You may obtain a copy of the License at 7# 8# http://www.apache.org/licenses/LICENSE-2.0 9# 10# Unless required by applicable law or agreed to in writing, software 11# distributed under the License is distributed on an "AS IS" BASIS, 12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13# See the License for the specific language governing permissions and 14# limitations under the License. 15 16library(RUnit) 17library(abind) 18 19source('analysis/R/decode.R') 20source('tests/gen_counts.R') 21 22L1Distance <- function(X, Y) { 23 # Computes the L1 distance between two named vectors 24 common <- intersect(names(X), names(Y)) 25 26 L1_intersect <- sum(abs(X[common] - Y[common])) 27 L1_X_minus_Y <- sum(X[!names(X) %in% common]) 28 L1_Y_minus_X <- sum(Y[!names(Y) %in% common]) 29 30 (L1_intersect + L1_X_minus_Y + L1_Y_minus_X) / 2 31} 32 33LInfDistance <- function(X, Y) { 34 # Computes the L_infinity distance between two named vectors 35 common <- intersect(names(X), names(Y)) 36 37 max(abs(X[common] - Y[common]), 38 abs(X[!names(X) %in% common]), 39 abs(Y[!names(Y) %in% common])) 40} 41 42MatrixVectorMerge <- function(mat, vec) { 43 # Attaches a vector to a matrix, matching corresponding column names 44 45 mat_only <- setdiff(colnames(mat), names(vec)) 46 vec_only <- setdiff(names(vec), colnames(mat)) 47 48 # extend the vector with missing columns 49 vec_long <- c(vec, setNames(rep(NA, length(mat_only)), mat_only)) 50 51 # extend the matrix with missing columns 52 newcols <- matrix(NA, nrow = nrow(mat), ncol = length(vec_only)) 53 colnames(newcols) <- vec_only 54 mat_long <- cbind(mat, newcols) 55 56 # Now vec and mat have the same columns, but in the wrong order. Sort the 57 # columns lexicographically. 58 if(length(vec_long) > 0) { 59 mat_long <- mat_long[, order(colnames(mat_long)), drop = FALSE] 60 vec_long <- vec_long[order(names(vec_long))] 61 } 62 63 rbind(mat_long, vec_long) 64} 65 66RunMultipleTests <- function(title, fun, repetitions, ...) { 67 # Run a function with an annotated progress indicator. The function's outputs 68 # are concatenated and returned as a list of length repetitions. 69 cat(title, ": ") 70 71 if(repetitions == 1) { 72 # only run once 73 results <- list(fun(...)) 74 75 cat(" Done.\n") 76 } else { # run multiple times 77 pb <- txtProgressBar(min = 0, max = repetitions, 78 width = getOption("width") - 20 - nchar(title)) 79 80 results <- vector(mode = "list", repetitions) 81 for(i in 1:repetitions) { 82 setTxtProgressBar(pb, i) 83 results[[i]] <- fun(...) 84 } 85 cat(" Done.") 86 close(pb) 87 } 88 89 results 90} 91 92CheckEstimatesAndStdsHelper <- function(params, map, pdf, total) { 93 # Helper function for TestEstimateBloomCounts. 94 partition <- RandomPartition(total, pdf) 95 counts <- GenerateCounts(params, map, partition, 1) 96 97 EstimateBloomCounts(params, counts) 98} 99 100CheckEstimatesAndStds <- function(repetitions, title, params, map, pdf, total) { 101 # Checks that the expectations returned by EstimateBloomCounts on simulated 102 # inputs match the ground truth and the empirical standard deviation matches 103 # EstimateBloomCounts outputs. 104 # 105 # Input: 106 # repetitions: the number of runs ofEstimateBloomCounts 107 # title: label 108 # params: params vector 109 # map: the map table 110 # pdf: probability density function of the distribution from which simulated 111 # clients are sampled 112 # total: number of reports 113 114 results <- RunMultipleTests(title, CheckEstimatesAndStdsHelper, repetitions, 115 params, map, pdf, total) 116 117 estimates <- abind(lapply(results, function(r) r$estimates), along = 3) 118 stds <- abind(lapply(results, function(r) r$stds), along = 3) 119 120 ave_e <- apply(estimates, 1:2, mean) 121 observed_stds <- apply(estimates, 1:2, sd) 122 ave_stds <- apply(stds, 1:2, mean) 123 124 ground_truth <- matrix(map %*% pdf, nrow = params$m, byrow = TRUE) 125 126 checkTrue(!any(abs(ave_e - ground_truth) > 1E-9 + # tolerance level 127 (ave_stds / repetitions^.5) * 5), 128 "Averages deviate too much from expectations.") 129 130 checkTrue(!any(observed_stds > ave_stds * (1 + 5 * repetitions^.5)), 131 "Expected standard deviations are too high") 132 133 checkTrue(!any(observed_stds < ave_stds * (1 - 5 * repetitions^.5)), 134 "Expected standard deviations are too low") 135} 136 137TestEstimateBloomCounts <- function() { 138 # Unit tests for the EstimateBloomCounts function. 139 140 report4x2 <- list(k = 4, m = 2) # 2 cohorts, 4 bits each 141 map0 <- Matrix(0, nrow = 8, ncol = 3, sparse = TRUE) # 3 possible values 142 map0[1,] <- c(1, 0, 0) 143 map0[2,] <- c(0, 1, 0) 144 map0[3,] <- c(0, 0, 1) 145 map0[4,] <- c(1, 1, 1) # 4th bit of the first cohort gets signal from all 146 map0[5,] <- c(0, 0, 1) # 1st bit of the second cohort gets signal from v3 147 148 colnames(map0) <- c('v1', 'v2', 'v3') 149 150 pdf0 <- c(1/2, 1/3, 1/6) 151 names(pdf0) <- colnames(map0) 152 153 noise0 <- list(p = 0, q = 1, f = 0) # no noise at all 154 155 CheckEstimatesAndStds(repetitions = 1000, "Testing estimates and stds (1/3)", 156 c(report4x2, noise0), map0, pdf0, 100) 157 158 noise1 <- list(p = 0.4, q = .6, f = 0.5) 159 CheckEstimatesAndStds(repetitions = 1000, "Testing estimates and stds (2/3)", 160 c(report4x2, noise1), map0, pdf0, 100) 161 162 # MEDIUM TEST: 100 values, 32 cohorts, 8 bits each, 10^6 reports 163 values <- 100 164 165 report8x32 <- list(k = 8, m = 32) # 32 cohorts, 8 bits each 166 167 map1 <- matrix(rbinom(32 * 8 * values, 1, .25), nrow = 32 * 8, ncol = values) 168 169 colnames(map1) <- sprintf("v%d", 1:values) 170 171 pdf1 <- ComputePdf("zipf1", values) 172 173 CheckEstimatesAndStds(repetitions = 100, "Testing estimates and stds (3/3)", 174 c(report8x32, noise1), map1, pdf1, 10^9) 175} 176 177CheckDecodeHelper <- function(params, map, pdf, num_clients, 178 tolerance_l1, tolerance_linf) { 179 # Helper function for TestDecode. Simulates a RAPPOR run and checks results of 180 # Decode's output against the ground truth. Output is returned as a list. 181 182 partition <- RandomPartition(num_clients, pdf) 183 counts <- GenerateCounts(params, map, partition, 1) 184 total <- sum(partition) 185 186 decoded <- Decode(counts, map, params, quiet = TRUE) 187 188 decoded_partition <- setNames(decoded$fit$estimate, decoded$fit$string) 189 190 checkTrue(L1Distance(decoded_partition, partition) < total^.5 * tolerance_l1, 191 sprintf("L1 distance is too large: \ 192 L1Distance = %f, total^0.5 * tolerance_l1 = %f", 193 L1Distance(decoded_partition, partition), 194 total^0.5 * tolerance_l1)) 195 196 checkTrue(LInfDistance(decoded_partition, partition) < 197 max(partition)^.5 * tolerance_linf, 198 sprintf("L_inf distance is too large: \ 199 L1Distance = %f, max(partition)^0.5 * tolerance_linf = %f", 200 L1Distance(decoded_partition, partition), 201 max(partition)^0.5 * tolerance_linf)) 202 203 list(estimates = decoded_partition, 204 stds = setNames(decoded$fit$std_error, decoded$fit$string)) 205} 206 207CheckDecodeAveAndStds <- function(...) { 208 # Runs Decode multiple times (specified by the repetition argument), checks 209 # individuals runs against the ground truth, and the estimates of the standard 210 # error against empirical observations. 211 212 results <- RunMultipleTests(...) 213 214 estimates <- matrix(nrow = 0, ncol = 0) 215 lapply(results, function(r) MatrixVectorMerge(estimates, r$estimates)) 216 217 stds <- matrix(nrow = 0, ncol = 0) 218 lapply(results, function(r) MatrixVectorMerge(stds, r$stds)) 219 220 empirical_stds <- apply(estimates, 2, sd, na.rm = TRUE) 221 estimated_stds <- apply(stds, 2, mean, na.rm = TRUE) 222 223 if(dim(estimates)[1] > 1) { 224 checkTrue(any(estimated_stds > empirical_stds / 2), 225 "Our estimate for the standard deviation is too low") 226 227 checkTrue(any(estimated_stds < empirical_stds * 3), 228 "Our estimate for the standard deviation is too high") 229 } 230} 231 232TestDecode <- function() { 233 # Unit tests for the Decode function. 234 235 # TOY TESTS: three values, 2 cohorts, 4 bits each 236 237 params_4x2 <- list(k = 4, m = 2, h = 2) # 2 cohorts, 4 bits each 238 map0 <- Matrix(0, nrow = 8, ncol = 3, sparse = TRUE) # 3 possible values 239 map0[1,] <- c(1, 0, 0) 240 map0[2,] <- c(0, 1, 0) 241 map0[3,] <- c(0, 0, 1) 242 map0[4,] <- c(1, 1, 1) # 4th bit of the first cohort gets signal from all 243 map0[5,] <- c(0, 0, 1) # 1st bit of the second cohort gets signal from v3 244 245 colnames(map0) <- c('v1', 'v2', 'v3') 246 distribution0 <- setNames(c(1/2, 1/3, 1/6), colnames(map0)) 247 248 # Even in the absence of noise, the inferred counts won't necessarily 249 # match the ground truth. Must be close enough though. 250 noise0 <- list(p = 0, q = 1, f = 0) # no noise whatsoever 251 252 # Args are: message str, test function, # repetitions, 253 # params, map, true pdf, # clients, 254 # tolerances 255 CheckDecodeAveAndStds("Testing Decode (1/5)", CheckDecodeHelper, 100, 256 c(params_4x2, noise0), map0, distribution0, 100, 257 tolerance_l1 = 5, 258 tolerance_linf = 3) 259 260 noise1 <- list(p = .4, q = .6, f = .5) # substantial noise, very few reports 261 CheckDecodeAveAndStds("Testing Decode (2/5)", CheckDecodeHelper, 100, 262 c(params_4x2, noise1), map0, distribution0, 100, 263 tolerance_l1 = 20, 264 tolerance_linf = 20) 265 266 # substantial noise, many reports 267 CheckDecodeAveAndStds("Testing Decode (3/5)", CheckDecodeHelper, 100, 268 c(params_4x2, noise1), map0, distribution0, 100000, 269 tolerance_l1 = 50, 270 tolerance_linf = 40) 271 272 # MEDIUM TEST: 100 values, 32 cohorts, 8 bits each, 10^6 reports 273 num_values <- 100 274 275 params_8x32 <- list(k = 8, m = 32, h = 2) # 32 cohorts, 8 bits each 276 277 map1 <- matrix(rbinom(32 * 8 * num_values, 1, .25), nrow = 32 * 8, ncol = 278 num_values) 279 280 colnames(map1) <- sprintf("v%d", 1:num_values) 281 282 distribution1 <- ComputePdf("zipf1", num_values) 283 names(distribution1) <- colnames(map1) 284 CheckDecodeAveAndStds("Testing Decode (4/5)", CheckDecodeHelper, 100, 285 c(params_8x32, noise1), map1, distribution1, 10^6, 286 tolerance_l1 = num_values * 3, 287 tolerance_linf = 100) 288 289 # Testing LASSO: 500 values, 32 cohorts, 8 bits each, 10^6 reports 290 num_values <- 500 291 292 params_8x32 <- list(k = 8, m = 32, h = 2) # 32 cohorts, 8 bits each 293 294 map2 <- matrix(rbinom(32 * 8 * num_values, 1, .25), nrow = 32 * 8, ncol = 295 num_values) 296 297 colnames(map2) <- sprintf("v%d", 1:num_values) 298 299 distribution2 <- ComputePdf("zipf1.5", num_values) 300 names(distribution2) <- colnames(map2) 301 302 CheckDecodeAveAndStds("Testing Decode (5/5)", CheckDecodeHelper, 1, 303 c(params_8x32, noise1), map2, distribution2, 10^6, 304 tolerance_l1 = num_values * 3, 305 tolerance_linf = 80) 306 307} 308 309TestDecodeBool <- function() { 310 # Testing Boolean Decode 311 num_values <- 2 312 # 1 bit; rest of the params don't matter 313 params_bool <- list(k = 1, m = 128, h = 2) 314 # setting up map_bool to be consistent with the Decode API and for 315 # GenerateCounts() 316 map_bool <- matrix(c(0, 1), nrow = 128 * 1, ncol = num_values, byrow = TRUE) 317 318 colnames(map_bool) <- c("FALSE", "TRUE") 319 distribution_bool <- ComputePdf("zipf1.5", num_values) 320 names(distribution_bool) <- colnames(map_bool) 321 noise2 <- list(p = 0.25, q = 0.75, f = 0.5) 322 323 # tolerance_l1 set to four standard deviations to avoid any flakiness in 324 # tests 325 CheckDecodeAveAndStds("Testing .DecodeBoolean (1/3)", CheckDecodeHelper, 100, 326 c(params_bool, noise2), map_bool, distribution_bool, 327 10^6, 328 tolerance_l1 = 4 * num_values, 329 tolerance_linf = 80) 330 331 noise1 <- list(p = .4, q = .6, f = .5) # substantial noise => 7 stddevs error 332 CheckDecodeAveAndStds("Testing .DecodeBoolean (2/3)", CheckDecodeHelper, 100, 333 c(params_bool, noise1), map_bool, distribution_bool, 334 10^6, 335 tolerance_l1 = 7 * num_values, 336 tolerance_linf = 80) 337 338 distribution_near_zero <- c(0.999, 0.001) 339 names(distribution_near_zero) <- colnames(map_bool) 340 341 CheckDecodeAveAndStds("Testing .DecodeBoolean (3/3)", CheckDecodeHelper, 100, 342 c(params_bool, noise2), map_bool, 343 distribution_near_zero, 10^6, 344 tolerance_l1 = 4 * num_values, 345 tolerance_linf = 80) 346} 347 348RunAll <- function() { 349 TestEstimateBloomCounts() 350 TestDecode() 351 TestDecodeBool() 352} 353 354RunAll() 355