#!/usr/bin/Rscript
# Copyright 2014 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

library(RUnit)
library(abind)
library(Matrix)  # for Matrix(), used to build sparse map matrices in the tests below

source('analysis/R/decode.R')
source('tests/gen_counts.R')

L1Distance <- function(X, Y) {
  # Computes the L1 distance between two named vectors
  common <- intersect(names(X), names(Y))

  L1_intersect <- sum(abs(X[common] - Y[common]))
  L1_X_minus_Y <- sum(X[!names(X) %in% common])
  L1_Y_minus_X <- sum(Y[!names(Y) %in% common])

  (L1_intersect + L1_X_minus_Y + L1_Y_minus_X) / 2
}
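
# For example, L1Distance(c(a = 1, b = 2), c(b = 5, c = 1)) evaluates to
# (|2 - 5| + 1 + 1) / 2 = 2.5: the shared name "b" contributes its absolute
# difference, and the unmatched names "a" and "c" contribute their full mass.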

LInfDistance <- function(X, Y) {
  # Computes the L_infinity distance between two named vectors
  common <- intersect(names(X), names(Y))

  max(abs(X[common] - Y[common]),
      abs(X[!names(X) %in% common]),
      abs(Y[!names(Y) %in% common]))
}
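
# Likewise, LInfDistance(c(a = 1, b = 2), c(b = 5, c = 1)) evaluates to
# max(|2 - 5|, 1, 1) = 3, the largest discrepancy at any single name.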

MatrixVectorMerge <- function(mat, vec) {
  # Attaches a vector to a matrix, matching corresponding column names

  mat_only <- setdiff(colnames(mat), names(vec))
  vec_only <- setdiff(names(vec), colnames(mat))

  # extend the vector with missing columns
  vec_long <- c(vec, setNames(rep(NA, length(mat_only)), mat_only))

  # extend the matrix with missing columns
  newcols <- matrix(NA, nrow = nrow(mat), ncol = length(vec_only))
  colnames(newcols) <- vec_only
  mat_long <- cbind(mat, newcols)

  # Now vec and mat have the same columns, but in the wrong order. Sort the
  # columns lexicographically.
  if(length(vec_long) > 0) {
    mat_long <- mat_long[, order(colnames(mat_long)), drop = FALSE]
    vec_long <- vec_long[order(names(vec_long))]
  }

  rbind(mat_long, vec_long)
}
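
# For example, MatrixVectorMerge(rbind(c(a = 1, b = 2)), c(b = 3, c = 4))
# returns a 2 x 3 matrix with columns a, b, c; the entries that neither input
# provides (column "c" of the first row, column "a" of the new row) are NA.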

RunMultipleTests <- function(title, fun, repetitions, ...) {
  # Run a function with an annotated progress indicator. The function's outputs
  # are concatenated and returned as a list of length repetitions.
  cat(title, ": ")

  if(repetitions == 1) {
    # only run once
    results <- list(fun(...))

    cat(" Done.\n")
  } else {  # run multiple times
    pb <- txtProgressBar(min = 0, max = repetitions,
                         width = getOption("width") - 20 - nchar(title))

    results <- vector(mode = "list", repetitions)
    for(i in 1:repetitions) {
      setTxtProgressBar(pb, i)
      results[[i]] <- fun(...)
    }
    cat(" Done.")
    close(pb)
  }

  results
}
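
# For example, RunMultipleTests("Squaring", function(x) x^2, 3, 10) prints
# "Squaring :" followed by a progress bar and returns list(100, 100, 100),
# since any extra arguments (here, 10) are forwarded to each call through `...`.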

CheckEstimatesAndStdsHelper <- function(params, map, pdf, total) {
  # Helper function for TestEstimateBloomCounts.
  partition <- RandomPartition(total, pdf)
  counts <- GenerateCounts(params, map, partition, 1)

  EstimateBloomCounts(params, counts)
}

CheckEstimatesAndStds <- function(repetitions, title, params, map, pdf, total) {
  # Checks that the estimates returned by EstimateBloomCounts on simulated
  # inputs match the ground truth, and that the empirical standard deviations
  # match the standard deviations that EstimateBloomCounts reports.
  #
  # Input:
  #   repetitions: the number of runs of EstimateBloomCounts
  #   title: label
  #   params: params vector
  #   map: the map table
  #   pdf: probability density function of the distribution from which simulated
  #        clients are sampled
  #   total: number of reports

  results <- RunMultipleTests(title, CheckEstimatesAndStdsHelper, repetitions,
                              params, map, pdf, total)

  estimates <- abind(lapply(results, function(r) r$estimates), along = 3)
  stds <- abind(lapply(results, function(r) r$stds), along = 3)

  ave_e <- apply(estimates, 1:2, mean)
  observed_stds <- apply(estimates, 1:2, sd)
  ave_stds <- apply(stds, 1:2, mean)

  ground_truth <- matrix(map %*% pdf, nrow = params$m, byrow = TRUE)

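  # The average of `repetitions` estimates should lie within a few standard
  # errors of the ground truth. For instance, with repetitions = 1000 and a
  # per-cell std of 0.05, the allowed deviation below is about
  # 5 * 0.05 / sqrt(1000), i.e. roughly 0.008.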
  checkTrue(!any(abs(ave_e - ground_truth) > 1E-9 +  # tolerance level
                                             (ave_stds / repetitions^.5) * 5),
            "Averages deviate too much from expectations.")

  checkTrue(!any(observed_stds > ave_stds * (1 + 5 * repetitions^-.5)),
            "Expected standard deviations are too low")

  checkTrue(!any(observed_stds < ave_stds * (1 - 5 * repetitions^-.5)),
            "Expected standard deviations are too high")
}

TestEstimateBloomCounts <- function() {
  # Unit tests for the EstimateBloomCounts function.

  report4x2 <- list(k = 4, m = 2)  # 2 cohorts, 4 bits each
  map0 <- Matrix(0, nrow = 8, ncol = 3, sparse = TRUE)  # 3 possible values
  map0[1,] <- c(1, 0, 0)
  map0[2,] <- c(0, 1, 0)
  map0[3,] <- c(0, 0, 1)
  map0[4,] <- c(1, 1, 1)  # 4th bit of the first cohort gets signal from all
  map0[5,] <- c(0, 0, 1)  # 1st bit of the second cohort gets signal from v3

  colnames(map0) <- c('v1', 'v2', 'v3')

  pdf0 <- c(1/2, 1/3, 1/6)
  names(pdf0) <- colnames(map0)

  noise0 <- list(p = 0, q = 1, f = 0)  # no noise at all

  CheckEstimatesAndStds(repetitions = 1000, "Testing estimates and stds (1/3)",
                        c(report4x2, noise0), map0, pdf0, 100)

  noise1 <- list(p = 0.4, q = .6, f = 0.5)
  CheckEstimatesAndStds(repetitions = 1000, "Testing estimates and stds (2/3)",
                        c(report4x2, noise1), map0, pdf0, 100)

  # MEDIUM TEST: 100 values, 32 cohorts, 8 bits each, 10^9 reports
  values <- 100

  report8x32 <- list(k = 8, m = 32)  # 32 cohorts, 8 bits each

  map1 <- matrix(rbinom(32 * 8 * values, 1, .25), nrow = 32 * 8, ncol = values)

  colnames(map1) <- sprintf("v%d", 1:values)

  pdf1 <- ComputePdf("zipf1", values)

  CheckEstimatesAndStds(repetitions = 100, "Testing estimates and stds (3/3)",
                        c(report8x32, noise1), map1, pdf1, 10^9)
}

CheckDecodeHelper <- function(params, map, pdf, num_clients,
                              tolerance_l1, tolerance_linf) {
  # Helper function for TestDecode. Simulates a RAPPOR run and checks Decode's
  # output against the ground truth. Output is returned as a list.

  partition <- RandomPartition(num_clients, pdf)
  counts <- GenerateCounts(params, map, partition, 1)
  total <- sum(partition)

  decoded <- Decode(counts, map, params, quiet = TRUE)

  decoded_partition <- setNames(decoded$fit$estimate, decoded$fit$string)

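  # The tolerances scale with the expected sampling noise: for instance, with
  # total = 100 reports and tolerance_l1 = 5, the decoded distribution may
  # differ from the true partition by at most sqrt(100) * 5 = 50 in L1.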
  checkTrue(L1Distance(decoded_partition, partition) < total^.5 * tolerance_l1,
            sprintf("L1 distance is too large: L1Distance = %f, total^0.5 * tolerance_l1 = %f",
                    L1Distance(decoded_partition, partition),
                    total^0.5 * tolerance_l1))

  checkTrue(LInfDistance(decoded_partition, partition) <
              max(partition)^.5 * tolerance_linf,
            sprintf("L_inf distance is too large: LInfDistance = %f, max(partition)^0.5 * tolerance_linf = %f",
                    LInfDistance(decoded_partition, partition),
                    max(partition)^0.5 * tolerance_linf))

  list(estimates = decoded_partition,
       stds = setNames(decoded$fit$std_error, decoded$fit$string))
}

CheckDecodeAveAndStds <- function(...) {
  # Runs Decode multiple times (the number of runs is given by the repetitions
  # argument), checks individual runs against the ground truth, and checks the
  # estimates of the standard error against empirical observations.

  results <- RunMultipleTests(...)

  estimates <- matrix(nrow = 0, ncol = 0)
  for (r in results)
    estimates <- MatrixVectorMerge(estimates, r$estimates)

  stds <- matrix(nrow = 0, ncol = 0)
  for (r in results)
    stds <- MatrixVectorMerge(stds, r$stds)

  empirical_stds <- apply(estimates, 2, sd, na.rm = TRUE)
  estimated_stds <- apply(stds, 2, mean, na.rm = TRUE)

  if(dim(estimates)[1] > 1) {
    checkTrue(any(estimated_stds > empirical_stds / 2),
              "Our estimate for the standard deviation is too low")

    checkTrue(any(estimated_stds < empirical_stds * 3),
              "Our estimate for the standard deviation is too high")
  }
}

TestDecode <- function() {
  # Unit tests for the Decode function.

  # TOY TESTS: three values, 2 cohorts, 4 bits each

  params_4x2 <- list(k = 4, m = 2, h = 2)  # 2 cohorts, 4 bits each
  map0 <- Matrix(0, nrow = 8, ncol = 3, sparse = TRUE)  # 3 possible values
  map0[1,] <- c(1, 0, 0)
  map0[2,] <- c(0, 1, 0)
  map0[3,] <- c(0, 0, 1)
  map0[4,] <- c(1, 1, 1)  # 4th bit of the first cohort gets signal from all
  map0[5,] <- c(0, 0, 1)  # 1st bit of the second cohort gets signal from v3

  colnames(map0) <- c('v1', 'v2', 'v3')
  distribution0 <- setNames(c(1/2, 1/3, 1/6), colnames(map0))

  # Even in the absence of noise, the inferred counts won't necessarily
  # match the ground truth. Must be close enough though.
  noise0 <- list(p = 0, q = 1, f = 0)  # no noise whatsoever

  # Args are: message str, test function, # repetitions,
  #           params, map, true pdf, # clients,
  #           tolerances
  CheckDecodeAveAndStds("Testing Decode (1/5)", CheckDecodeHelper, 100,
                        c(params_4x2, noise0), map0, distribution0, 100,
                        tolerance_l1 = 5,
                        tolerance_linf = 3)

  noise1 <- list(p = .4, q = .6, f = .5)  # substantial noise, very few reports
  CheckDecodeAveAndStds("Testing Decode (2/5)", CheckDecodeHelper, 100,
                        c(params_4x2, noise1), map0, distribution0, 100,
                        tolerance_l1 = 20,
                        tolerance_linf = 20)

  # substantial noise, many reports
  CheckDecodeAveAndStds("Testing Decode (3/5)", CheckDecodeHelper, 100,
                        c(params_4x2, noise1), map0, distribution0, 100000,
                        tolerance_l1 = 50,
                        tolerance_linf = 40)

  # MEDIUM TEST: 100 values, 32 cohorts, 8 bits each, 10^6 reports
  num_values <- 100

  params_8x32 <- list(k = 8, m = 32, h = 2)  # 32 cohorts, 8 bits each

  map1 <- matrix(rbinom(32 * 8 * num_values, 1, .25), nrow = 32 * 8,
                 ncol = num_values)

  colnames(map1) <- sprintf("v%d", 1:num_values)

  distribution1 <- ComputePdf("zipf1", num_values)
  names(distribution1) <- colnames(map1)
  CheckDecodeAveAndStds("Testing Decode (4/5)", CheckDecodeHelper, 100,
                        c(params_8x32, noise1), map1, distribution1, 10^6,
                        tolerance_l1 = num_values * 3,
                        tolerance_linf = 100)

  # Testing LASSO: 500 values, 32 cohorts, 8 bits each, 10^6 reports
  num_values <- 500

  params_8x32 <- list(k = 8, m = 32, h = 2)  # 32 cohorts, 8 bits each

  map2 <- matrix(rbinom(32 * 8 * num_values, 1, .25), nrow = 32 * 8,
                 ncol = num_values)

  colnames(map2) <- sprintf("v%d", 1:num_values)

  distribution2 <- ComputePdf("zipf1.5", num_values)
  names(distribution2) <- colnames(map2)

  CheckDecodeAveAndStds("Testing Decode (5/5)", CheckDecodeHelper, 1,
                        c(params_8x32, noise1), map2, distribution2, 10^6,
                        tolerance_l1 = num_values * 3,
                        tolerance_linf = 80)
}

TestDecodeBool <- function() {
  # Testing Boolean Decode
  num_values <- 2
  # 1 bit; rest of the params don't matter
  params_bool <- list(k = 1, m = 128, h = 2)
  # setting up map_bool to be consistent with the Decode API and for
  # GenerateCounts()
  map_bool <- matrix(c(0, 1), nrow = 128 * 1, ncol = num_values, byrow = TRUE)

  colnames(map_bool) <- c("FALSE", "TRUE")
  distribution_bool <- ComputePdf("zipf1.5", num_values)
  names(distribution_bool) <- colnames(map_bool)
  noise2 <- list(p = 0.25, q = 0.75, f = 0.5)

  # tolerance_l1 set to four standard deviations to avoid any flakiness in
  # tests
  CheckDecodeAveAndStds("Testing .DecodeBoolean (1/3)", CheckDecodeHelper, 100,
                        c(params_bool, noise2), map_bool, distribution_bool,
                        10^6,
                        tolerance_l1 = 4 * num_values,
                        tolerance_linf = 80)

  noise1 <- list(p = .4, q = .6, f = .5)  # substantial noise => 7 stddevs error
  CheckDecodeAveAndStds("Testing .DecodeBoolean (2/3)", CheckDecodeHelper, 100,
                        c(params_bool, noise1), map_bool, distribution_bool,
                        10^6,
                        tolerance_l1 = 7 * num_values,
                        tolerance_linf = 80)

  distribution_near_zero <- c(0.999, 0.001)
  names(distribution_near_zero) <- colnames(map_bool)

  CheckDecodeAveAndStds("Testing .DecodeBoolean (3/3)", CheckDecodeHelper, 100,
                        c(params_bool, noise2), map_bool,
                        distribution_near_zero, 10^6,
                        tolerance_l1 = 4 * num_values,
                        tolerance_linf = 80)
}

RunAll <- function() {
  TestEstimateBloomCounts()
  TestDecode()
  TestDecodeBool()
}

RunAll()