• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env Rscript
2#
3# Copyright 2014 Google Inc. All rights reserved.
4#
5# Licensed under the Apache License, Version 2.0 (the "License");
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9#     http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16
17# Simple tool that wraps the analysis/R library.
18#
19# To run this you need:
20# - ggplot
21# - optparse
22# - glmnet -- dependency of analysis library
23
24library(optparse)
25
26# For unit tests
27is_main <- (length(sys.frames()) == 0)
28
29# Do command line parsing first to catch errors.  Loading libraries in R is
30# slow.
31if (is_main) {
32  option_list <- list(
33     make_option(c("-t", "--title"), help="Plot Title")
34     )
35  parsed <- parse_args(OptionParser(option_list = option_list),
36                       positional_arguments = 3)  # input and output
37}
38
39library(ggplot2)
40
41# Use CairoPNG if available.  Useful for headless R.
42if (library(Cairo, quietly = TRUE, logical.return = TRUE)) {
43  png_func = CairoPNG
44  cat('Using CairoPNG\n')
45} else {
46  png_func = png
47  cat('Using png\n')
48}
49
50source("analysis/R/read_input.R")
51source("analysis/R/decode.R")
52source("analysis/R/util.R")
53
54source("analysis/R/alternative.R")  # temporary
55
56LoadContext <- function(prefix_case) {
57  # Creates the context, filling it with privacy parameters
58  # Arg:
59  #    prefix_case: path prefix to the test case, e.g. '_tmp/exp'
60
61  p <- paste0(prefix_case, '_params.csv')
62
63  params <- ReadParameterFile(p)
64
65  ctx <- new.env()
66
67  ctx$params <- params  # so we can write it out later
68
69  ctx
70}
71
72RunRappor <- function(prefix_case, prefix_instance, ctx) {
73  # Reads counts, map files, runs RAPPOR analysis engine.
74  # Args:
75  #    prefix_case: path prefix to the test case, e.g., '_tmp/exp'
76  #    prefix_instance: path prefix to the test instance, e.g., '_tmp/exp/1'
77  #    ctx: context file with params field filled in
78
79  c <- paste0(prefix_instance, '_counts.csv')
80  counts <- ReadCountsFile(c, ctx$params)
81
82  m <- paste0(prefix_case, '_map.csv')
83
84  # Switch to LoadMapFile if want to cache the result
85  map <- ReadMapFile(m, ctx$params)
86
87  # Main decode.R API
88  timing <- system.time({
89    res <- Decode(counts, map$map, ctx$params)
90  })
91
92  # The line is searched for, and elapsed time is extracted, by make_summary.py.
93  # Should the formating or wording change, make_summary must be updated too.
94  Log("Inference took %.3f seconds", timing[["elapsed"]])
95
96  if (is.null(res)) {
97    stop("RAPPOR analysis failed.")
98  }
99
100  Log("Decoded results:")
101  str(res$fit)
102
103  res$fit
104}
105
106LoadActual <- function(prefix_instance) {
107  hist_path <- paste0(prefix_instance, '_hist.csv')  # case.csv
108
109  # gen_counts.R (fast_counts mode) outputs this, since we never have true
110  # client values.
111  if (file.exists(hist_path)) {
112    return(read.csv(hist_path))
113  }
114
115  # Load ground truth into context
116  input_path <- paste0(prefix_instance, '_true_values.csv')  # case.csv
117  client_values <- read.csv(input_path)
118
119  # Create a histogram, or R "table".  Column 2 is the true value.
120  t <- table(client_values$value)
121
122  d <- as.data.frame(t)  # convert it to a data frame with 'string' and 'count' columns
123  colnames(d) <- c('string', 'count')
124
125  d  # return this data frame
126}
127
128CompareRapporVsActual <- function(ctx) {
129  # Prepare input data to be plotted
130
131  actual <- ctx$actual  # from the ground truth file
132  rappor <- ctx$rappor  # from output of AnalyzeRAPPOR
133
134  # "s12" -> 12, for graphing
135  StringToInt <- function(x) as.integer(substring(x, 2))
136
137  actual_values <- StringToInt(actual$string)
138  rappor_values <- StringToInt(rappor$string)
139
140  # False negatives: AnalyzeRAPPOR failed to find this value (e.g. because it
141  # occurs too rarely)
142  actual_only <- setdiff(actual_values, rappor_values)
143
144  # False positives: AnalyzeRAPPOR attributed a proportion to a string in the
145  # map that wasn't in the true input.
146  rappor_only <- setdiff(rappor_values, actual_values)
147
148  total <- sum(actual$count)
149  a <- data.frame(index = actual_values,
150                  # Calculate the true proportion
151                  proportion = actual$count / total,
152                  dist = "actual")
153
154  r <- data.frame(index = rappor_values,
155                  proportion = rappor$proportion,
156                  dist = rep("rappor", length(rappor_values)))
157
158  # Extend a and r with the values that they are missing.
159  if (length(rappor_only) > 0) {
160    z <- data.frame(index = rappor_only,
161                    proportion = 0.0,
162                    dist = "actual")
163    a <- rbind(a, z)
164  }
165  if (length(actual_only) > 0) {
166    z <- data.frame(index = actual_only,
167                    proportion = 0.0,
168                    dist = "rappor")
169    r <- rbind(r, z)
170  }
171
172  # IMPORTANT: Now a and r have the same rows, but in the wrong order.  Sort by index.
173  a <- a[order(a$index), ]
174  r <- r[order(r$index), ]
175
176  # L1 distance between actual and rappor distributions
177  l1 <- sum(abs(a$proportion - r$proportion))
178  # The max L1 distance between two distributions is 2; the max total variation
179  # distance is 1.
180  total_variation <- l1 / 2
181
182  # Choose false positive strings and their proportion from rappor estimates
183  false_pos <- r[r$index %in% rappor_only, c('index', 'proportion')]
184  false_neg <- a[a$index %in% actual_only, c('index', 'proportion')]
185
186  Log("False positives:")
187  str(false_pos)
188
189  Log("False negatives:")
190  str(false_neg)
191
192  # NOTE: We should call Decode() directly, and then num_rappor is
193  # metrics$num_detected, and sum_proportion is metrics$allocated_mass.
194  metrics <- list(
195      num_actual = nrow(actual),  # data frames
196      num_rappor = nrow(rappor),
197      num_false_pos = nrow(false_pos),
198      num_false_neg = nrow(false_neg),
199      total_variation = total_variation,
200      sum_proportion = sum(rappor$proportion)
201      )
202
203  Log("Metrics:")
204  str(metrics)
205
206  # Return plot data and metrics
207  list(plot_data = rbind(r, a), metrics = metrics)
208}
209
210# Colors selected to be friendly to the color blind:
211# http://www.cookbook-r.com/Graphs/Colors_%28ggplot2%29/
212palette <- c("#E69F00", "#56B4E9")
213
214PlotAll <- function(d, title) {
215  # NOTE: geom_bar makes a histogram by default; need stat = "identity"
216  g <- ggplot(d, aes(x = index, y = proportion, fill = factor(dist)))
217  b <- geom_bar(stat = "identity", width = 0.7,
218                position = position_dodge(width = 0.8))
219  t <- ggtitle(title)
220  g + b + t + scale_fill_manual(values=palette)
221}
222
223WritePlot <- function(p, outdir, width = 800, height = 600) {
224  filename <- file.path(outdir, 'dist.png')
225  png_func(filename, width=width, height=height)
226  plot(p)
227  dev.off()
228  Log('Wrote %s', filename)
229}
230
231WriteSummary <- function(metrics, outdir) {
232  filename <- file.path(outdir, 'metrics.csv')
233  write.csv(metrics, file = filename, row.names = FALSE)
234  Log('Wrote %s', filename)
235}
236
237main <- function(parsed) {
238  args <- parsed$args
239  options <- parsed$options
240
241  input_case_prefix <- args[[1]]
242  input_instance_prefix <- args[[2]]
243  output_dir <- args[[3]]
244
245  # increase ggplot font size globally
246  theme_set(theme_grey(base_size = 16))
247
248  # NOTE: It takes more than 2000+ ms to get here, while the analysis only
249  # takes 500 ms or so (as measured by system.time).
250
251  ctx <- LoadContext(input_case_prefix)
252  ctx$rappor <- RunRappor(input_case_prefix, input_instance_prefix, ctx)
253  ctx$actual <- LoadActual(input_instance_prefix)
254
255  d <- CompareRapporVsActual(ctx)
256  p <- PlotAll(d$plot_data, options$title)
257
258  WriteSummary(d$metrics, output_dir)
259  WritePlot(p, output_dir)
260}
261
262if (is_main) {
263  main(parsed)
264}
265