1#!/usr/bin/env Rscript 2# 3# Copyright 2014 Google Inc. All rights reserved. 4# 5# Licensed under the Apache License, Version 2.0 (the "License"); 6# you may not use this file except in compliance with the License. 7# You may obtain a copy of the License at 8# 9# http://www.apache.org/licenses/LICENSE-2.0 10# 11# Unless required by applicable law or agreed to in writing, software 12# distributed under the License is distributed on an "AS IS" BASIS, 13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14# See the License for the specific language governing permissions and 15# limitations under the License. 16 17# Simple tool that wraps the analysis/R library. 18# 19# To run this you need: 20# - ggplot 21# - optparse 22# - glmnet -- dependency of analysis library 23 24library(optparse) 25 26# For unit tests 27is_main <- (length(sys.frames()) == 0) 28 29# Do command line parsing first to catch errors. Loading libraries in R is 30# slow. 31if (is_main) { 32 option_list <- list( 33 make_option(c("-t", "--title"), help="Plot Title") 34 ) 35 parsed <- parse_args(OptionParser(option_list = option_list), 36 positional_arguments = 3) # input and output 37} 38 39library(ggplot2) 40 41# Use CairoPNG if available. Useful for headless R. 42if (library(Cairo, quietly = TRUE, logical.return = TRUE)) { 43 png_func = CairoPNG 44 cat('Using CairoPNG\n') 45} else { 46 png_func = png 47 cat('Using png\n') 48} 49 50source("analysis/R/read_input.R") 51source("analysis/R/decode.R") 52source("analysis/R/util.R") 53 54source("analysis/R/alternative.R") # temporary 55 56LoadContext <- function(prefix_case) { 57 # Creates the context, filling it with privacy parameters 58 # Arg: 59 # prefix_case: path prefix to the test case, e.g. '_tmp/exp' 60 61 p <- paste0(prefix_case, '_params.csv') 62 63 params <- ReadParameterFile(p) 64 65 ctx <- new.env() 66 67 ctx$params <- params # so we can write it out later 68 69 ctx 70} 71 72RunRappor <- function(prefix_case, prefix_instance, ctx) { 73 # Reads counts, map files, runs RAPPOR analysis engine. 74 # Args: 75 # prefix_case: path prefix to the test case, e.g., '_tmp/exp' 76 # prefix_instance: path prefix to the test instance, e.g., '_tmp/exp/1' 77 # ctx: context file with params field filled in 78 79 c <- paste0(prefix_instance, '_counts.csv') 80 counts <- ReadCountsFile(c, ctx$params) 81 82 m <- paste0(prefix_case, '_map.csv') 83 84 # Switch to LoadMapFile if want to cache the result 85 map <- ReadMapFile(m, ctx$params) 86 87 # Main decode.R API 88 timing <- system.time({ 89 res <- Decode(counts, map$map, ctx$params) 90 }) 91 92 # The line is searched for, and elapsed time is extracted, by make_summary.py. 93 # Should the formating or wording change, make_summary must be updated too. 94 Log("Inference took %.3f seconds", timing[["elapsed"]]) 95 96 if (is.null(res)) { 97 stop("RAPPOR analysis failed.") 98 } 99 100 Log("Decoded results:") 101 str(res$fit) 102 103 res$fit 104} 105 106LoadActual <- function(prefix_instance) { 107 hist_path <- paste0(prefix_instance, '_hist.csv') # case.csv 108 109 # gen_counts.R (fast_counts mode) outputs this, since we never have true 110 # client values. 111 if (file.exists(hist_path)) { 112 return(read.csv(hist_path)) 113 } 114 115 # Load ground truth into context 116 input_path <- paste0(prefix_instance, '_true_values.csv') # case.csv 117 client_values <- read.csv(input_path) 118 119 # Create a histogram, or R "table". Column 2 is the true value. 120 t <- table(client_values$value) 121 122 d <- as.data.frame(t) # convert it to a data frame with 'string' and 'count' columns 123 colnames(d) <- c('string', 'count') 124 125 d # return this data frame 126} 127 128CompareRapporVsActual <- function(ctx) { 129 # Prepare input data to be plotted 130 131 actual <- ctx$actual # from the ground truth file 132 rappor <- ctx$rappor # from output of AnalyzeRAPPOR 133 134 # "s12" -> 12, for graphing 135 StringToInt <- function(x) as.integer(substring(x, 2)) 136 137 actual_values <- StringToInt(actual$string) 138 rappor_values <- StringToInt(rappor$string) 139 140 # False negatives: AnalyzeRAPPOR failed to find this value (e.g. because it 141 # occurs too rarely) 142 actual_only <- setdiff(actual_values, rappor_values) 143 144 # False positives: AnalyzeRAPPOR attributed a proportion to a string in the 145 # map that wasn't in the true input. 146 rappor_only <- setdiff(rappor_values, actual_values) 147 148 total <- sum(actual$count) 149 a <- data.frame(index = actual_values, 150 # Calculate the true proportion 151 proportion = actual$count / total, 152 dist = "actual") 153 154 r <- data.frame(index = rappor_values, 155 proportion = rappor$proportion, 156 dist = rep("rappor", length(rappor_values))) 157 158 # Extend a and r with the values that they are missing. 159 if (length(rappor_only) > 0) { 160 z <- data.frame(index = rappor_only, 161 proportion = 0.0, 162 dist = "actual") 163 a <- rbind(a, z) 164 } 165 if (length(actual_only) > 0) { 166 z <- data.frame(index = actual_only, 167 proportion = 0.0, 168 dist = "rappor") 169 r <- rbind(r, z) 170 } 171 172 # IMPORTANT: Now a and r have the same rows, but in the wrong order. Sort by index. 173 a <- a[order(a$index), ] 174 r <- r[order(r$index), ] 175 176 # L1 distance between actual and rappor distributions 177 l1 <- sum(abs(a$proportion - r$proportion)) 178 # The max L1 distance between two distributions is 2; the max total variation 179 # distance is 1. 180 total_variation <- l1 / 2 181 182 # Choose false positive strings and their proportion from rappor estimates 183 false_pos <- r[r$index %in% rappor_only, c('index', 'proportion')] 184 false_neg <- a[a$index %in% actual_only, c('index', 'proportion')] 185 186 Log("False positives:") 187 str(false_pos) 188 189 Log("False negatives:") 190 str(false_neg) 191 192 # NOTE: We should call Decode() directly, and then num_rappor is 193 # metrics$num_detected, and sum_proportion is metrics$allocated_mass. 194 metrics <- list( 195 num_actual = nrow(actual), # data frames 196 num_rappor = nrow(rappor), 197 num_false_pos = nrow(false_pos), 198 num_false_neg = nrow(false_neg), 199 total_variation = total_variation, 200 sum_proportion = sum(rappor$proportion) 201 ) 202 203 Log("Metrics:") 204 str(metrics) 205 206 # Return plot data and metrics 207 list(plot_data = rbind(r, a), metrics = metrics) 208} 209 210# Colors selected to be friendly to the color blind: 211# http://www.cookbook-r.com/Graphs/Colors_%28ggplot2%29/ 212palette <- c("#E69F00", "#56B4E9") 213 214PlotAll <- function(d, title) { 215 # NOTE: geom_bar makes a histogram by default; need stat = "identity" 216 g <- ggplot(d, aes(x = index, y = proportion, fill = factor(dist))) 217 b <- geom_bar(stat = "identity", width = 0.7, 218 position = position_dodge(width = 0.8)) 219 t <- ggtitle(title) 220 g + b + t + scale_fill_manual(values=palette) 221} 222 223WritePlot <- function(p, outdir, width = 800, height = 600) { 224 filename <- file.path(outdir, 'dist.png') 225 png_func(filename, width=width, height=height) 226 plot(p) 227 dev.off() 228 Log('Wrote %s', filename) 229} 230 231WriteSummary <- function(metrics, outdir) { 232 filename <- file.path(outdir, 'metrics.csv') 233 write.csv(metrics, file = filename, row.names = FALSE) 234 Log('Wrote %s', filename) 235} 236 237main <- function(parsed) { 238 args <- parsed$args 239 options <- parsed$options 240 241 input_case_prefix <- args[[1]] 242 input_instance_prefix <- args[[2]] 243 output_dir <- args[[3]] 244 245 # increase ggplot font size globally 246 theme_set(theme_grey(base_size = 16)) 247 248 # NOTE: It takes more than 2000+ ms to get here, while the analysis only 249 # takes 500 ms or so (as measured by system.time). 250 251 ctx <- LoadContext(input_case_prefix) 252 ctx$rappor <- RunRappor(input_case_prefix, input_instance_prefix, ctx) 253 ctx$actual <- LoadActual(input_instance_prefix) 254 255 d <- CompareRapporVsActual(ctx) 256 p <- PlotAll(d$plot_data, options$title) 257 258 WriteSummary(d$metrics, output_dir) 259 WritePlot(p, output_dir) 260} 261 262if (is_main) { 263 main(parsed) 264} 265