# Copyright 2014 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#
# Read parameter, counts and map files.

library(Matrix)

source.rappor <- function(rel_path) {
  # Source an R file whose path is given relative to the repository root.
  #
  # The root is taken from the RAPPOR_REPO environment variable and is
  # concatenated with rel_path verbatim (no separator is inserted), so a
  # non-empty RAPPOR_REPO must end with a path separator.  When the variable
  # is unset, rel_path is resolved against the current working directory.
  abs_path <- paste0(Sys.getenv("RAPPOR_REPO", ""), rel_path)
  source(abs_path)
}

source.rappor("analysis/R/util.R")  # for Log


ReadParameterFile <- function(params_file) {
  # Read parameter file. Format:
  # k, h, m, p, q, f
  # 128, 2, 8, 0.5, 0.75, 0.75
  #
  # Args:
  #   params_file: path to a CSV file with the header shown above followed by
  #     a single row of values.
  #
  # Returns:
  #   A named list with elements k, h, m, p, q and f.
  #
  # Stops if the file does not have exactly those six columns, in that order,
  # or does not have exactly one row of values.

  params <- as.list(read.csv(params_file))
  if (length(params) != 6) {
    stop("There should be exactly 6 columns in the parameter file.")
  }
  if (any(names(params) != c("k", "h", "m", "p", "q", "f"))) {
    stop("Parameter names must be k,h,m,p,q,f.")
  }
  # The readers below use params$m, params$k and params$h as scalars inside
  # if () conditions, so every parameter must be a single value.
  if (any(lengths(params) != 1)) {
    stop("The parameter file should contain exactly one row of values.")
  }
  params
}

# Handle the case of redundant cohorts, i.e. the counts file needs to be
# further aggregated to obtain counts for the number of cohorts specified in
# the params file.
#
# Rows r, r + m, r + 2m, ... of counts are summed column by column, giving a
# matrix with params$m rows and one column per column of counts.
#
# NOTE: Why is this happening?
AdjustCounts <- function(counts, params) {
  num_rows <- nrow(counts)
  if (num_rows %% params$m != 0) {
    stop(sprintf(
      "Cannot adjust counts: %s rows is not a multiple of m = %s",
      num_rows, params$m))
  }

  # Row r is assigned to group ((r - 1) mod m) + 1.
  cohort <- rep(seq_len(params$m), num_rows / params$m)
  adjusted <- apply(counts, 2, function(x) {
    tapply(x, cohort, sum)
  })

  # apply() returns a plain vector when each column reduces to a single value
  # (m == 1); restore the 1 x ncol matrix shape that callers expect.
  if (is.null(dim(adjusted))) {
    adjusted <- matrix(adjusted, nrow = 1,
                       dimnames = list(NULL, names(adjusted)))
  }
  adjusted
}

ReadCountsFile <- function(counts_file, params, adjust_counts = FALSE) {
  # Read in the counts file.
  #
  # Args:
  #   counts_file: path to a header-less CSV file with one row per cohort and
  #     k + 1 columns.
  #   params: parameter list as returned by ReadParameterFile; m and k are
  #     used here.
  #   adjust_counts: if TRUE, aggregate the rows down to params$m rows with
  #     AdjustCounts before validating.
  #
  # Returns:
  #   The counts as a matrix with m rows and k + 1 columns, or NULL if
  #   counts_file does not exist.
  #
  # Stops if the dimensions do not match params, or if any count is missing
  # or negative.

  if (!file.exists(counts_file)) {
    return(NULL)
  }
  counts <- read.csv(counts_file, header = FALSE)

  if (adjust_counts) {
    counts <- AdjustCounts(counts, params)
  }

  if (nrow(counts) != params$m) {
    stop(sprintf("Got %d rows in the counts file, expected m = %d",
                 nrow(counts), params$m))
  }

  if ((ncol(counts) - 1) != params$k) {
    stop(paste0("Counts file: number of columns should equal to k + 1: ",
                ncol(counts)))
  }

  # Check for missing values first: any(counts < 0) would be NA, and if (NA)
  # fails with an unhelpful "missing value where TRUE/FALSE needed".
  if (any(is.na(counts))) {
    stop("Counts file: counts must not contain missing values.")
  }

  if (any(counts < 0)) {
    stop("Counts file: all counts must be non-negative.")
  }

  # Turn counts from a data frame into a matrix.  (In R a data frame and matrix
  # are sometimes interchangeable, but sometimes we need it to be matrix.)
  as.matrix(counts)
}

ReadMapFile <- function(map_file, params) {
  # Read in the map file which is in the following format (two hash functions):
  #   str1, h11, h12, h21 + k, h22 + k, h31 + 2k, h32 + 2k ...
  #   str2, ...
  #
  # Args:
  #   map_file: path to the header-less CSV map file.
  #   params: parameter list as returned by ReadParameterFile; h, m and k are
  #     used here.
  #
  # Output:
  #   map: a sparse representation of set bits for each candidate string
  #     (m * k rows, one column per string, columns named by string).
  #   strs: a vector of all candidate strings.
  #   map_pos: the de-duplicated data frame read from the file.

  Log("Parsing %s", map_file)

  map_pos <- read.csv(map_file, header = FALSE, as.is = TRUE)
  strs <- map_pos[, 1]
  strs[strs == ""] <- "Empty"

  # Remove duplicated strings, keeping the first occurrence of each.
  ind <- which(!duplicated(strs))
  strs <- strs[ind]
  map_pos <- map_pos[ind, ]

  n <- ncol(map_pos) - 1
  if (n != (params$h * params$m)) {
    stop(paste0("Map file: number of columns should equal hm + 1:",
                n, "_", params$h * params$m))
  }

  # Flatten the bit positions column by column; col_pos holds, for each
  # flattened entry, the index of the string (row of map_pos) it came from.
  row_pos <- unlist(map_pos[, -1], use.names = FALSE)
  col_pos <- rep(seq_len(nrow(map_pos)), times = n)

  # TODO: When would this ever happen?
  removed <- which(is.na(row_pos))
  if (length(removed) > 0) {
    Log("Removed %d entries", length(removed))
    row_pos <- row_pos[-removed]
    col_pos <- col_pos[-removed]
  }

  map <- sparseMatrix(i = row_pos, j = col_pos,
                      dims = c(params$m * params$k, length(strs)))

  colnames(map) <- strs
  list(map = map, strs = strs, map_pos = map_pos)
}

LoadMapFile <- function(map_file, params) {
  # Reads the map file, caching an .rda (R binary data) version of it to speed
  # up future loads.
  #
  # Args:
  #   map_file: path to the CSV map file.
  #   params: parameter list, passed through to ReadMapFile.
  #
  # Returns:
  #   The list produced by ReadMapFile (elements map, strs, map_pos).
  #
  # Side effects:
  #   When a cached .rda file exists, its objects are loaded into the global
  #   environment.  Otherwise the cache is written next to map_file; a failure
  #   to write it is logged and does not abort the load.

  # Only a trailing ".csv" is replaced.  Any other file name gets ".rda"
  # appended, so the cache path can never be the map file itself.
  if (grepl("\\.csv$", map_file)) {
    rda_path <- sub("\\.csv$", ".rda", map_file)
  } else {
    rda_path <- paste0(map_file, ".rda")
  }
  # This must be unique per process, so concurrent processes don't try to
  # write the same file.
  tmp_path <- sprintf("%s.%d", rda_path, Sys.getpid())

  if (file.exists(rda_path)) {
    Log("Loading %s", rda_path)
    # Creates the 'map' variable in the global env.
    loaded <- load(rda_path, .GlobalEnv)
    if (!("map" %in% loaded)) {
      stop(sprintf("Cached file %s does not contain a 'map' object.",
                   rda_path))
    }
    map <- get("map", envir = .GlobalEnv)
  } else {
    map <- ReadMapFile(map_file, params)

    Log("Saving %s as an rda file for faster access", map_file)
    # First save to a temp file, and then atomically rename to the
    # destination.
    tryCatch({
      save(map, file = tmp_path)
      file.rename(tmp_path, rda_path)
    }, warning = function(w) {
      Log("WARNING: %s", conditionMessage(w))
    }, error = function(e) {
      Log("ERROR: %s", conditionMessage(e))
    }, finally = {
      # No-op after a successful rename; removes the leftover temp file when
      # saving or renaming failed.
      unlink(tmp_path)
    })
  }
  map
}