#!/usr/bin/python # # Copyright 2014 Google Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Run the RAPPOR Python client on simulated input. It takes a 3-column CSV file as generated by gen_reports.R, and outputs a 5 column CSV of RAPPOR'd data. Input columns: client,true_value Output coumns: client,cohort,bloom,prr,rappor TODO: - cohort should be in the input _input.csv file. See http://google.github.io/rappor/doc/data-flow.html for details. """ import csv import collections import optparse import os import random import sys import time import rappor # client library try: import fastrand except ImportError: print >>sys.stderr, ( "Native fastrand module not imported; see README for speedups") fastrand = None def log(msg, *args): if args: msg = msg % args print >>sys.stderr, msg def CreateOptionsParser(): p = optparse.OptionParser() p.add_option( '--num-bits', type='int', metavar='INT', dest='num_bits', default=16, help='Number of bloom filter bits.') p.add_option( '--num-hashes', type='int', metavar='INT', dest='num_hashes', default=2, help='Number of hashes.') p.add_option( '--num-cohorts', type='int', metavar='INT', dest='num_cohorts', default=64, help='Number of cohorts.') p.add_option( '-p', type='float', metavar='FLOAT', dest='prob_p', default=1, help='Probability p') p.add_option( '-q', type='float', metavar='FLOAT', dest='prob_q', default=1, help='Probability q') p.add_option( '-f', type='float', metavar='FLOAT', dest='prob_f', default=1, help='Probability f') p.add_option( '--assoc-testdata', type='int', dest='assoc_testdata', default=0, help='Generate association testdata from true values on stdin.') choices = ['simple', 'fast'] p.add_option( '-r', type='choice', metavar='STR', dest='random_mode', default='fast', choices=choices, help='Random algorithm (%s)' % '|'.join(choices)) return p def GenAssocTestdata(params1, params2, irr_rand, assoc_testdata_count, csv_in, csv_out): """Read true values from csv_in and output encoded values on csv_out. Replicate assoc_testdata_count times. First value is a string, second is a bool. TODO: Generalize this. """ rows = [] for i, (true_value1, true_value2) in enumerate(csv_in): if i == 0: v1_name = true_value1 v2_name = true_value2 continue # skip header row rows.append((true_value1, true_value2)) # Use the same column names header = ('client', 'cohort', v1_name, v2_name) csv_out.writerow(header) n = assoc_testdata_count report_index = 0 for i in xrange(n): for v1, v2 in rows: client_str = 'c%d' % report_index # randint(a, b) gives i such that a <= i <= b cohort = random.randint(0, params1.num_cohorts - 1) string_encoder = rappor.Encoder(params1, cohort, client_str, irr_rand) bool_encoder = rappor.Encoder(params2, cohort, client_str, irr_rand) # Real users should call e.encode(). For testing purposes, we also want # the PRR. irr1 = string_encoder.encode(v1) # TODO: Convert to bool and encode with basic RAPPOR v2_int = int(v2) #print v2_int irr2 = bool_encoder.encode_bits(v2_int) irr1_str = rappor.bit_string(irr1, params1.num_bloombits) irr2_str = rappor.bit_string(irr2, params2.num_bloombits) csv_out.writerow((client_str, cohort, irr1_str, irr2_str)) report_index += 1 def RapporClientSim(params, irr_rand, csv_in, csv_out): """Read true values from csv_in and output encoded values on csv_out.""" header = ('client', 'cohort', 'bloom', 'prr', 'irr') csv_out.writerow(header) # TODO: It would be more instructive/efficient to construct an encoder # instance up front per client, rather than one per row below. start_time = time.time() for i, (client_str, cohort_str, true_value) in enumerate(csv_in): if i == 0: if client_str != 'client': raise RuntimeError('Expected client header, got %s' % client_str) if cohort_str != 'cohort': raise RuntimeError('Expected cohort header, got %s' % cohort_str) if true_value != 'value': raise RuntimeError('Expected value header, got %s' % value) continue # skip header row #if i == 30: # EARLY STOP # break if i % 10000 == 0: elapsed = time.time() - start_time log('Processed %d inputs in %.2f seconds', i, elapsed) cohort = int(cohort_str) secret = client_str e = rappor.Encoder(params, cohort, secret, irr_rand) # Real users should call e.encode(). For testing purposes, we also want # the PRR. bloom, prr, irr = e._internal_encode(true_value) bloom_str = rappor.bit_string(bloom, params.num_bloombits) prr_str = rappor.bit_string(prr, params.num_bloombits) irr_str = rappor.bit_string(irr, params.num_bloombits) out_row = (client_str, cohort_str, bloom_str, prr_str, irr_str) csv_out.writerow(out_row) def main(argv): (opts, argv) = CreateOptionsParser().parse_args(argv) # Copy flags into params params = rappor.Params() params.num_bloombits = opts.num_bits params.num_hashes = opts.num_hashes params.num_cohorts = opts.num_cohorts params.prob_p = opts.prob_p params.prob_q = opts.prob_q params.prob_f = opts.prob_f if opts.random_mode == 'simple': irr_rand = rappor.SecureIrrRand(params) elif opts.random_mode == 'fast': if fastrand: log('Using fastrand extension') # NOTE: This doesn't take 'rand'. It's seeded in C with srand(). irr_rand = fastrand.FastIrrRand(params) else: log('Warning: fastrand module not importable; see README for build ' 'instructions. Falling back to simple randomness.') irr_rand = rappor.SecureIrrRand(params) else: raise AssertionError # Other possible implementations: # - random.SystemRandom (probably uses /dev/urandom on Linux) # - HMAC-SHA256 with another secret? This could match C++ byte for byte. # - or srand(0) might do it. csv_in = csv.reader(sys.stdin) csv_out = csv.writer(sys.stdout) if opts.assoc_testdata: # Copy flags into params params1 = rappor.Params() params1.num_bloombits = opts.num_bits params1.num_hashes = opts.num_hashes params1.num_cohorts = opts.num_cohorts params1.prob_p = opts.prob_p params1.prob_q = opts.prob_q params1.prob_f = opts.prob_f # Second one is boolean params2 = rappor.Params() params2.num_bloombits = 1 # 1 bit for boolean params2.num_hashes = opts.num_hashes params2.num_cohorts = opts.num_cohorts params2.prob_p = opts.prob_p params2.prob_q = opts.prob_q params2.prob_f = opts.prob_f GenAssocTestdata( params1, params2, irr_rand, opts.assoc_testdata, csv_in, csv_out) else: RapporClientSim(params, irr_rand, csv_in, csv_out) if __name__ == "__main__": try: main(sys.argv) except RuntimeError, e: log('rappor_sim.py: FATAL: %s', e)