1 package com.google.android.rappor; 2 3 // BEGIN android-changed: Removed guava dependency 4 // import static com.google.common.base.Preconditions.checkArgument; 5 // 6 // import com.google.common.base.Verify; 7 // END android-changed 8 9 import java.nio.ByteBuffer; 10 import java.nio.charset.StandardCharsets; 11 import java.security.MessageDigest; 12 import java.security.NoSuchAlgorithmException; 13 import java.security.SecureRandom; 14 import java.util.BitSet; 15 // BEGIN android-changed 16 import java.util.Random; 17 // END android-changed 18 19 // BEGIN android-changed: Remove javax 20 // import javax.annotation.Nullable; 21 // import javax.annotation.concurrent.GuardedBy; 22 // END android-changed 23 24 /** 25 * Encodes reports using the RAPPOR differentially-private encoding algorithm. 26 * 27 * The RAPPOR algorithm is described at go/rappor and presented in detail at go/rappor-writeup. 28 * 29 * @author bonawitz@google.com Keith Bonawitz 30 */ 31 // TODO(bonawitz): Make encoder and interface and make this a final class implementing it. 32 // We can't just make this final now because there exist tests that need to Mock it. 33 public class Encoder { 34 /** 35 * A non-decreasing version number. 36 * 37 * <p>The version number should increase any time the Encoder has a user-visible functional change 38 * to any of encoding algorithms or the interpretation of the input parameters. 39 */ 40 public static final long VERSION = 3; 41 42 /** 43 * Minimum length required for the user secret, in bytes. 44 */ 45 public static final int MIN_USER_SECRET_BYTES = HmacDrbg.ENTROPY_INPUT_SIZE_BYTES; 46 47 /** 48 * Maximum number of bits in the RAPPOR-encoded report. 49 * 50 * Must be less than HmacDrbg.MAX_BYTES_TOTAL; 51 */ 52 public static final int MAX_BITS = 4096; 53 54 /** 55 * Maximum number of Bloom filter hashes used for RAPPOR-encoded strings. 56 * 57 * <p>This is constrained in the current implementation by requiring 2 bytes from an MD5 value 58 * (which is 16 bytes long) for each Bloom filter hash. 59 */ 60 public static final int MAX_BLOOM_HASHES = 8; 61 62 /** 63 * Maximum number of cohorts supported. 64 */ 65 public static final int MAX_COHORTS = 128; 66 67 /** 68 * First (and only) byte of HMAC_DRBG personalization strings used to generate the cohort number. 69 */ 70 private static final byte HMAC_DRBG_TYPE_COHORT = 0x00; 71 72 /** 73 * First byte of HMAC_DRBG personalization strings used to generate the PRR response. 74 */ 75 private static final byte HMAC_DRBG_TYPE_PRR = 0x01; 76 77 /** 78 * A unique identifier for this Encoder, represented in UTF-8. 79 * 80 * <p>The (userSecret, encoderId) pair identify a the logical memoization table used for RAPPOR's 81 * Permanent Randomized Response stage. Therefore, for any userSecret, each Encoder must have a 82 * distinct identifier for Permanent Randomized Response to be effective. 83 * 84 * <p>In practice, "memoization" is achieved by generating deterministic pseudo-random bits using 85 * HMAC_DRBG. encoderIdBytes is used as part of the personalization string. 86 */ 87 private final byte[] encoderIdBytes; 88 89 /** 90 * The RAPPOR "f" probability, on the range [0.0, 1.0]. 91 * 92 * <p>This it the probability of replacing a bit from the input with a uniform random bit when 93 * generating the permanent randomized response. 94 * 95 * <p>Setting probabilityF to 0 disables the PRR phase of RAPPOR. 96 */ 97 private final double probabilityF; 98 99 /** 100 * The RAPPOR "p" probability, on the range [0.0, 1.0]. 101 * 102 * <p>This is the probability of emitting a '1' bit in the instantaneous randomized response, 103 * given that the corresponding bit in the permanent randomized response is '0'. 104 * 105 * <p>Setting probabilityP to 0.0 and probabilityQ to 1.0 disables the IRR phase of RAPPOR. 106 */ 107 private final double probabilityP; 108 109 /** 110 * The RAPPOR "1" probability, on the range [0.0, 1.0]. 111 * 112 * <p>This is the probability of emitting a 1 bit in the instantaneous randomized response, given 113 * that the corresponding bit in the permanent randomized response is 1. 114 * 115 * <p>Setting probabilityP to 0.0 and probabilityQ to 1.0 disables the IRR phase of RAPPOR. 116 */ 117 private final double probabilityQ; 118 119 /** 120 * The number of bits in the RAPPOR-encoded report. 121 * 122 * <p>Must satisfy 1 <= numBits <= MAX_BITS. 123 * 124 * <ul> 125 * <li>For encodeOrdinal, requires 0 <= ordinal < numBits. 126 * <li>For encodeString, uses a numBits-wide Bloom filter. 127 * <li>For encodeBits, only the low-order numBits may be non-zero. 128 * </ul> 129 */ 130 private final int numBits; 131 132 /** 133 * The number of hash functions used forming the Bloom filter encoding of a string. 134 * 135 * <p>Must satisfy 1 <= numBloomHashes <= MAX_BLOOM_HASHES. 136 */ 137 private final int numBloomHashes; 138 139 /** 140 * The number of cohorts used for cohort assignment. 141 */ 142 private final int numCohorts; 143 144 /** 145 * The cohort this user is assigned to for Bloom filter string encoding. 146 * 147 * <p>This value is stable for a given (userSecret, numCohorts) pair. Furthermore, if two 148 * encoders use the same userSecret but have different numCohorts values, the cohort assignment 149 * for the encoder with the smaller numCohorts value is a bitwise suffix of the cohort assignment 150 * for the encoder with the larger numCohorts value. It follows that, if you know maximum cohort 151 * assignment across a set of encoders, and you know the numCohorts setting for each encoder, then 152 * you can deduce the cohort assignment for each encoder by taking the bitwise-and of the max 153 * cohort value and (numCohorts-1), noting that numCohorts is required to be a positive power of 154 * 2. 155 */ 156 private final int cohort; 157 158 /** 159 * A bitmask with 1 bits in all the positions where a RAPPOR-encoded report could have a 1 bit. 160 * 161 * <p>The bitmask has the lowest order numBits set to 1 and the rest 0. 162 */ 163 private final BitSet inputMask; 164 165 /** 166 * SHA-256 utility class instance. 167 * 168 * <p>This object is stateful; access must be synchronized. The reset method must be 169 * called before each use. 170 */ 171 // BEGIN android-changed: Remove javax 172 // @GuardedBy("this") 173 // END android-changed 174 private final MessageDigest sha256; 175 176 /** 177 * MD5 utility class instance. 178 * 179 * <p>This object is stateful; access must be synchronized. The reset method must be 180 * called before each use. 181 */ 182 // BEGIN android-changed: Remove javax 183 // @GuardedBy("this") 184 // END android-changed 185 private final MessageDigest md5; 186 187 /** 188 * A SecureRandom instance, initialized with a cryptographically secure random seed. 189 */ 190 // BEGIN android-changed 191 // private final SecureRandom random; 192 private final Random random; 193 // BEGIN android-changed 194 195 /** 196 * Entropy input for constructing HmacDrbg objects. 197 */ 198 private final byte[] userSecret; 199 200 /** 201 * Constructs a new RAPPOR message encoder. 202 * 203 * @param userSecret Stable secret randomly selected for this user. UserSecret must be at least 204 * MIN_USER_SECRET_BYTES bytes of high-quality entropy. Changing the user secret clears the 205 * memoized cohort assignments and permanent randomized responses. Be aware that resetting 206 * these memoizations has significant privacy risks -- consult documentation at go/rappor for 207 * more details. 208 * @param encoderId Uniquely identifies this encoder. Used to differentiate momoized 209 * cohort assignments and permanent randomized responses. 210 * @param numBits The number of bits in the RAPPOR-encoded report. 211 * @param probabilityF The RAPPOR "f" probability, on the range [0.0, 1.0]. This will be 212 * quantized to the nearest 1/128. 213 * @param probabilityP The RAPPOR "p" probability, on the range [0.0, 1.0]. 214 * @param probabilityQ The RAPPOR "1" probability, on the range [0.0, 1.0]. 215 * @param numCohorts Number of cohorts into which the user pool is randomly segmented. 216 * @param numBloomHashes The number of hash functions used forming the Bloom filter encoding of a 217 * string. 218 */ Encoder(byte[] userSecret, String encoderId, int numBits, double probabilityF, double probabilityP, double probabilityQ, int numCohorts, int numBloomHashes)219 public Encoder(byte[] userSecret, String encoderId, int numBits, 220 double probabilityF, double probabilityP, double probabilityQ, 221 int numCohorts, int numBloomHashes) { 222 this( 223 null, // random 224 null, // md5, 225 null, // sha256, 226 userSecret, 227 encoderId, 228 numBits, 229 probabilityF, 230 probabilityP, 231 probabilityQ, 232 numCohorts, 233 numBloomHashes); 234 } 235 236 /** 237 * Constructs a new RAPPOR message encoder, using constructor-style dependency injection. 238 * 239 * @param random A cryptographically secure random number generator, or null (in which case a 240 * new SecureRandom will be constructed). 241 * @param md5 A configured MD5 hash algorithm, or null (in which case a new MessageDigest will be 242 * constructed). Note: MessageDigest objects are stateful, and that state must not be 243 * modified while calls to the Encoder are active. 244 * @param sha256 A configured SHA-256 hash algorithm, or null (in which case a new MessageDigest 245 * will be constructed). Note: MessageDigest objects are stateful, and that state must not 246 * be modified while calls to the Encoder are active. 247 * @param userSecret Stable secret randomly selected for this user. UserSecret must be at least 248 * 32 bytes of high-quality entropy. Changing the user secret clears the memoized cohort 249 * assignments and permanent randomized responses. Be aware that resetting these memoizations 250 * has significant privacy risks -- consult documentation at go/rappor for more details. 251 * @param encoderId Uniquely identifies this encoder. Used to differentiate momoized 252 * cohort assignments and permanent randomized responses. 253 * @param numBits The number of bits in the RAPPOR-encoded report. 254 * @param probabilityF The RAPPOR "f" probability, on the range [0.0, 1.0]. This will be 255 * quantized to the nearest 1/128. 256 * @param probabilityP The RAPPOR "p" probability, on the range [0.0, 1.0]. 257 * @param probabilityQ The RAPPOR "1" probability, on the range [0.0, 1.0]. 258 * @param numCohorts Number of cohorts into which the user pool is randomly segmented. 259 * @param numBloomHashes The number of hash functions used forming the Bloom filter encoding of a 260 * string. 261 */ Encoder( Random random, MessageDigest md5, MessageDigest sha256, byte[] userSecret, String encoderId, int numBits, double probabilityF, double probabilityP, double probabilityQ, int numCohorts, int numBloomHashes)262 public Encoder( 263 // BEGIN android-changed: Remove javax 264 // @Nullable SecureRandom random, 265 // @Nullable MessageDigest md5, 266 // @Nullable MessageDigest sha256, 267 // SecureRandom random, 268 Random random, 269 MessageDigest md5, 270 MessageDigest sha256, 271 // END android-changed 272 byte[] userSecret, 273 String encoderId, 274 int numBits, 275 double probabilityF, 276 double probabilityP, 277 double probabilityQ, 278 int numCohorts, 279 int numBloomHashes) { 280 if (md5 != null) { 281 this.md5 = md5; 282 } else { 283 try { 284 this.md5 = MessageDigest.getInstance("MD5"); 285 } catch (NoSuchAlgorithmException impossible) { 286 // This should never happen. Every implementation of the Java platform 287 // is required to support MD5. 288 throw new AssertionError(impossible); 289 } 290 } 291 this.md5.reset(); 292 293 if (sha256 != null) { 294 this.sha256 = sha256; 295 } else { 296 try { 297 this.sha256 = MessageDigest.getInstance("SHA-256"); 298 } catch (NoSuchAlgorithmException impossible) { 299 // This should never happen. Every implementation of the Java platform 300 // is required to support 256. 301 throw new AssertionError(impossible); 302 } 303 } 304 this.sha256.reset(); 305 306 this.encoderIdBytes = encoderId.getBytes(StandardCharsets.UTF_8); 307 308 if (random != null) { 309 this.random = random; 310 } else { 311 this.random = new SecureRandom(); 312 } 313 314 // android-changed: Removed guava dependency 315 // checkArgument( 316 // userSecret.length >= MIN_USER_SECRET_BYTES, 317 // "userSecret must be at least %s bytes of high-quality entropy.", 318 // MIN_USER_SECRET_BYTES); 319 checkArgument( 320 userSecret.length >= MIN_USER_SECRET_BYTES, 321 "userSecret must be at least " + MIN_USER_SECRET_BYTES 322 + " bytes of high-quality entropy."); 323 this.userSecret = userSecret; 324 325 checkArgument( 326 probabilityF >= 0 && probabilityF <= 1, "probabilityF must be on range [0.0, 1.0]"); 327 this.probabilityF = Math.round(probabilityF * 128) / 128.0; 328 329 checkArgument( 330 probabilityP >= 0 && probabilityP <= 1, "probabilityP must be on range [0.0, 1.0]"); 331 this.probabilityP = probabilityP; 332 333 checkArgument( 334 probabilityQ >= 0 && probabilityQ <= 1, "probabilityQ must be on range [0.0, 1.0]"); 335 this.probabilityQ = probabilityQ; 336 337 checkArgument( 338 numBits >= 1 && numBits <= MAX_BITS, "numBits must be on range [1, " + MAX_BITS + "]."); 339 this.numBits = numBits; 340 // Make a bitmask with the lowest-order numBits set to 1. 341 this.inputMask = new BitSet(numBits); 342 this.inputMask.set(0, numBits, true); 343 344 checkArgument( 345 numBloomHashes >= 1 && numBloomHashes <= numBits, 346 "numBloomHashes must be on range [1, numBits)."); 347 this.numBloomHashes = numBloomHashes; 348 349 checkArgument( 350 numCohorts >= 1 && numCohorts <= MAX_COHORTS, 351 "numCohorts must be on range [1, " + MAX_COHORTS + "]."); 352 353 // Assuming numCohorts >= 1: 354 // 355 // If numCohorts is a power of 2, then numCohorts has one bit set and numCohorts - 1 has only 356 // the bits to the right of numCohorts's bit set. The bitwise-and of these is 0. 357 // 358 // If numCohorts is not a power of 2, then numCohorts has at least two bits set. It follows 359 // subtracting one from numCohorts keeps the most significant bit set to 1, and thus the 360 // bitwise-and has at least this non-zero bit. 361 boolean numCohortsIsPowerOfTwo = (numCohorts & (numCohorts - 1)) == 0; 362 checkArgument(numCohortsIsPowerOfTwo, "numCohorts must be a power of 2."); 363 this.numCohorts = numCohorts; 364 365 // cohortMasterAssignment depends only on the userSecret. 366 HmacDrbg cohortDrbg = new HmacDrbg(userSecret, new byte[] {HMAC_DRBG_TYPE_COHORT}); 367 ByteBuffer cohortDrbgBytes = ByteBuffer.wrap(cohortDrbg.nextBytes(4)); 368 int cohortMasterAssignment = Math.abs(cohortDrbgBytes.getInt()) % MAX_COHORTS; 369 this.cohort = cohortMasterAssignment & (numCohorts - 1); 370 } 371 getProbabilityF()372 public double getProbabilityF() { 373 return probabilityF; 374 } 375 getProbabilityP()376 public double getProbabilityP() { 377 return probabilityP; 378 } 379 getProbabilityQ()380 public double getProbabilityQ() { 381 return probabilityQ; 382 } 383 getNumBits()384 public int getNumBits() { 385 return numBits; 386 } 387 getNumBloomHashes()388 public int getNumBloomHashes() { 389 return numBloomHashes; 390 } 391 getNumCohorts()392 public int getNumCohorts() { 393 return numCohorts; 394 } 395 getCohort()396 public int getCohort() { 397 return cohort; 398 } 399 400 /** 401 * Returns the Encoder ID as a UTF-8 formatted string. 402 */ getEncoderId()403 public String getEncoderId() { 404 return new String(encoderIdBytes, StandardCharsets.UTF_8); 405 } 406 407 /** 408 * Encodes a boolean into a RAPPOR report. 409 * 410 * <p>The boolean is 0 or 1, then encoded using permanent and instantaneous randomized response. 411 * 412 * <p>In most cases, numBits should be 1 when using this method. 413 */ encodeBoolean(boolean bool)414 public byte[] encodeBoolean(boolean bool) { 415 BitSet input = new BitSet(numBits); 416 input.set(0, bool); 417 return encodeBits(input); 418 } 419 420 /** 421 * Encodes an ordinal into a RAPPOR report. 422 * 423 * <p>The ordinal is represented using a 1-hot binary representation, then encoded using permanent 424 * and instantaneous randomized response. 425 * 426 * @param ordinal A value on the range [0, numBits). 427 */ encodeOrdinal(int ordinal)428 public byte[] encodeOrdinal(int ordinal) { 429 checkArgument( 430 ordinal >= 0 && ordinal < numBits, "Ordinal value must be in range [0, numBits)."); 431 BitSet input = new BitSet(numBits); 432 input.set(ordinal, true); 433 return encodeBits(input); 434 } 435 436 /** 437 * Encodes a string into a RAPPOR report. 438 * 439 * <p>The string is represented using a Bloom filter with numBloomHashes hash functions, then 440 * encoded using permanent and instantaneous randomized response. 441 * 442 * @param string An arbitrary string. 443 */ encodeString(String string)444 public byte[] encodeString(String string) { 445 // Implements a Bloom filter by slicing a single MD5 hash into numBloomHashes bit indices. 446 byte[] stringInUtf8 = string.getBytes(StandardCharsets.UTF_8); 447 byte[] message = 448 ByteBuffer.allocate(4 + stringInUtf8.length) 449 .putInt(cohort) 450 .put(stringInUtf8) 451 .array(); 452 453 byte[] digest; 454 synchronized (this) { 455 md5.reset(); 456 digest = md5.digest(message); 457 } 458 // android-changed: Removed guava dependency 459 // Verify.verify(digest.length == 16); 460 // Verify.verify(numBloomHashes <= digest.length / 2); 461 verify(digest.length == 16); 462 verify(numBloomHashes <= digest.length / 2); 463 464 BitSet input = new BitSet(numBits); 465 for (int i = 0; i < numBloomHashes; i++) { 466 // Convert byte pairs to ints on [0, 65535]. 467 // Anding with 0xFF converts signed byte to unsigned int. 468 int digestWord = (digest[i * 2] & 0xFF) * 256 + (digest[i * 2 + 1] & 0xFF); 469 int chosenBit = digestWord % numBits; 470 input.set(chosenBit, true); 471 } 472 473 return encodeBits(input); 474 } 475 476 /** 477 * Encodes an arbitrary bitstring into a RAPPOR report. 478 * 479 * @param bits A bitstring in which only the least significant numBits bits may be 1. 480 */ encodeBits(byte[] bits)481 public byte[] encodeBits(byte[] bits) { 482 return encodeBits(BitSet.valueOf(bits)); 483 } 484 485 /** 486 * Encodes an arbitrary bitstring into a RAPPOR report. 487 * 488 * @param bits A bitstring in which only the least significant numBits bits may be 1. 489 */ encodeBits(BitSet bits)490 private byte[] encodeBits(BitSet bits) { 491 BitSet permanentRandomizedResponse = computePermanentRandomizedResponse(bits); 492 BitSet encodedBitSet = computeInstantaneousRandomizedResponse(permanentRandomizedResponse); 493 494 // BitSet.toByteArray only returns enough bytes to capture the most significant bit 495 // that is set. For example, a BitSet with no bits set could return a length-0 array. 496 // In contrast, we guarantee that our output is sized according to numBits. 497 byte[] encodedBytes = encodedBitSet.toByteArray(); 498 byte[] output = new byte[(numBits + 7) / 8]; 499 // android-changed: Removed guava dependency 500 // Verify.verify(encodedBytes.length <= output.length); 501 verify(encodedBytes.length <= output.length); 502 System.arraycopy( 503 encodedBytes, // src 504 0, // srcPos 505 output, // dest 506 0, // destPos 507 encodedBytes.length); // length 508 return output; 509 } 510 511 /** 512 * Returns the permanent randomized response for the given bits. 513 * 514 * <p>The response for a particular bits input is guaranteed to always be the same for any encoder 515 * constructed with the same parameters (including the encoderId and the userSecret). 516 */ computePermanentRandomizedResponse(BitSet bits)517 private BitSet computePermanentRandomizedResponse(BitSet bits) { 518 // Ensures that the input only has bits set in the lowest 519 BitSet masked = new BitSet(); 520 masked.or(bits); 521 masked.andNot(inputMask); 522 checkArgument(masked.isEmpty(), "Input bits had bits set past Encoder's numBits limit."); 523 524 if (probabilityF == 0.0) { 525 return bits; 526 } 527 528 // Builds a personalization string that contains both the encoderId and input value (bits), 529 // and is no longer than HmacDrbg.MAX_PERSONALIZATION_STRING_LENGTH_BYTES. The first byte 530 // of the personalization string is always HMAC_DRBG_TYPE_PRR, to avoid collisions with the 531 // cohort-generation personalization string. 532 byte[] personalizationString; 533 synchronized (this) { 534 int personalizationStringLength = 535 Math.min(HmacDrbg.MAX_PERSONALIZATION_STRING_LENGTH_BYTES, 1 + sha256.getDigestLength()); 536 personalizationString = new byte[personalizationStringLength]; 537 personalizationString[0] = HMAC_DRBG_TYPE_PRR; 538 sha256.reset(); 539 sha256.update(encoderIdBytes); 540 sha256.update(new byte[] {0}); 541 sha256.update(bits.toByteArray()); 542 byte[] digest = sha256.digest(personalizationString); 543 System.arraycopy(digest, 0, personalizationString, 1, personalizationString.length - 1); 544 } 545 546 HmacDrbg drbg = new HmacDrbg(userSecret, personalizationString); 547 byte[] pseudorandomStream = drbg.nextBytes(numBits); 548 // android-changed: Removed guava dependency 549 // Verify.verify(numBits <= pseudorandomStream.length); 550 verify(numBits <= pseudorandomStream.length); 551 552 int probabilityFTimes128 = (int) Math.round(probabilityF * 128); 553 BitSet result = new BitSet(numBits); 554 for (int i = 0; i < numBits; i++) { 555 // Grabs a single byte from the pseudorandom stream. 556 // Anding with 0xFF converts a signed byte to an unsigned integer. 557 int pseudorandomByte = pseudorandomStream[i] & 0xFF; 558 559 // Uses bits 1-7 to get a random number between 0 and 127. 560 int uniform0to127 = pseudorandomByte >> 1; 561 boolean shouldUseNoise = uniform0to127 < probabilityFTimes128; 562 563 if (shouldUseNoise) { 564 // Uses bit 0 as a flip of a fair coin. 565 result.set(i, (pseudorandomByte & 0x01) > 0); 566 } else { 567 result.set(i, bits.get(i)); 568 } 569 } 570 return result; 571 } 572 573 /** 574 * Returns the instantaneous randomized response for the given bits. 575 * 576 * <p>The instantaneous response is NOT memoized -- it is sampled randomly on 577 * every invocation. 578 */ computeInstantaneousRandomizedResponse(BitSet bits)579 private BitSet computeInstantaneousRandomizedResponse(BitSet bits) { 580 // Ensures that the input only has bits set in the lowest 581 BitSet masked = new BitSet(); 582 masked.or(bits); 583 masked.andNot(inputMask); 584 checkArgument(masked.isEmpty(), "Input bits had bits set past Encoder's numBits limit."); 585 586 if (probabilityP == 0.0 && probabilityQ == 1.0) { 587 return bits; 588 } 589 590 BitSet response = new BitSet(numBits); 591 for (int i = 0; i < numBits; i++) { 592 boolean bit = bits.get(i); 593 double probability = bit ? probabilityQ : probabilityP; 594 boolean responseBit = random.nextFloat() < probability; 595 response.set(i, responseBit); 596 } 597 return response; 598 } 599 600 // BEGIN android-changed: Added guava methods 601 private static void checkArgument(boolean expression, Object errorMessage) { 602 if (!expression) { 603 throw new IllegalArgumentException(String.valueOf(errorMessage)); 604 } 605 } 606 607 private static void verify(boolean expression) { 608 if (!expression) { 609 throw new java.lang.IllegalStateException(); 610 } 611 } 612 // END android-changed 613 } 614