1 /* 2 * Copyright (C) 2020 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.internal.util; 18 19 import android.annotation.NonNull; 20 import android.annotation.Nullable; 21 import android.util.SparseArray; 22 import android.util.SparseIntArray; 23 24 import java.util.ArrayList; 25 import java.util.Collections; 26 import java.util.List; 27 28 /** 29 * <p>A utility which processes a sequence of input (stream) and output the heavy hitters 30 * (the frequent ones).</p> 31 * 32 * @param <T> The type of the input. 33 * @see <a href="https://en.wikipedia.org/wiki/Streaming_algorithm">Stream Algorithm</a> for 34 * the definion of heavy hitters and the list of algorithms for detecting it. 35 * <p> 36 * {@hide} 37 */ 38 public interface HeavyHitterSketch<T> { 39 /** 40 * Return the default implementation. 41 * 42 * @return The default implementation. 43 */ newDefault()44 static <V> @NonNull HeavyHitterSketch<V> newDefault() { 45 return new HeavyHitterSketchImpl<V>(); 46 } 47 48 /** 49 * Set the configuration with given parameters 50 * 51 * @param inputSize The amount of the input. 52 * @param capacity The maximum number of distinct input it should track; it defines the lower 53 * bound of the output. 54 */ setConfig(int inputSize, int capacity)55 void setConfig(int inputSize, int capacity); 56 57 /** 58 * Add a new input to the current sketch. 59 * 60 * @param newInstance The new input 61 */ add(@ullable T newInstance)62 void add(@Nullable T newInstance); 63 64 /** 65 * @param k The number of heavy hitters it should return, k < capacity, a value of 0 66 * will be equivalent to capacity - 1 67 * @param holder The list array into which the elements of the tops are to be stored; a new list 68 * would be created and returned if this parameter is null. 69 * @param freqs Optional, the frequencies of each items in the returned result 70 * @return The top K heavy hitters(frequent input) 71 */ 72 @Nullable getTopHeavyHitters(int k, @Nullable List<T> holder, @Nullable List<Float> freqs)73 List<T> getTopHeavyHitters(int k, @Nullable List<T> holder, @Nullable List<Float> freqs); 74 75 /** 76 * @param holder The list array into which the elements of the candidates are to be stored; a 77 * new list would be created and returned if this parameter is null. 78 * @return The candidate heavy hitters so far, it could include false postives. 79 */ 80 @Nullable getCandidates(@ullable List<T> holder)81 List<T> getCandidates(@Nullable List<T> holder); 82 83 /** 84 * Reset this heavy hitter counter 85 */ reset()86 void reset(); 87 88 /** 89 * @return The ratio of the input to be used as the validation data, Float.NaN means no 90 * validation is needed. 91 */ getRequiredValidationInputRatio()92 float getRequiredValidationInputRatio(); 93 94 /** 95 * The default implementation of the {@link HeavyHitterSketch}. 96 * 97 * <p>Currently it consists of two passes: the first pass will take the input into 98 * the MG(Misra–Gries) summary; while the secondary pass will validate the output against the 99 * input in order to eliminate false postivies.</p> 100 * 101 * <p>For sure there are better approaches which takes only one pass, but also comes along with 102 * overheads in terms of cpu/memory cost; the MG summary would be a trivial and good enough 103 * pick.</p> 104 * 105 * @param <T> The type of the input. 106 * @see <a href="https://en.wikipedia.org/wiki/Misra%E2%80%93Gries_summary">Misra–Gries 107 * summary</a> for the detailed explanation of the algorithm. 108 */ 109 final class HeavyHitterSketchImpl<T> implements HeavyHitterSketch<T> { 110 /** 111 * The array to track the current heavy hitters, its size < {@link #mCapacity}. 112 */ 113 private final SparseArray<T> mObjects = new SparseArray<>(); 114 115 /** 116 * The frequencies of the current heavy hitters, its size < {@link #mCapacity}. 117 */ 118 private final SparseIntArray mFrequencies = new SparseIntArray(); 119 120 /** 121 * The amount of the input of each pass 122 */ 123 private int mPassSize; 124 125 /** 126 * The amount of the total input it expects 127 */ 128 private int mTotalSize; 129 130 /** 131 * The maximum number of distinct input it should track 132 */ 133 private int mCapacity; 134 135 /** 136 * The amount of inputs it already received. 137 */ 138 private int mNumInputs; 139 140 /** 141 * Whether or not it's configured properly. 142 */ 143 private boolean mConfigured; 144 145 /** 146 * Set the configuration with given parameters 147 * 148 * @param inputSize The amount of the input. 149 * @param capacity The maximum number of distinct input it should track; it defines the 150 * lower bound of the output. 151 */ setConfig(final int inputSize, final int capacity)152 public void setConfig(final int inputSize, final int capacity) { 153 if (inputSize < capacity || inputSize <= 1) { 154 mConfigured = false; 155 throw new IllegalArgumentException(); 156 } 157 reset(); 158 mTotalSize = inputSize; 159 mPassSize = inputSize >> 1; 160 mCapacity = capacity; 161 mConfigured = true; 162 } 163 164 /** 165 * Add a new input to the current sketch. 166 * 167 * @param newInstance The new input 168 */ 169 @Override add(@ullable final T newInstance)170 public void add(@Nullable final T newInstance) { 171 if (!mConfigured) { 172 throw new IllegalStateException(); 173 } 174 if (mNumInputs < mPassSize) { 175 addToMGSummary(newInstance); 176 } else if (mNumInputs < mTotalSize) { 177 // Validation pass 178 validate(newInstance); 179 } 180 } 181 182 /** 183 * Add an input to the MG summary. 184 * 185 * <p>Note the frequency in the result set is an estimation. Every (obj, freq') pair 186 * in the result set, will have the following property: 187 * <code>(freq - inputSize / capacity) ≤ freq' ≤ freq</code> 188 * The above freq' is the estimated frequency, while the freq is the actual frequency. 189 * </p> 190 */ addToMGSummary(@ullable final T newInstance)191 private void addToMGSummary(@Nullable final T newInstance) { 192 final int hashCode = newInstance != null ? newInstance.hashCode() : 0; 193 final int index = mObjects.indexOfKey(hashCode); 194 // MG summary 195 if (index >= 0) { 196 mFrequencies.setValueAt(index, mFrequencies.valueAt(index) + 1); 197 } else if (mObjects.size() < mCapacity - 1) { 198 mObjects.put(hashCode, newInstance); 199 mFrequencies.put(hashCode, 1); 200 } else { 201 for (int i = mFrequencies.size() - 1; i >= 0; i--) { 202 final int val = mFrequencies.valueAt(i) - 1; 203 if (val == 0) { 204 mObjects.removeAt(i); 205 mFrequencies.removeAt(i); 206 } else { 207 mFrequencies.setValueAt(i, val); 208 } 209 } 210 } 211 if (++mNumInputs == mPassSize) { 212 // Clear all the frequencies as we are going to validate them next 213 for (int i = mFrequencies.size() - 1; i >= 0; i--) { 214 mFrequencies.setValueAt(i, 0); 215 } 216 } 217 } 218 219 /** 220 * Validate the results from MG summary; the ones with frequencies less than lower boundary 221 * will be removed from the set. 222 */ validate(@ullable final T newInstance)223 private void validate(@Nullable final T newInstance) { 224 final int hashCode = newInstance != null ? newInstance.hashCode() : 0; 225 final int index = mObjects.indexOfKey(hashCode); 226 if (index >= 0) { 227 mFrequencies.setValueAt(index, mFrequencies.valueAt(index) + 1); 228 } 229 if (++mNumInputs == mTotalSize) { 230 final int lower = mPassSize / mCapacity; 231 // Remove any occurrences with frequencies less than lower boundary 232 for (int i = mFrequencies.size() - 1; i >= 0; i--) { 233 final int val = mFrequencies.valueAt(i); 234 if (val < lower) { 235 mFrequencies.removeAt(i); 236 mObjects.removeAt(i); 237 } 238 } 239 } 240 } 241 242 /** 243 * @param k The number of heavy hitters it should return, k < capacity, a value of 0 244 * will be equivalent to capacity - 1 245 * @param holder The list array into which the elements of the tops are to be stored; a new 246 * list would be created and returned if this parameter is null. 247 * @param freqs Optional, the frequencies of each items in the returned result 248 * @return The top K heavy hitters(frequent input) 249 */ 250 @Override 251 @Nullable getTopHeavyHitters(final int k, final @Nullable List<T> holder, final @Nullable List<Float> freqs)252 public List<T> getTopHeavyHitters(final int k, final @Nullable List<T> holder, 253 final @Nullable List<Float> freqs) { 254 if (!mConfigured) { 255 throw new IllegalStateException(); 256 } 257 258 if (k >= mCapacity) { 259 throw new IllegalArgumentException(); 260 } 261 262 if (mNumInputs < mTotalSize) { 263 // It hasn't had all the inputs yet. 264 throw new IllegalStateException(); 265 } 266 267 ArrayList<Integer> indexes = null; 268 for (int i = mFrequencies.size() - 1; i >= 0; i--) { 269 final int val = mFrequencies.valueAt(i); 270 if (val > 0) { 271 if (indexes == null) { 272 indexes = new ArrayList<>(); 273 } 274 indexes.add(i); 275 } 276 } 277 if (indexes == null) { 278 return null; 279 } 280 281 Collections.sort(indexes, (a, b) -> mFrequencies.valueAt(b) - mFrequencies.valueAt(a)); 282 283 final List<T> result = holder != null ? holder : new ArrayList<T>(); 284 final int max = Math.min(k == 0 ? (mCapacity - 1) : k, indexes.size()); 285 for (int i = 0; i < max; i++) { 286 final int index = indexes.get(i); 287 final T obj = mObjects.valueAt(index); 288 if (obj != null) { 289 result.add(obj); 290 if (freqs != null) { 291 freqs.add((float) mFrequencies.valueAt(index) / mPassSize); 292 } 293 } 294 } 295 return result; 296 } 297 298 /** 299 * @param holder The list array into which the elements of the candidates are to be stored; 300 * a new list would be created and returned if this parameter is null. 301 * @return The candidate heavy hitters so far, it could include false postives. 302 */ 303 @Nullable getCandidates(final @Nullable List<T> holder)304 public List<T> getCandidates(final @Nullable List<T> holder) { 305 if (!mConfigured) { 306 throw new IllegalStateException(); 307 } 308 if (mNumInputs < mPassSize) { 309 // It hasn't done with the first pass yet, return nothing 310 return null; 311 } 312 313 List<T> result = holder != null ? holder : new ArrayList<T>(); 314 for (int i = mObjects.size() - 1; i >= 0; i--) { 315 final T obj = mObjects.valueAt(i); 316 if (obj != null) { 317 result.add(obj); 318 } 319 } 320 return result; 321 } 322 323 /** 324 * Reset this heavy hitter counter 325 */ 326 @Override reset()327 public void reset() { 328 mNumInputs = 0; 329 mObjects.clear(); 330 mFrequencies.clear(); 331 } 332 333 /** 334 * @return The ratio of the input to be used as the validation data, Float.NaN means no 335 * validation is needed. 336 */ getRequiredValidationInputRatio()337 public float getRequiredValidationInputRatio() { 338 return 0.5f; 339 } 340 } 341 } 342