• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2020 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.android.internal.util;
18 
19 import android.annotation.NonNull;
20 import android.annotation.Nullable;
21 import android.util.SparseArray;
22 import android.util.SparseIntArray;
23 
24 import java.util.ArrayList;
25 import java.util.Collections;
26 import java.util.List;
27 
28 /**
29  * <p>A utility which processes a sequence of input (stream) and output the heavy hitters
30  * (the frequent ones).</p>
31  *
32  * @param <T> The type of the input.
33  * @see <a href="https://en.wikipedia.org/wiki/Streaming_algorithm">Stream Algorithm</a> for
34  * the definion of heavy hitters and the list of algorithms for detecting it.
35  * <p>
36  * {@hide}
37  */
38 public interface HeavyHitterSketch<T> {
39     /**
40      * Return the default implementation.
41      *
42      * @return The default implementation.
43      */
newDefault()44     static <V> @NonNull HeavyHitterSketch<V> newDefault() {
45         return new HeavyHitterSketchImpl<V>();
46     }
47 
48     /**
49      * Set the configuration with given parameters
50      *
51      * @param inputSize The amount of the input.
52      * @param capacity  The maximum number of distinct input it should track; it defines the lower
53      *                  bound of the output.
54      */
setConfig(int inputSize, int capacity)55     void setConfig(int inputSize, int capacity);
56 
57     /**
58      * Add a new input to the current sketch.
59      *
60      * @param newInstance The new input
61      */
add(@ullable T newInstance)62     void add(@Nullable T newInstance);
63 
64     /**
65      * @param k      The number of heavy hitters it should return, k &lt; capacity, a value of 0
66      *               will be equivalent to capacity - 1
67      * @param holder The list array into which the elements of the tops are to be stored; a new list
68      *               would be created and returned if this parameter is null.
69      * @param freqs  Optional, the frequencies of each items in the returned result
70      * @return The top K heavy hitters(frequent input)
71      */
72     @Nullable
getTopHeavyHitters(int k, @Nullable List<T> holder, @Nullable List<Float> freqs)73     List<T> getTopHeavyHitters(int k, @Nullable List<T> holder, @Nullable List<Float> freqs);
74 
75     /**
76      * @param holder The list array into which the elements of the candidates are to be stored; a
77      *               new list would be created and returned if this parameter is null.
78      * @return The candidate heavy hitters so far, it could include false postives.
79      */
80     @Nullable
getCandidates(@ullable List<T> holder)81     List<T> getCandidates(@Nullable List<T> holder);
82 
83     /**
84      * Reset this heavy hitter counter
85      */
reset()86     void reset();
87 
88     /**
89      * @return The ratio of the input to be used as the validation data, Float.NaN means no
90      * validation is needed.
91      */
getRequiredValidationInputRatio()92     float getRequiredValidationInputRatio();
93 
94     /**
95      * The default implementation of the {@link HeavyHitterSketch}.
96      *
97      * <p>Currently it consists of two passes: the first pass will take the input into
98      * the MG(Misra–Gries) summary; while the secondary pass will validate the output against the
99      * input in order to eliminate false postivies.</p>
100      *
101      * <p>For sure there are better approaches which takes only one pass, but also comes along with
102      * overheads in terms of cpu/memory cost; the MG summary would be a trivial and good enough
103      * pick.</p>
104      *
105      * @param <T> The type of the input.
106      * @see <a href="https://en.wikipedia.org/wiki/Misra%E2%80%93Gries_summary">Misra–Gries
107      * summary</a> for the detailed explanation of the algorithm.
108      */
109     final class HeavyHitterSketchImpl<T> implements HeavyHitterSketch<T> {
110         /**
111          * The array to track the current heavy hitters, its size &lt; {@link #mCapacity}.
112          */
113         private final SparseArray<T> mObjects = new SparseArray<>();
114 
115         /**
116          * The frequencies of the current heavy hitters, its size &lt; {@link #mCapacity}.
117          */
118         private final SparseIntArray mFrequencies = new SparseIntArray();
119 
120         /**
121          * The amount of the input of each pass
122          */
123         private int mPassSize;
124 
125         /**
126          * The amount of the total input it expects
127          */
128         private int mTotalSize;
129 
130         /**
131          * The maximum number of distinct input it should track
132          */
133         private int mCapacity;
134 
135         /**
136          * The amount of inputs it already received.
137          */
138         private int mNumInputs;
139 
140         /**
141          * Whether or not it's configured properly.
142          */
143         private boolean mConfigured;
144 
145         /**
146          * Set the configuration with given parameters
147          *
148          * @param inputSize The amount of the input.
149          * @param capacity  The maximum number of distinct input it should track; it defines the
150          *                  lower bound of the output.
151          */
setConfig(final int inputSize, final int capacity)152         public void setConfig(final int inputSize, final int capacity) {
153             if (inputSize < capacity || inputSize <= 1) {
154                 mConfigured = false;
155                 throw new IllegalArgumentException();
156             }
157             reset();
158             mTotalSize = inputSize;
159             mPassSize = inputSize >> 1;
160             mCapacity = capacity;
161             mConfigured = true;
162         }
163 
164         /**
165          * Add a new input to the current sketch.
166          *
167          * @param newInstance The new input
168          */
169         @Override
add(@ullable final T newInstance)170         public void add(@Nullable final T newInstance) {
171             if (!mConfigured) {
172                 throw new IllegalStateException();
173             }
174             if (mNumInputs < mPassSize) {
175                 addToMGSummary(newInstance);
176             } else if (mNumInputs < mTotalSize) {
177                 // Validation pass
178                 validate(newInstance);
179             }
180         }
181 
182         /**
183          * Add an input to the MG summary.
184          *
185          * <p>Note the frequency in the result set is an estimation. Every (obj, freq') pair
186          * in the result set, will have the following property:
187          * <code>(freq - inputSize / capacity) &le; freq' &le; freq</code>
188          * The above freq' is the estimated frequency, while the freq is the actual frequency.
189          * </p>
190          */
addToMGSummary(@ullable final T newInstance)191         private void addToMGSummary(@Nullable final T newInstance) {
192             final int hashCode = newInstance != null ? newInstance.hashCode() : 0;
193             final int index = mObjects.indexOfKey(hashCode);
194             // MG summary
195             if (index >= 0) {
196                 mFrequencies.setValueAt(index, mFrequencies.valueAt(index) + 1);
197             } else if (mObjects.size() < mCapacity - 1) {
198                 mObjects.put(hashCode, newInstance);
199                 mFrequencies.put(hashCode, 1);
200             } else {
201                 for (int i = mFrequencies.size() - 1; i >= 0; i--) {
202                     final int val = mFrequencies.valueAt(i) - 1;
203                     if (val == 0) {
204                         mObjects.removeAt(i);
205                         mFrequencies.removeAt(i);
206                     } else {
207                         mFrequencies.setValueAt(i, val);
208                     }
209                 }
210             }
211             if (++mNumInputs == mPassSize) {
212                 // Clear all the frequencies as we are going to validate them next
213                 for (int i = mFrequencies.size() - 1; i >= 0; i--) {
214                     mFrequencies.setValueAt(i, 0);
215                 }
216             }
217         }
218 
219         /**
220          * Validate the results from MG summary; the ones with frequencies less than lower boundary
221          * will be removed from the set.
222          */
validate(@ullable final T newInstance)223         private void validate(@Nullable final T newInstance) {
224             final int hashCode = newInstance != null ? newInstance.hashCode() : 0;
225             final int index = mObjects.indexOfKey(hashCode);
226             if (index >= 0) {
227                 mFrequencies.setValueAt(index, mFrequencies.valueAt(index) + 1);
228             }
229             if (++mNumInputs == mTotalSize) {
230                 final int lower = mPassSize / mCapacity;
231                 // Remove any occurrences with frequencies less than lower boundary
232                 for (int i = mFrequencies.size() - 1; i >= 0; i--) {
233                     final int val = mFrequencies.valueAt(i);
234                     if (val < lower) {
235                         mFrequencies.removeAt(i);
236                         mObjects.removeAt(i);
237                     }
238                 }
239             }
240         }
241 
242         /**
243          * @param k      The number of heavy hitters it should return, k &lt; capacity, a value of 0
244          *               will be equivalent to capacity - 1
245          * @param holder The list array into which the elements of the tops are to be stored; a new
246          *               list would be created and returned if this parameter is null.
247          * @param freqs  Optional, the frequencies of each items in the returned result
248          * @return The top K heavy hitters(frequent input)
249          */
250         @Override
251         @Nullable
getTopHeavyHitters(final int k, final @Nullable List<T> holder, final @Nullable List<Float> freqs)252         public List<T> getTopHeavyHitters(final int k, final @Nullable List<T> holder,
253                 final @Nullable List<Float> freqs) {
254             if (!mConfigured) {
255                 throw new IllegalStateException();
256             }
257 
258             if (k >= mCapacity) {
259                 throw new IllegalArgumentException();
260             }
261 
262             if (mNumInputs < mTotalSize) {
263                 // It hasn't had all the inputs yet.
264                 throw new IllegalStateException();
265             }
266 
267             ArrayList<Integer> indexes = null;
268             for (int i = mFrequencies.size() - 1; i >= 0; i--) {
269                 final int val = mFrequencies.valueAt(i);
270                 if (val > 0) {
271                     if (indexes == null) {
272                         indexes = new ArrayList<>();
273                     }
274                     indexes.add(i);
275                 }
276             }
277             if (indexes == null) {
278                 return null;
279             }
280 
281             Collections.sort(indexes, (a, b) -> mFrequencies.valueAt(b) - mFrequencies.valueAt(a));
282 
283             final List<T> result = holder != null ? holder : new ArrayList<T>();
284             final int max = Math.min(k == 0 ? (mCapacity - 1) : k, indexes.size());
285             for (int i = 0; i < max; i++) {
286                 final int index = indexes.get(i);
287                 final T obj = mObjects.valueAt(index);
288                 if (obj != null) {
289                     result.add(obj);
290                     if (freqs != null) {
291                         freqs.add((float) mFrequencies.valueAt(index) / mPassSize);
292                     }
293                 }
294             }
295             return result;
296         }
297 
298         /**
299          * @param holder The list array into which the elements of the candidates are to be stored;
300          *               a new list would be created and returned if this parameter is null.
301          * @return The candidate heavy hitters so far, it could include false postives.
302          */
303         @Nullable
getCandidates(final @Nullable List<T> holder)304         public List<T> getCandidates(final @Nullable List<T> holder) {
305             if (!mConfigured) {
306                 throw new IllegalStateException();
307             }
308             if (mNumInputs < mPassSize) {
309                 // It hasn't done with the first pass yet, return nothing
310                 return null;
311             }
312 
313             List<T> result = holder != null ? holder : new ArrayList<T>();
314             for (int i = mObjects.size() - 1; i >= 0; i--) {
315                 final T obj = mObjects.valueAt(i);
316                 if (obj != null) {
317                     result.add(obj);
318                 }
319             }
320             return result;
321         }
322 
323         /**
324          * Reset this heavy hitter counter
325          */
326         @Override
reset()327         public void reset() {
328             mNumInputs = 0;
329             mObjects.clear();
330             mFrequencies.clear();
331         }
332 
333         /**
334          * @return The ratio of the input to be used as the validation data, Float.NaN means no
335          * validation is needed.
336          */
getRequiredValidationInputRatio()337         public float getRequiredValidationInputRatio() {
338             return 0.5f;
339         }
340     }
341 }
342