• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2012 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.android.inputmethod.research;
18 
19 import android.util.Log;
20 
21 import com.android.inputmethod.annotations.UsedForTesting;
22 import com.android.inputmethod.latin.Dictionary;
23 import com.android.inputmethod.latin.Suggest;
24 import com.android.inputmethod.latin.define.ProductionFlag;
25 
26 import java.io.IOException;
27 import java.util.ArrayList;
28 import java.util.LinkedList;
29 
30 /**
31  * MainLogBuffer is a FixedLogBuffer that tracks the state of LogUnits to make privacy guarantees.
32  *
33  * There are three forms of privacy protection: 1) only words in the main dictionary are allowed to
34  * be logged in enough detail to determine their contents, 2) only a subset of words are logged
35  * in detail, such as 10%, and 3) no numbers are logged.
36  *
37  * This class maintains a list of LogUnits, each corresponding to a word.  As the user completes
38  * words, they are added here.  But if the user backs up over their current word to edit a word
39  * entered earlier, then it is pulled out of this LogBuffer, changes are then added to the end of
40  * the LogUnit, and it is pushed back in here when the user is done.  Because words may be pulled
41  * back out even after they are pushed in, we must not publish the contents of this LogBuffer too
42  * quickly.  However, we cannot let the contents pile up either, or it will limit the editing that
43  * a user can perform.
44  *
45  * To balance these requirements (keep history so user can edit, flush history so it does not pile
46  * up), the LogBuffer is considered "complete" when the user has entered enough words to form an
47  * n-gram, followed by enough additional non-detailed words (that are in the 90%, as per above).
48  * Once complete, the n-gram may be published to flash storage (via the ResearchLog class).
49  * However, the additional non-detailed words are retained, in case the user backspaces to edit
50  * them.  The MainLogBuffer then continues to add words, publishing individual non-detailed words
51  * as new words arrive.  After enough non-detailed words have been pushed out to account for the
52  * 90% between words, the words at the front of the LogBuffer can be published as an n-gram again.
53  *
54  * If the words that would form the valid n-gram are not in the dictionary, then words are pushed
55  * through the LogBuffer one at a time until an n-gram is found that is entirely composed of
56  * dictionary words.
57  *
58  * If the user closes a session, then the entire LogBuffer is flushed, publishing any embedded
59  * n-gram containing dictionary words.
60  */
61 public abstract class MainLogBuffer extends FixedLogBuffer {
62     private static final String TAG = MainLogBuffer.class.getSimpleName();
63     private static final boolean DEBUG = false
64             && ProductionFlag.USES_DEVELOPMENT_ONLY_DIAGNOSTICS_DEBUG;
65 
66     // Keep consistent with switch statement in Statistics.recordPublishabilityResultCode()
67     public static final int PUBLISHABILITY_PUBLISHABLE = 0;
68     public static final int PUBLISHABILITY_UNPUBLISHABLE_STOPPING = 1;
69     public static final int PUBLISHABILITY_UNPUBLISHABLE_INCORRECT_WORD_COUNT = 2;
70     public static final int PUBLISHABILITY_UNPUBLISHABLE_SAMPLED_TOO_RECENTLY = 3;
71     public static final int PUBLISHABILITY_UNPUBLISHABLE_DICTIONARY_UNAVAILABLE = 4;
72     public static final int PUBLISHABILITY_UNPUBLISHABLE_MAY_CONTAIN_DIGIT = 5;
73     public static final int PUBLISHABILITY_UNPUBLISHABLE_NOT_IN_DICTIONARY = 6;
74 
75     // The size of the n-grams logged.  E.g. N_GRAM_SIZE = 2 means to sample bigrams.
76     public static final int N_GRAM_SIZE = 2;
77 
78     // TODO: Remove dependence on Suggest, and pass in Dictionary as a parameter to an appropriate
79     // method.
80     private final Suggest mSuggest;
81     @UsedForTesting
82     private Dictionary mDictionaryForTesting;
83     private boolean mIsStopping = false;
84 
85     /* package for test */ int mNumWordsBetweenNGrams;
86 
87     // Counter for words left to suppress before an n-gram can be sampled.  Reset to mMinWordPeriod
88     // after a sample is taken.
89     /* package for test */ int mNumWordsUntilSafeToSample;
90 
MainLogBuffer(final int wordsBetweenSamples, final int numInitialWordsToIgnore, final Suggest suggest)91     public MainLogBuffer(final int wordsBetweenSamples, final int numInitialWordsToIgnore,
92             final Suggest suggest) {
93         super(N_GRAM_SIZE + wordsBetweenSamples);
94         mNumWordsBetweenNGrams = wordsBetweenSamples;
95         mNumWordsUntilSafeToSample = DEBUG ? 0 : numInitialWordsToIgnore;
96         mSuggest = suggest;
97     }
98 
99     @UsedForTesting
setDictionaryForTesting(final Dictionary dictionary)100     /* package for test */ void setDictionaryForTesting(final Dictionary dictionary) {
101         mDictionaryForTesting = dictionary;
102     }
103 
getDictionary()104     private Dictionary getDictionary() {
105         if (mDictionaryForTesting != null) {
106             return mDictionaryForTesting;
107         }
108         if (mSuggest == null || !mSuggest.hasMainDictionary()) return null;
109         return mSuggest.getMainDictionary();
110     }
111 
setIsStopping()112     public void setIsStopping() {
113         mIsStopping = true;
114     }
115 
116     /**
117      * Determines whether the string determined by a series of LogUnits will not violate user
118      * privacy if published.
119      *
120      * @param logUnits a LogUnit list to check for publishability
121      * @param nGramSize the smallest n-gram acceptable to be published.  if
122      * {@link ResearchLogger#IS_LOGGING_EVERYTHING} is true, then publish if there are more than
123      * {@code minNGramSize} words in the logUnits, otherwise wait.  if {@link
124      * ResearchLogger#IS_LOGGING_EVERYTHING} is false, then ensure that there are exactly nGramSize
125      * words in the LogUnits.
126      *
127      * @return one of the {@code PUBLISHABILITY_*} result codes defined in this class.
128      */
getPublishabilityResultCode(final ArrayList<LogUnit> logUnits, final int nGramSize)129     private int getPublishabilityResultCode(final ArrayList<LogUnit> logUnits,
130             final int nGramSize) {
131         // Bypass privacy checks when debugging.
132         if (ResearchLogger.IS_LOGGING_EVERYTHING) {
133             if (mIsStopping) {
134                 return PUBLISHABILITY_UNPUBLISHABLE_STOPPING;
135             }
136             // Only check that it is the right length.  If not, wait for later words to make
137             // complete n-grams.
138             int numWordsInLogUnitList = 0;
139             final int length = logUnits.size();
140             for (int i = 0; i < length; i++) {
141                 final LogUnit logUnit = logUnits.get(i);
142                 numWordsInLogUnitList += logUnit.getNumWords();
143             }
144             if (numWordsInLogUnitList >= nGramSize) {
145                 return PUBLISHABILITY_PUBLISHABLE;
146             } else {
147                 return PUBLISHABILITY_UNPUBLISHABLE_INCORRECT_WORD_COUNT;
148             }
149         }
150 
151         // Check that we are not sampling too frequently.  Having sampled recently might disclose
152         // too much of the user's intended meaning.
153         if (mNumWordsUntilSafeToSample > 0) {
154             return PUBLISHABILITY_UNPUBLISHABLE_SAMPLED_TOO_RECENTLY;
155         }
156         // Reload the dictionary in case it has changed (e.g., because the user has changed
157         // languages).
158         final Dictionary dictionary = getDictionary();
159         if (dictionary == null) {
160             // Main dictionary is unavailable.  Since we cannot check it, we cannot tell if a
161             // word is out-of-vocabulary or not.  Therefore, we must judge the entire buffer
162             // contents to potentially pose a privacy risk.
163             return PUBLISHABILITY_UNPUBLISHABLE_DICTIONARY_UNAVAILABLE;
164         }
165 
166         // Check each word in the buffer.  If any word poses a privacy threat, we cannot upload
167         // the complete buffer contents in detail.
168         int numWordsInLogUnitList = 0;
169         final int length = logUnits.size();
170         for (final LogUnit logUnit : logUnits) {
171             if (!logUnit.hasOneOrMoreWords()) {
172                 // Digits outside words are a privacy threat.
173                 if (logUnit.mayContainDigit()) {
174                     return PUBLISHABILITY_UNPUBLISHABLE_MAY_CONTAIN_DIGIT;
175                 }
176             } else {
177                 numWordsInLogUnitList += logUnit.getNumWords();
178                 final String[] words = logUnit.getWordsAsStringArray();
179                 for (final String word : words) {
180                     // Words not in the dictionary are a privacy threat.
181                     if (ResearchLogger.hasLetters(word) && !(dictionary.isValidWord(word))) {
182                         if (DEBUG) {
183                             Log.d(TAG, "\"" + word + "\" NOT SAFE!: hasLetters: "
184                                     + ResearchLogger.hasLetters(word)
185                                     + ", isValid: " + (dictionary.isValidWord(word)));
186                         }
187                         return PUBLISHABILITY_UNPUBLISHABLE_NOT_IN_DICTIONARY;
188                     }
189                 }
190             }
191         }
192 
193         // Finally, only return true if the ngram is the right size.
194         if (numWordsInLogUnitList == nGramSize) {
195             return PUBLISHABILITY_PUBLISHABLE;
196         } else {
197             return PUBLISHABILITY_UNPUBLISHABLE_INCORRECT_WORD_COUNT;
198         }
199     }
200 
shiftAndPublishAll()201     public void shiftAndPublishAll() throws IOException {
202         final LinkedList<LogUnit> logUnits = getLogUnits();
203         while (!logUnits.isEmpty()) {
204             publishLogUnitsAtFrontOfBuffer();
205         }
206     }
207 
208     @Override
onBufferFull()209     protected final void onBufferFull() {
210         try {
211             publishLogUnitsAtFrontOfBuffer();
212         } catch (final IOException e) {
213             if (DEBUG) {
214                 Log.w(TAG, "IOException when publishing front of LogBuffer", e);
215             }
216         }
217     }
218 
219     /**
220      * If there is a safe n-gram at the front of this log buffer, publish it with all details, and
221      * remove the LogUnits that constitute it.
222      *
223      * An n-gram might not be "safe" if it violates privacy controls.  E.g., it might contain
224      * numbers, an out-of-vocabulary word, or another n-gram may have been published recently.  If
225      * there is no safe n-gram, then the LogUnits up through the first word-containing LogUnit are
226      * published, but without disclosing any privacy-related details, such as the word the LogUnit
227      * generated, motion data, etc.
228      *
229      * Note that a LogUnit can hold more than one word if the user types without explicit spaces.
230      * In this case, the words may be grouped together in such a way that pulling an n-gram off the
231      * front would require splitting a LogUnit.  Splitting a LogUnit is not possible, so this case
232      * is treated just as the unsafe n-gram case.  This may cause n-grams to be sampled at slightly
233      * less than the target frequency.
234      */
publishLogUnitsAtFrontOfBuffer()235     protected final void publishLogUnitsAtFrontOfBuffer() throws IOException {
236         // TODO: Refactor this method to require fewer passes through the LogUnits.  Should really
237         // require only one pass.
238         ArrayList<LogUnit> logUnits = peekAtFirstNWords(N_GRAM_SIZE);
239         final int publishabilityResultCode = getPublishabilityResultCode(logUnits, N_GRAM_SIZE);
240         ResearchLogger.recordPublishabilityResultCode(publishabilityResultCode);
241         if (publishabilityResultCode == MainLogBuffer.PUBLISHABILITY_PUBLISHABLE) {
242             // Good n-gram at the front of the buffer.  Publish it, disclosing details.
243             publish(logUnits, true /* canIncludePrivateData */);
244             shiftOutWords(N_GRAM_SIZE);
245             mNumWordsUntilSafeToSample = mNumWordsBetweenNGrams;
246             return;
247         }
248         // No good n-gram at front, and buffer is full.  Shift out up through the first logUnit
249         // with associated words (or if there is none, all the existing logUnits).
250         logUnits.clear();
251         LogUnit logUnit = shiftOut();
252         while (logUnit != null) {
253             logUnits.add(logUnit);
254             final int numWords = logUnit.getNumWords();
255             if (numWords > 0) {
256                 mNumWordsUntilSafeToSample = Math.max(0, mNumWordsUntilSafeToSample - numWords);
257                 break;
258             }
259             logUnit = shiftOut();
260         }
261         publish(logUnits, false /* canIncludePrivateData */);
262     }
263 
264     /**
265      * Called when a list of logUnits should be published.
266      *
267      * It is the subclass's responsibility to implement the publication.
268      *
269      * @param logUnits The list of logUnits to be published.
270      * @param canIncludePrivateData Whether the private data in the logUnits can be included in
271      * publication.
272      *
273      * @throws IOException if publication to the log file is not possible
274      */
publish(final ArrayList<LogUnit> logUnits, final boolean canIncludePrivateData)275     protected abstract void publish(final ArrayList<LogUnit> logUnits,
276             final boolean canIncludePrivateData) throws IOException;
277 
278     @Override
shiftOutWords(final int numWords)279     protected int shiftOutWords(final int numWords) {
280         final int numWordsShiftedOut = super.shiftOutWords(numWords);
281         mNumWordsUntilSafeToSample = Math.max(0, mNumWordsUntilSafeToSample - numWordsShiftedOut);
282         if (DEBUG) {
283             Log.d(TAG, "wordsUntilSafeToSample now at " + mNumWordsUntilSafeToSample);
284         }
285         return numWordsShiftedOut;
286     }
287 }
288