// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// This proto represents a machine learning model which is used to compute
// the probability that a particular page visited by Chrome is phishing.
//
// Note: since the machine learning model is trained on the server-side and
// then downloaded onto the client, it is important that this proto file
// stays in sync with the server-side copy.  Otherwise, the client may not be
// able to parse the server-generated model anymore.  If you want to change
// this protocol definition or have questions regarding its format, please
// contact chrome-anti-phishing@googlegroups.com.

syntax = "proto2";

option optimize_for = LITE_RUNTIME;

package safe_browsing;

// This protocol buffer represents a machine learning model that is used in
// client-side phishing detection (in Chrome).  The client extracts a set
// of features from every website the user visits.  Extracted features map
// feature names to floating point values (e.g., PageSecureLinksFreq -> 0.9).
//
// To compute the phishing score (i.e., the probability that the website is
// phishing), a scorer simply computes the sum of all rule scores for a given
// set of extracted features.  The score of a particular rule is the rule
// weight times the product of the values of all features that are part of
// the rule.  If a feature has no value (i.e., is not part of the extracted
// features), its value is set to zero.  The overall score, which is the sum
// of all rule scores, is a logodds and can be converted to a probability
// like this:
// p = exp(logodds) / (exp(logodds) + 1).
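//
// For example (with made-up numbers): a rule over two features with weight
// 0.5 and extracted feature values 0.9 and 1.0 contributes
// 0.5 * 0.9 * 1.0 = 0.45 to the logodds.  If the scores of all rules sum to
// 2.0, the phishing probability is exp(2.0) / (exp(2.0) + 1) ~= 0.88.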
//
// To make it harder for phishers to reverse engineer our machine learning
// model, all the features in the model are hashed with a sha256 hash
// function.  The feature extractors also hash the extracted features before
// scoring happens.
message ClientSideModel {
  // In order to save some space we store all the hashed strings in a
  // single repeated field; the rules and the page terms then refer to
  // indexes into that repeated field.  All hashes are sha256 hashes stored
  // in binary format.
  repeated bytes hashes = 1;

  message Rule {
    // List of indexes into hashes above; these are the hashed features that
    // make up the current rule.
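    // For example, a rule with feature = [0, 3] is made up of the hashed
    // features hashes[0] and hashes[3]; its score is the rule weight times
    // the extracted values of those two features.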
    repeated int32 feature = 1;

    // The weight for this particular rule.
    required float weight = 2;
  }

  // List of rules that make up the model.
  repeated Rule rule = 2;

  // List of indexes that point to the hashed page terms that appear in
  // the model.  The hashes are computed over page terms that are encoded
  // as lowercase UTF-8 strings.
  repeated int32 page_term = 3;

  // List of hashed page words.  The page words correspond to all words that
  // appear in page terms.  If the term "one two" is in the list of page
  // terms, then "one" and "two" will be in the list of page words.  For page
  // words we don't use SHA256 because it is too expensive.  We use
  // MurmurHash3 instead.  See: http://code.google.com/p/smhasher.
  repeated fixed32 page_word = 4;

  // Page terms in page_term contain at most this many page words.
  required int32 max_words_per_term = 5;

  // Model version number.  Every model that we train should have a different
  // version number, and it should always be larger than the previous model
  // version.
  optional int32 version = 6;

  // List of known bad IP subnets.
  message IPSubnet {
    // The subnet prefix is a valid 16-byte IPv6 address (in network order)
    // that is hashed using sha256.
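    // For example (illustrative only): the subnet 2001:db8::/32 would
    // presumably be stored with size = 32 and prefix set to the sha256
    // digest of the 16-byte address 2001:db8:: (i.e., the first 32 bits of
    // the subnet followed by zero bits).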
    required bytes prefix = 1;

    // Network prefix size in bits.  Default is an exact-host match.
    optional int32 size = 2 [default = 128];
  };
  repeated IPSubnet bad_subnet = 7;

  // Murmur hash seed that was used to hash the page words.
  optional fixed32 murmur_hash_seed = 8;

  // Maximum number of unique shingle hashes per page.
  optional int32 max_shingles_per_page = 9 [default = 200];

  // The number of words in a shingle.
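  // For example (illustrative only): with shingle_size = 4, the page text
  // "a b c d e" would yield the two shingles "a b c d" and "b c d e", each
  // of which is hashed, with at most max_shingles_per_page hashes kept.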
  optional int32 shingle_size = 10 [default = 4];
}
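
// For reference, a minimal sketch of how a client-side scorer might evaluate
// this model (illustrative only; this is not Chromium's actual scorer, and
// ComputeLogOdds and the features map are hypothetical names).  The features
// map maps sha256-hashed feature names to their extracted values:
//
//   double ComputeLogOdds(const ClientSideModel& model,
//                         const std::map<std::string, double>& features) {
//     double logodds = 0.0;
//     for (const ClientSideModel::Rule& rule : model.rule()) {
//       double rule_score = rule.weight();
//       for (int index : rule.feature()) {
//         // A feature that was not extracted counts as zero, which zeroes
//         // out the whole rule.
//         auto it = features.find(model.hashes(index));
//         rule_score *= (it == features.end()) ? 0.0 : it->second;
//       }
//       logodds += rule_score;
//     }
//     return logodds;
//   }
//
// The phishing probability is then exp(logodds) / (exp(logodds) + 1).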