• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 //
5 // PhishingUrlFeatureExtractor handles computing URL-based features for
6 // the client-side phishing detection model.  These include tokens in the
7 // host and path, features pertaining to host length, and IP addresses.
8 
9 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_URL_FEATURE_EXTRACTOR_H_
10 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_URL_FEATURE_EXTRACTOR_H_
11 
12 #include <string>
13 #include <vector>
14 
15 #include "base/basictypes.h"
16 
17 class GURL;
18 
19 namespace safe_browsing {
20 class FeatureMap;
21 
22 class PhishingUrlFeatureExtractor {  // Computes URL-based features for the client-side phishing detection model.
23  public:
24   PhishingUrlFeatureExtractor();
25   ~PhishingUrlFeatureExtractor();
26 
27   // Extracts features for |url| and stores them in the map pointed to by
28   // |features|.  Returns true on success.
29   bool ExtractFeatures(const GURL& url, FeatureMap* features);
30 
31  private:
32   friend class PhishingUrlFeatureExtractorTest;  // Lets this class reach the private members below.
33 
34   static const size_t kMinPathComponentLength = 3;  // Shortest token kept by SplitStringIntoLongAlphanumTokens().
35 
36   // Given the string |full|, finds all substrings of consecutive alphanumeric
37   // characters of length >= kMinPathComponentLength and inserts them into
38   // |tokens|.
39   static void SplitStringIntoLongAlphanumTokens(
40       const std::string& full,
41       std::vector<std::string>* tokens);
42 
43   DISALLOW_COPY_AND_ASSIGN(PhishingUrlFeatureExtractor);  // NOTE(review): macro defined outside this file; presumably forbids copy/assign -- confirm.
44 };
45 
46 }  // namespace safe_browsing
47 
48 #endif  // CHROME_RENDERER_SAFE_BROWSING_PHISHING_URL_FEATURE_EXTRACTOR_H_
49