• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef COMPONENTS_DOM_DISTILLER_CORE_DISTILLER_H_
6 #define COMPONENTS_DOM_DISTILLER_CORE_DISTILLER_H_
7 
8 #include <map>
9 #include <string>
10 
11 #include "base/callback.h"
12 #include "base/containers/hash_tables.h"
13 #include "base/memory/ref_counted.h"
14 #include "base/memory/scoped_ptr.h"
15 #include "base/memory/scoped_vector.h"
16 #include "base/memory/weak_ptr.h"
17 #include "components/dom_distiller/core/article_distillation_update.h"
18 #include "components/dom_distiller/core/distiller_page.h"
19 #include "components/dom_distiller/core/distiller_url_fetcher.h"
20 #include "components/dom_distiller/core/proto/distilled_article.pb.h"
21 #include "net/url_request/url_request_context_getter.h"
22 #include "url/gurl.h"
23 
24 namespace dom_distiller {
25 
26 class DistillerImpl;
27 
28 class Distiller {
29  public:
30   typedef base::Callback<void(scoped_ptr<DistilledArticleProto>)>
31       DistillationFinishedCallback;
32   typedef base::Callback<void(const ArticleDistillationUpdate&)>
33       DistillationUpdateCallback;
34 
~Distiller()35   virtual ~Distiller() {}
36 
37   // Distills a page, and asynchronously returns the article HTML to the
38   // supplied |finished_cb| callback. |update_cb| is invoked whenever article
39   // under distillation is updated with more data.
40   // E.g. when distilling a 2 page article, |update_cb| may be invoked each time
41   // a distilled page is added and |finished_cb| will be invoked once
42   // distillation is completed.
43   virtual void DistillPage(const GURL& url,
44                            scoped_ptr<DistillerPage> distiller_page,
45                            const DistillationFinishedCallback& finished_cb,
46                            const DistillationUpdateCallback& update_cb) = 0;
47 };
48 
49 class DistillerFactory {
50  public:
51   virtual scoped_ptr<Distiller> CreateDistiller() = 0;
~DistillerFactory()52   virtual ~DistillerFactory() {}
53 };
54 
55 // Factory for creating a Distiller.
56 class DistillerFactoryImpl : public DistillerFactory {
57  public:
58   DistillerFactoryImpl(
59       scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory,
60       const dom_distiller::proto::DomDistillerOptions& dom_distiller_options);
61   virtual ~DistillerFactoryImpl();
62   virtual scoped_ptr<Distiller> CreateDistiller() OVERRIDE;
63 
64  private:
65   scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory_;
66   dom_distiller::proto::DomDistillerOptions dom_distiller_options_;
67 };
68 
69 // Distills a article from a page and associated pages.
70 class DistillerImpl : public Distiller {
71  public:
72   DistillerImpl(
73       const DistillerURLFetcherFactory& distiller_url_fetcher_factory,
74       const dom_distiller::proto::DomDistillerOptions& dom_distiller_options);
75   virtual ~DistillerImpl();
76 
77   virtual void DistillPage(
78       const GURL& url,
79       scoped_ptr<DistillerPage> distiller_page,
80       const DistillationFinishedCallback& finished_cb,
81       const DistillationUpdateCallback& update_cb) OVERRIDE;
82 
83   void SetMaxNumPagesInArticle(size_t max_num_pages);
84 
85  private:
86   // In case of multiple pages, the Distiller maintains state of multiple pages
87   // as page numbers relative to the page number where distillation started.
88   // E.g. if distillation starts at page 2 for a 3 page article. The relative
89   // page numbers assigned to pages will be [-1,0,1].
90 
91   // Class representing the state of a page under distillation.
92   struct DistilledPageData {
93     DistilledPageData();
94     virtual ~DistilledPageData();
95     // Relative page number of the page.
96     int page_num;
97     ScopedVector<DistillerURLFetcher> image_fetchers_;
98     scoped_refptr<base::RefCountedData<DistilledPageProto> >
99         distilled_page_proto;
100 
101    private:
102     DISALLOW_COPY_AND_ASSIGN(DistilledPageData);
103   };
104 
105   void OnFetchImageDone(int page_num,
106                         DistillerURLFetcher* url_fetcher,
107                         const std::string& id,
108                         const std::string& response);
109 
110   void OnPageDistillationFinished(int page_num,
111                                   const GURL& page_url,
112                                   scoped_ptr<DistilledPageInfo> distilled_page,
113                                   bool distillation_successful);
114 
115   virtual void FetchImage(int page_num,
116                           const std::string& image_id,
117                           const std::string& item);
118 
119   // Distills the next page.
120   void DistillNextPage();
121 
122   // Adds the |url| to |pages_to_be_distilled| if |page_num| is a valid relative
123   // page number and |url| is valid. Ignores duplicate pages and urls.
124   void AddToDistillationQueue(int page_num, const GURL& url);
125 
126   // Check if |page_num| is a valid relative page number, i.e. page with
127   // |page_num| is either under distillation or has already completed
128   // distillation.
129   bool IsPageNumberInUse(int page_num) const;
130 
131   bool AreAllPagesFinished() const;
132 
133   // Total number of pages in the article that the distiller knows of, this
134   // includes pages that are pending distillation.
135   size_t TotalPageCount() const;
136 
137   // Runs |finished_cb_| if all distillation callbacks and image fetches are
138   // complete.
139   void RunDistillerCallbackIfDone();
140 
141   // Checks if page |distilled_page_data| has finished distillation, including
142   // all image fetches.
143   void AddPageIfDone(int page_num);
144 
145   DistilledPageData* GetPageAtIndex(size_t index) const;
146 
147   // Create an ArticleDistillationUpdate for the current distillation
148   // state.
149   const ArticleDistillationUpdate CreateDistillationUpdate() const;
150 
151   const DistillerURLFetcherFactory& distiller_url_fetcher_factory_;
152   scoped_ptr<DistillerPage> distiller_page_;
153 
154   dom_distiller::proto::DomDistillerOptions dom_distiller_options_;
155   DistillationFinishedCallback finished_cb_;
156   DistillationUpdateCallback update_cb_;
157 
158   // Set of pages that are under distillation or have finished distillation.
159   // |started_pages_index_| and |finished_pages_index_| maintains the mapping
160   // from page number to the indices in |pages_|.
161   ScopedVector<DistilledPageData> pages_;
162 
163   // Maps page numbers of finished pages to the indices in |pages_|.
164   std::map<int, size_t> finished_pages_index_;
165 
166   // Maps page numbers of pages under distillation to the indices in |pages_|.
167   // If a page is |started_pages_| that means it is still waiting for an action
168   // (distillation or image fetch) to finish.
169   base::hash_map<int, size_t> started_pages_index_;
170 
171   // The list of pages that are still waiting for distillation to start.
172   // This is a map, to make distiller prefer distilling lower page numbers
173   // first.
174   std::map<int, GURL> waiting_pages_;
175 
176   // Set to keep track of which urls are already seen by the distiller. Used to
177   // prevent distiller from distilling the same url twice.
178   base::hash_set<std::string> seen_urls_;
179 
180   size_t max_pages_in_article_;
181 
182   bool destruction_allowed_;
183 
184   base::WeakPtrFactory<DistillerImpl> weak_factory_;
185 
186   DISALLOW_COPY_AND_ASSIGN(DistillerImpl);
187 };
188 
189 }  // namespace dom_distiller
190 
191 #endif  // COMPONENTS_DOM_DISTILLER_CORE_DISTILLER_H_
192