1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include <sstream>
6
7 #include "base/command_line.h"
8 #include "base/files/scoped_temp_dir.h"
9 #include "base/message_loop/message_loop.h"
10 #include "base/path_service.h"
11 #include "base/run_loop.h"
12 #include "base/strings/string_number_conversions.h"
13 #include "base/strings/string_split.h"
14 #include "components/dom_distiller/content/distiller_page_web_contents.h"
15 #include "components/dom_distiller/core/article_entry.h"
16 #include "components/dom_distiller/core/distilled_page_prefs.h"
17 #include "components/dom_distiller/core/distiller.h"
18 #include "components/dom_distiller/core/dom_distiller_service.h"
19 #include "components/dom_distiller/core/dom_distiller_store.h"
20 #include "components/dom_distiller/core/proto/distilled_article.pb.h"
21 #include "components/dom_distiller/core/proto/distilled_page.pb.h"
22 #include "components/dom_distiller/core/task_tracker.h"
23 #include "components/leveldb_proto/proto_database.h"
24 #include "components/leveldb_proto/proto_database_impl.h"
25 #include "components/pref_registry/testing_pref_service_syncable.h"
26 #include "content/public/browser/browser_context.h"
27 #include "content/public/browser/browser_thread.h"
28 #include "content/public/test/content_browser_test.h"
29 #include "content/shell/browser/shell.h"
30 #include "google/protobuf/io/coded_stream.h"
31 #include "google/protobuf/io/zero_copy_stream_impl_lite.h"
32 #include "net/dns/mock_host_resolver.h"
33 #include "third_party/dom_distiller_js/dom_distiller.pb.h"
34 #include "ui/base/resource/resource_bundle.h"
35
36 using content::ContentBrowserTest;
37
38 namespace dom_distiller {
39
40 namespace {
41
// Command-line switches understood by this test. They are declared as
// constant arrays rather than as non-const pointers so that the symbols
// cannot be re-pointed at runtime.

// The url to distill.
const char kUrlSwitch[] = "url";

// A space-separated list of urls to distill. Only consulted when the single
// "url" switch is absent.
const char kUrlsSwitch[] = "urls";

// Indicates that DNS resolution should be disabled for this test.
const char kDisableDnsSwitch[] = "disable-dns";

// Will write the distilled output to the given file instead of logging it.
const char kOutputFile[] = "output-file";

// Indicates to output a serialized protocol buffer instead of human-readable
// output.
const char kShouldOutputBinary[] = "output-binary";

// Indicates to output only the text of the article and not the enclosing html.
const char kExtractTextOnly[] = "extract-text-only";

// Integer debug level that is forwarded to the distiller options. Values that
// do not parse as an integer are ignored.
const char kDebugLevel[] = "debug-level";

// Maximum number of concurrent started extractor requests.
const int kMaxExtractorTasks = 8;
66
CreateDomDistillerService(content::BrowserContext * context,const base::FilePath & db_path)67 scoped_ptr<DomDistillerService> CreateDomDistillerService(
68 content::BrowserContext* context,
69 const base::FilePath& db_path) {
70 scoped_refptr<base::SequencedTaskRunner> background_task_runner =
71 content::BrowserThread::GetBlockingPool()->GetSequencedTaskRunner(
72 content::BrowserThread::GetBlockingPool()->GetSequenceToken());
73
74 // TODO(cjhopman): use an in-memory database instead of an on-disk one with
75 // temporary directory.
76 scoped_ptr<leveldb_proto::ProtoDatabaseImpl<ArticleEntry> > db(
77 new leveldb_proto::ProtoDatabaseImpl<ArticleEntry>(
78 background_task_runner));
79 scoped_ptr<DomDistillerStore> dom_distiller_store(new DomDistillerStore(
80 db.PassAs<leveldb_proto::ProtoDatabase<ArticleEntry> >(), db_path));
81
82 scoped_ptr<DistillerPageFactory> distiller_page_factory(
83 new DistillerPageWebContentsFactory(context));
84 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory(
85 new DistillerURLFetcherFactory(context->GetRequestContext()));
86
87 dom_distiller::proto::DomDistillerOptions options;
88 if (base::CommandLine::ForCurrentProcess()->HasSwitch(kExtractTextOnly)) {
89 options.set_extract_text_only(true);
90 }
91 int debug_level = 0;
92 if (base::CommandLine::ForCurrentProcess()->HasSwitch(kDebugLevel) &&
93 base::StringToInt(
94 base::CommandLine::ForCurrentProcess()->GetSwitchValueASCII(
95 kDebugLevel),
96 &debug_level)) {
97 options.set_debug_level(debug_level);
98 }
99 scoped_ptr<DistillerFactory> distiller_factory(
100 new DistillerFactoryImpl(distiller_url_fetcher_factory.Pass(), options));
101
102 // Setting up PrefService for DistilledPagePrefs.
103 user_prefs::TestingPrefServiceSyncable* pref_service =
104 new user_prefs::TestingPrefServiceSyncable();
105 DistilledPagePrefs::RegisterProfilePrefs(pref_service->registry());
106
107 return scoped_ptr<DomDistillerService>(new DomDistillerService(
108 dom_distiller_store.PassAs<DomDistillerStoreInterface>(),
109 distiller_factory.Pass(),
110 distiller_page_factory.Pass(),
111 scoped_ptr<DistilledPagePrefs>(
112 new DistilledPagePrefs(pref_service))));
113 }
114
AddComponentsResources()115 void AddComponentsResources() {
116 base::FilePath pak_file;
117 base::FilePath pak_dir;
118 PathService::Get(base::DIR_MODULE, &pak_dir);
119 pak_file = pak_dir.Append(FILE_PATH_LITERAL("components_resources.pak"));
120 ui::ResourceBundle::GetSharedInstance().AddDataPackFromPath(
121 pak_file, ui::SCALE_FACTOR_NONE);
122 }
123
WriteProtobufWithSize(const google::protobuf::MessageLite & message,google::protobuf::io::ZeroCopyOutputStream * output_stream)124 bool WriteProtobufWithSize(
125 const google::protobuf::MessageLite& message,
126 google::protobuf::io::ZeroCopyOutputStream* output_stream) {
127 google::protobuf::io::CodedOutputStream coded_output(output_stream);
128
129 // Write the size.
130 const int size = message.ByteSize();
131 coded_output.WriteLittleEndian32(size);
132 message.SerializeWithCachedSizes(&coded_output);
133 return !coded_output.HadError();
134 }
135
GetReadableArticleString(const DistilledArticleProto & article_proto)136 std::string GetReadableArticleString(
137 const DistilledArticleProto& article_proto) {
138 std::stringstream output;
139 output << "Article Title: " << article_proto.title() << std::endl;
140 output << "# of pages: " << article_proto.pages_size() << std::endl;
141 for (int i = 0; i < article_proto.pages_size(); ++i) {
142 const DistilledPageProto& page = article_proto.pages(i);
143 output << "Page " << i << std::endl;
144 output << "URL: " << page.url() << std::endl;
145 output << "Content: " << page.html() << std::endl;
146 if (page.has_debug_info() && page.debug_info().has_log())
147 output << "Log: " << page.debug_info().log() << std::endl;
148 }
149 return output.str();
150 }
151
152 } // namespace
153
154 class ContentExtractionRequest : public ViewRequestDelegate {
155 public:
Start(DomDistillerService * service,const gfx::Size & render_view_size,base::Closure finished_callback)156 void Start(DomDistillerService* service, const gfx::Size& render_view_size,
157 base::Closure finished_callback) {
158 finished_callback_ = finished_callback;
159 viewer_handle_ =
160 service->ViewUrl(this,
161 service->CreateDefaultDistillerPage(render_view_size),
162 url_);
163 }
164
GetArticleCopy()165 DistilledArticleProto GetArticleCopy() {
166 return *article_proto_;
167 }
168
CreateForCommandLine(const CommandLine & command_line)169 static ScopedVector<ContentExtractionRequest> CreateForCommandLine(
170 const CommandLine& command_line) {
171 ScopedVector<ContentExtractionRequest> requests;
172 if (command_line.HasSwitch(kUrlSwitch)) {
173 GURL url;
174 std::string url_string = command_line.GetSwitchValueASCII(kUrlSwitch);
175 url = GURL(url_string);
176 if (url.is_valid()) {
177 requests.push_back(new ContentExtractionRequest(url));
178 }
179 } else if (command_line.HasSwitch(kUrlsSwitch)) {
180 std::string urls_string = command_line.GetSwitchValueASCII(kUrlsSwitch);
181 std::vector<std::string> urls;
182 base::SplitString(urls_string, ' ', &urls);
183 for (size_t i = 0; i < urls.size(); ++i) {
184 GURL url(urls[i]);
185 if (url.is_valid()) {
186 requests.push_back(new ContentExtractionRequest(url));
187 } else {
188 ADD_FAILURE() << "Bad url";
189 }
190 }
191 }
192 if (requests.empty()) {
193 ADD_FAILURE() << "No valid url provided";
194 }
195
196 return requests.Pass();
197 }
198
199 private:
ContentExtractionRequest(const GURL & url)200 ContentExtractionRequest(const GURL& url) : url_(url) {}
201
OnArticleUpdated(ArticleDistillationUpdate article_update)202 virtual void OnArticleUpdated(ArticleDistillationUpdate article_update)
203 OVERRIDE {}
204
OnArticleReady(const DistilledArticleProto * article_proto)205 virtual void OnArticleReady(const DistilledArticleProto* article_proto)
206 OVERRIDE {
207 article_proto_ = article_proto;
208 CHECK(article_proto->pages_size()) << "Failed extracting " << url_;
209 base::MessageLoop::current()->PostTask(
210 FROM_HERE,
211 finished_callback_);
212 }
213
214 const DistilledArticleProto* article_proto_;
215 scoped_ptr<ViewerHandle> viewer_handle_;
216 GURL url_;
217 base::Closure finished_callback_;
218 };
219
220 class ContentExtractor : public ContentBrowserTest {
221 public:
ContentExtractor()222 ContentExtractor()
223 : pending_tasks_(0),
224 max_tasks_(kMaxExtractorTasks),
225 next_request_(0),
226 output_data_(),
227 protobuf_output_stream_(
228 new google::protobuf::io::StringOutputStream(&output_data_)) {}
229
230 // Change behavior of the default host resolver to avoid DNS lookup errors, so
231 // we can make network calls.
SetUpOnMainThread()232 virtual void SetUpOnMainThread() OVERRIDE {
233 if (!CommandLine::ForCurrentProcess()->HasSwitch(kDisableDnsSwitch)) {
234 EnableDNSLookupForThisTest();
235 }
236 CHECK(db_dir_.CreateUniqueTempDir());
237 AddComponentsResources();
238 }
239
TearDownOnMainThread()240 virtual void TearDownOnMainThread() OVERRIDE {
241 DisableDNSLookupForThisTest();
242 }
243
244 protected:
245 // Creates the DomDistillerService and creates and starts the extraction
246 // request.
Start()247 void Start() {
248 content::BrowserContext* context =
249 shell()->web_contents()->GetBrowserContext();
250 service_ = CreateDomDistillerService(context,
251 db_dir_.path());
252 const CommandLine& command_line = *CommandLine::ForCurrentProcess();
253 requests_ = ContentExtractionRequest::CreateForCommandLine(command_line);
254 PumpQueue();
255 }
256
PumpQueue()257 void PumpQueue() {
258 while (pending_tasks_ < max_tasks_ && next_request_ < requests_.size()) {
259 requests_[next_request_]->Start(
260 service_.get(),
261 shell()->web_contents()->GetContainerBounds().size(),
262 base::Bind(&ContentExtractor::FinishRequest, base::Unretained(this)));
263 ++next_request_;
264 ++pending_tasks_;
265 }
266 }
267
268 private:
269 // Change behavior of the default host resolver to allow DNS lookup
270 // to proceed instead of being blocked by the test infrastructure.
EnableDNSLookupForThisTest()271 void EnableDNSLookupForThisTest() {
272 // mock_host_resolver_override_ takes ownership of the resolver.
273 scoped_refptr<net::RuleBasedHostResolverProc> resolver =
274 new net::RuleBasedHostResolverProc(host_resolver());
275 resolver->AllowDirectLookup("*");
276 mock_host_resolver_override_.reset(
277 new net::ScopedDefaultHostResolverProc(resolver.get()));
278 }
279
280 // We need to reset the DNS lookup when we finish, or the test will fail.
DisableDNSLookupForThisTest()281 void DisableDNSLookupForThisTest() {
282 mock_host_resolver_override_.reset();
283 }
284
FinishRequest()285 void FinishRequest() {
286 --pending_tasks_;
287 if (next_request_ == requests_.size() && pending_tasks_ == 0) {
288 Finish();
289 } else {
290 PumpQueue();
291 }
292 }
293
DoArticleOutput()294 void DoArticleOutput() {
295 for (size_t i = 0; i < requests_.size(); ++i) {
296 const DistilledArticleProto& article = requests_[i]->GetArticleCopy();
297 if (CommandLine::ForCurrentProcess()->HasSwitch(kShouldOutputBinary)) {
298 WriteProtobufWithSize(article, protobuf_output_stream_.get());
299 } else {
300 output_data_ += GetReadableArticleString(article) + "\n";
301 }
302 }
303
304 if (CommandLine::ForCurrentProcess()->HasSwitch(kOutputFile)) {
305 base::FilePath filename =
306 CommandLine::ForCurrentProcess()->GetSwitchValuePath(kOutputFile);
307 ASSERT_EQ(
308 (int)output_data_.size(),
309 base::WriteFile(filename, output_data_.c_str(), output_data_.size()));
310 } else {
311 VLOG(0) << output_data_;
312 }
313 }
314
Finish()315 void Finish() {
316 DoArticleOutput();
317 requests_.clear();
318 service_.reset();
319 base::MessageLoop::current()->PostTask(
320 FROM_HERE, base::MessageLoop::QuitWhenIdleClosure());
321 }
322
323 size_t pending_tasks_;
324 size_t max_tasks_;
325 size_t next_request_;
326
327 base::ScopedTempDir db_dir_;
328 scoped_ptr<net::ScopedDefaultHostResolverProc> mock_host_resolver_override_;
329 scoped_ptr<DomDistillerService> service_;
330 ScopedVector<ContentExtractionRequest> requests_;
331
332 std::string output_data_;
333 scoped_ptr<google::protobuf::io::StringOutputStream> protobuf_output_stream_;
334 };
335
// Manual test: kicks off extraction for the url(s) given on the command line
// and then spins a run loop until the fixture posts a quit task.
IN_PROC_BROWSER_TEST_F(ContentExtractor, MANUAL_ExtractUrl) {
  Start();
  base::RunLoop run_loop;
  run_loop.Run();
}
340
341 } // namespace dom_distiller
342