1/*
2 * Copyright 2019 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17import { Request, Response } from 'express';
18import puppeteer = require('puppeteer');
19import { log } from './logger';
20import { ContentNode } from './types';
21import { PlainTextFormatter } from './plain_text_formatter';
22import { transformUrl } from './url-transforms';
23
/** Command-line arguments supplied as `args` when Puppeteer launches the browser. */
const CHROME_LAUNCH_ARGS: string[] = ['--enable-dom-distiller'];
25
// Shape of a lookup table keyed by node name. A name mapped to `true` marks a
// DOM node type that is usually not useful when fetching text content from
// the page.
type BannedNames = Record<string, true>;
31
/**
 * Express handler for a license request.
 *
 * Reads `url` from the request body, validates it, and responds with the
 * plain-text rendering of the content nodes extracted from that page.
 *
 * Responses:
 *  - 200 with the plain-text content on success.
 *  - 400 `URL required` when the body is missing or `url` is not a
 *    non-empty string.
 *  - 400 `Invalid request.` when the protocol check rejects the url.
 *  - 400 `Something bad happened. Check the logs` when anything throws while
 *    validating or fetching.
 *
 * @param request The incoming request; `request.body.url` names the page.
 * @param response The response used to deliver the result or the error.
 */
export async function handleRequest(request: Request, response: Response) {
  // `request.body` is undefined when no body-parsing middleware has run, so
  // it must be guarded before `url` is read. Doing this outside of a guard
  // would reject the returned promise without ever answering the client.
  const body = request.body;
  const url: unknown = body ? body.url : undefined;

  // The url comes from the client: only a non-empty string is usable.
  if (typeof url !== 'string' || url.length === 0) {
    response.status(400).send('URL required');
    return;
  }

  try {
    log(`Handling license request for ${url}`);
    if (!isValidProtocol(url)) {
      response.status(400).send('Invalid request.');
      return;
    }

    const nodes = await handleLicenseRequest(url);
    const content = PlainTextFormatter.plainTextFor(nodes);
    response.status(200).send(content);
  } catch (error) {
    log('Error handling license request ', error);
    response.status(400).send('Something bad happened. Check the logs');
  }
}
56
/**
 * Validates the protocol. Only allows `https?` requests.
 *
 * A string that cannot be parsed as a URL is treated as invalid rather than
 * raising, so callers can rely on the boolean result alone.
 *
 * @param requestUrl The request url
 * @return `true` if the url parses and its protocol is `http:` or `https:`.
 */
function isValidProtocol(requestUrl: string): boolean {
  let protocol: string;
  try {
    protocol = new URL(requestUrl).protocol;
  } catch (error) {
    // The URL constructor throws a TypeError for input it cannot parse.
    log(`Invalid url ${requestUrl}`);
    return false;
  }

  if (protocol === 'https:' || protocol === 'http:') {
    // Allow https and http requests
    return true;
  }

  log(`Invalid protocol ${protocol}`);
  return false;
}
75
/**
 * Loads a page in a headless browser and extracts the nodes that carry text.
 *
 * The url is first passed through `transformUrl`; the resulting url is opened
 * with Puppeteer. A script evaluated inside the page then walks the child
 * nodes of `<body>`, drops banned node names and anchors without an href, and
 * flattens every remaining node that has non-empty text content into a list.
 *
 * The browser is always closed, including when navigation or evaluation
 * throws.
 *
 * @param url The page to load.
 * @param enableLocalDebugging Passed to Puppeteer as the `devtools` launch
 *     option; when `true` the page's console messages are also forwarded to
 *     `log`.
 * @return The flattened nodes (`localName` and `textContent` only), in the
 *     order they were visited (parent before children).
 */
async function handleLicenseRequest(url: string, enableLocalDebugging: boolean = false): Promise<ContentNode[]> {
  const transformed = transformUrl(url);
  if (url !== transformed) {
    log(`Transformed request url to ${transformed}`);
  }
  const browser = await puppeteer.launch({
    args: CHROME_LAUNCH_ARGS,
    devtools: enableLocalDebugging,
    // https://developer.chrome.com/articles/new-headless/
    headless: true
  });
  try {
    const page = await browser.newPage();
    if (enableLocalDebugging) {
      page.on('console', (message) => {
        log(`Puppeteer: ${message.text()}`);
      });
    }
    await page.goto(transformed, { waitUntil: 'domcontentloaded' });
    // Everything inside this callback runs in the page, not in Node, so it
    // must be self-contained and cannot reference module-level values.
    const content = await page.evaluate(() => {
      // A map of banned nodes, keyed by lower-cased node name.
      const BANNED_LOCAL_NAMES: BannedNames = {
        'button': true,
        'canvas': true,
        'footer': true,
        'header': true,
        'code': true,
        'img': true,
        'nav': true,
        'script': true,
        'style': true,
        'svg': true,
      };

      // Converts every node of a list, leaving out the ones that were rejected.
      function contentForNodeList(list: NodeList | null | undefined): ContentNode[] {
        const contentNodes: ContentNode[] = [];
        if (!list) {
          return contentNodes;
        }

        for (let i = 0; i < list.length; i += 1) {
          const node = contentForNode(list.item(i));
          if (node) {
            contentNodes.push(node);
          }
        }
        return contentNodes;
      }

      // Flattens a content tree: appends the node itself when it has text,
      // then recurses into its children.
      const contentWithPath = function (node: ContentNode, accumulator: ContentNode[]) {
        if (node.textContent && node.textContent.length > 0) {
          accumulator.push({ localName: node.localName, textContent: node.textContent });
        }
        if (node.children) {
          for (let i = 0; i < node.children.length; i += 1) {
            contentWithPath(node.children[i], accumulator);
          }
        }
      };

      // Converts a single DOM node (and, recursively, its children) into a
      // content node, or returns null when the node should be skipped.
      function contentForNode(node: Node | null | undefined) {
        if (!node) {
          return null;
        }

        const name = node.nodeName.toLowerCase();
        // Check if node is banned. A banned node also hides its subtree,
        // because its children are never visited.
        if (name && BANNED_LOCAL_NAMES[name] === true) {
          return null;
        }
        // Shallow clone node, as we are only interested in the textContent
        // of the node, and not the child nodes.
        const cloned = node.cloneNode();
        const localName = name;
        // Handle elements of different types
        if (cloned instanceof HTMLAnchorElement) {
          // anchor element
          // Ensure that it has reasonable href content
          // NOTE(review): `href` here is the property, which presumably holds
          // the resolved absolute URL, so the '#' comparison may never match.
          // Confirm whether the raw attribute value was intended.
          const href = cloned.href;
          if (href.length <= 0 || href === '#') {
            return null;
          }
        }
        const textContent = cloned.textContent;
        const children = contentForNodeList(node.childNodes);
        return {
          localName: localName,
          textContent: textContent,
          children: children
        };
      }
      const body = document.querySelector('body');
      const nodes: ContentNode[] =
        body == null ? [] : contentForNodeList(body.childNodes);

      // Accumulate nodes with content
      const accumulator: ContentNode[] = [];
      for (let i = 0; i < nodes.length; i += 1) {
        const node = nodes[i];
        contentWithPath(node, accumulator);
      }
      return accumulator;
    });
    return content;
  } finally {
    // Close the browser on every path so a failed navigation or evaluation
    // does not leave a browser process behind.
    await browser.close();
  }
}
184