/*
 * Copyright 2019 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import { Request, Response } from 'express';
import puppeteer = require('puppeteer');

import { log } from './logger';
import { ContentNode } from './types';
import { PlainTextFormatter } from './plain_text_formatter';
import { transformUrl } from './url-transforms';

/** Command-line flags passed to Chrome when Puppeteer launches it. */
const CHROME_LAUNCH_ARGS = ['--enable-dom-distiller'];

/**
 * Lookup table of DOM node names that are usually not useful in the context
 * of fetching text content from the page.
 */
type BannedNames = { [key: string]: true };

/**
 * Handles the actual license request.
 *
 * Reads `url` from the request body, renders the page, and replies with the
 * extracted content formatted as plain text:
 *  - 200 with the text on success,
 *  - 400 `URL required` when no usable `url` was supplied,
 *  - 400 `Invalid request.` when the protocol is not http(s),
 *  - 400 `Something bad happened. Check the logs` on any other failure.
 *
 * @param request The incoming request; `request.body.url` is the page to fetch.
 * @param response The response used to send the result or the error.
 */
export async function handleRequest(request: Request, response: Response): Promise<void> {
  // Guard against a missing body so that the property access below cannot
  // throw outside of the try block and leave the request unanswered.
  const url: unknown = request.body ? request.body.url : undefined;
  if (typeof url !== 'string' || url.length === 0) {
    response.status(400).send('URL required');
    return;
  }

  try {
    log(`Handling license request for ${url}`);
    if (!isValidProtocol(url)) {
      response.status(400).send('Invalid request.');
      return;
    }
    const nodes = await handleLicenseRequest(url);
    const content = PlainTextFormatter.plainTextFor(nodes);
    response.status(200).send(content);
  } catch (error) {
    // Also reached when `url` cannot be parsed by `isValidProtocol`.
    log('Error handling license request ', error);
    response.status(400).send('Something bad happened. Check the logs');
  }
}

/**
 * Validates the protocol. Only allows `https?` requests.
 * @param requestUrl The request url
 * @return `true` if the protocol is valid.
 * @throws If `requestUrl` cannot be parsed by the `URL` constructor.
 */
function isValidProtocol(requestUrl: string): boolean {
  const protocol = new URL(requestUrl).protocol;
  if (protocol === 'https:' || protocol === 'http:') {
    return true;
  }
  log(`Invalid protocol ${protocol}`);
  return false;
}

/**
 * Loads the (possibly transformed) url in Chrome and collects the text
 * content of the page body.
 *
 * @param url The url of the page to extract content from.
 * @param enableLocalDebugging When `true`, opens devtools and forwards the
 *     page's console messages to the log.
 * @return A flat list of the nodes that have non-empty text content, in the
 *     order produced by a depth-first walk of `<body>`.
 */
async function handleLicenseRequest(
  url: string,
  enableLocalDebugging: boolean = false
): Promise<ContentNode[]> {
  const transformed = transformUrl(url);
  if (url !== transformed) {
    log(`Transformed request url to ${transformed}`);
  }

  const browser = await puppeteer.launch({
    args: CHROME_LAUNCH_ARGS,
    devtools: enableLocalDebugging,
    // https://developer.chrome.com/articles/new-headless/
    headless: true
  });

  try {
    const page = await browser.newPage();
    if (enableLocalDebugging) {
      page.on('console', (message) => {
        log(`Puppeteer: ${message.text()}`);
      });
    }
    await page.goto(transformed, { waitUntil: 'domcontentloaded' });

    // NOTE: the callback below is evaluated inside the page, so it must be
    // self-contained and cannot reference values from this module's scope.
    const content = await page.evaluate(() => {
      // A map of banned nodes
      const BANNED_LOCAL_NAMES: BannedNames = {
        'button': true,
        'canvas': true,
        'footer': true,
        'header': true,
        'code': true,
        'img': true,
        'nav': true,
        'script': true,
        'style': true,
        'svg': true,
      };

      // Converts every node of a NodeList, skipping the rejected ones.
      function contentForNodeList(list: NodeList | null | undefined): ContentNode[] {
        const contentNodes: ContentNode[] = [];
        if (!list) {
          return contentNodes;
        }
        for (let i = 0; i < list.length; i += 1) {
          const node = contentForNode(list.item(i));
          if (node) {
            contentNodes.push(node);
          }
        }
        return contentNodes;
      }

      // Flattens a content tree (pre-order) into `accumulator`, keeping only
      // the nodes that carry text of their own.
      const contentWithPath = function (node: ContentNode, accumulator: ContentNode[]) {
        if (node.textContent && node.textContent.length > 0) {
          accumulator.push({
            localName: node.localName,
            textContent: node.textContent
          });
        }
        if (node.children) {
          for (let i = 0; i < node.children.length; i += 1) {
            contentWithPath(node.children[i], accumulator);
          }
        }
      };

      // Converts a single DOM node (and, recursively, its children) into a
      // content node. Returns `null` for nodes that should be ignored.
      function contentForNode(node: Node | null | undefined) {
        if (!node) {
          return null;
        }
        const name = node.nodeName.toLowerCase();
        // Check if node is banned.
        if (name && BANNED_LOCAL_NAMES[name] === true) {
          return null;
        }
        // Shallow clone node, as we are only interested in the textContent
        // of the node, and not the child nodes.
        const cloned = node.cloneNode();
        // Handle elements of different types
        if (cloned instanceof HTMLAnchorElement) {
          // Ensure that the anchor has reasonable href content. The raw
          // attribute is inspected because the `href` property is resolved
          // against the document URL and therefore never equals '#'.
          const href = cloned.getAttribute('href');
          if (href === null || href.length === 0 || href === '#') {
            return null;
          }
        }
        return {
          localName: name,
          textContent: cloned.textContent,
          children: contentForNodeList(node.childNodes)
        };
      }

      const body = document.querySelector('body');
      const nodes: ContentNode[] = body == null ? [] : contentForNodeList(body.childNodes);
      // Accumulate nodes with content
      const accumulator: ContentNode[] = [];
      for (let i = 0; i < nodes.length; i += 1) {
        contentWithPath(nodes[i], accumulator);
      }
      return accumulator;
    });

    return content;
  } finally {
    // Always shut Chrome down, including when navigation or evaluation fails;
    // otherwise every failed request leaks a browser process.
    await browser.close();
  }
}