/*
 * Copyright 2019 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import { Request, Response } from 'express';
import puppeteer = require('puppeteer');
import { log } from './logger';
import { ContentNode } from './types';
import { PlainTextFormatter } from './plain_text_formatter';
import { transformUrl } from './url-transforms';

// Command-line flags passed to Chrome when it is launched by Puppeteer.
const CHROME_LAUNCH_ARGS = ['--enable-dom-distiller'];

// A lookup of DOM node names that are usually not useful in the context
// of fetching text content from the page.
type BannedNames = {
  [key: string]: true
};

/**
 * Handles the actual license request.
 *
 * Reads the target url from `request.body.url` and answers with:
 *  - `400 'URL required'` when no url was supplied,
 *  - `400 'Invalid request.'` when the url fails protocol validation,
 *  - `200` and the plain-text rendering of the page content on success,
 *  - `400 'Something bad happened. Check the logs'` when fetching or
 *    formatting the content throws.
 *
 * @param request The incoming express request.
 * @param response The express response used to send the result.
 */
export async function handleRequest(request: Request, response: Response) {
  const url = request.body.url;
  if (!url) {
    response.status(400).send('URL required');
    return;
  }

  try {
    log(`Handling license request for ${url}`);
    if (!isValidProtocol(url)) {
      response.status(400).send('Invalid request.');
      return;
    }

    const nodes = await handleLicenseRequest(url);
    const content = PlainTextFormatter.plainTextFor(nodes);
    response.status(200).send(content);
  } catch (error) {
    log('Error handling license request ', error);
    response.status(400).send('Something bad happened. Check the logs');
  }
}

/**
 * Validates the protocol. Only allows `https?` requests.
 *
 * A value that cannot be parsed as a URL is treated as invalid rather than
 * letting the parse error propagate to the caller.
 *
 * @param requestUrl The request url
 * @return `true` if the protocol is valid.
 */
function isValidProtocol(requestUrl: string): boolean {
  let url: URL;
  try {
    url = new URL(requestUrl);
  } catch (error) {
    // `new URL` throws for input that is not an absolute, well-formed URL.
    log(`Unable to parse request url ${requestUrl} `, error);
    return false;
  }

  if (url.protocol === 'https:' || url.protocol === 'http:') {
    return true;
  }
  log(`Invalid protocol ${url.protocol}`);
  return false;
}

/**
 * Loads the (possibly transformed) url in a headless browser and extracts the
 * text-bearing nodes found under the document `body`.
 *
 * The browser is always closed before this function settles, including when
 * navigation or in-page evaluation fails.
 *
 * @param url The url to fetch content from.
 * @param enableLocalDebugging When `true`, opens devtools and forwards the
 *     page's console messages to the logger.
 * @return A flat list of nodes that have non-empty text content.
 */
async function handleLicenseRequest(url: string, enableLocalDebugging: boolean = false): Promise<ContentNode[]> {
  const transformed = transformUrl(url);
  if (url !== transformed) {
    log(`Transformed request url to ${transformed}`);
  }
  const browser = await puppeteer.launch({
    args: CHROME_LAUNCH_ARGS,
    devtools: enableLocalDebugging,
    // https://developer.chrome.com/articles/new-headless/
    headless: true
  });
  try {
    const page = await browser.newPage();
    if (enableLocalDebugging) {
      page.on('console', (message) => {
        log(`Puppeteer: ${message.text()}`);
      });
    }
    await page.goto(transformed, { waitUntil: 'domcontentloaded' });
    // The callback below runs inside the page, so it must be self-contained
    // and cannot reference values from this module's scope.
    return await page.evaluate(() => {
      // A map of banned nodes
      const BANNED_LOCAL_NAMES: BannedNames = {
        'button': true,
        'canvas': true,
        'footer': true,
        'header': true,
        'code': true,
        'img': true,
        'nav': true,
        'script': true,
        'style': true,
        'svg': true,
      };

      // Converts every node of a NodeList, dropping the ones that were
      // rejected by `contentForNode`.
      function contentForNodeList(list: NodeList | null | undefined): ContentNode[] {
        const contentNodes: ContentNode[] = [];
        if (!list) {
          return contentNodes;
        }

        for (let i = 0; i < list.length; i += 1) {
          const node = contentForNode(list.item(i));
          if (node) {
            contentNodes.push(node);
          }
        }
        return contentNodes;
      }

      // Walks the tree depth-first (parent before children) and collects
      // every node that carries non-empty text, without its children.
      const contentWithPath = function (node: ContentNode, accumulator: ContentNode[]) {
        if (node.textContent && node.textContent.length > 0) {
          accumulator.push({
            localName: node.localName,
            textContent: node.textContent
          });
        }
        if (node.children) {
          for (let i = 0; i < node.children.length; i += 1) {
            contentWithPath(node.children[i], accumulator);
          }
        }
      };

      // Converts a single DOM node (and, recursively, its children).
      // Returns null for missing nodes, banned nodes and rejected anchors,
      // which also discards everything beneath them.
      function contentForNode(node: Node | null | undefined) {
        if (!node) {
          return null;
        }

        const name = node.nodeName.toLowerCase();
        // Check if node is banned.
        if (name && BANNED_LOCAL_NAMES[name] === true) {
          return null;
        }
        // Shallow clone node, as we are only interested in the textContent
        // of the node, and not the child nodes.
        const cloned = node.cloneNode();
        const localName = name;
        // Handle elements of different types
        if (cloned instanceof HTMLAnchorElement) {
          // anchor element
          // Ensure that it has reasonable href content
          // NOTE(review): the `href` property is normally the resolved
          // absolute URL, so the `'#'` comparison may never match - confirm
          // whether `getAttribute('href')` was intended.
          const href = cloned.href;
          if (href.length <= 0 || href === '#') {
            return null;
          }
        }
        const textContent = cloned.textContent;
        const children = contentForNodeList(node.childNodes);
        return {
          localName: localName,
          textContent: textContent,
          children: children
        };
      }

      const body = document.querySelector('body');
      const nodes: ContentNode[] =
        body == null ? [] : contentForNodeList(body.childNodes);

      // Accumulate nodes with content
      const accumulator: ContentNode[] = [];
      for (let i = 0; i < nodes.length; i += 1) {
        const node = nodes[i];
        contentWithPath(node, accumulator);
      }
      return accumulator;
    });
  } finally {
    // Always release the Chrome process, even when navigation or
    // evaluation rejects.
    await browser.close();
  }
}