# offlinify_dackka_docs.py - OpenGrok cross reference for /development/offlinifyDocs/offlinify_dackka_docs.py

#!/usr/bin/env python3

from argparse import ArgumentParser
import os
from pathlib import Path
from re import search
from shutil import copyfile, rmtree
from sys import exit

"""
Script which takes in Dackka docs and produces offline docs with CSS and relative links.
Run `python3 offlinify_dackka_docs.py --help` for argument descriptions.
"""

# Absolute path of the directory that contains this script.
SCRIPT_PATH = Path(__file__).parent.absolute()

# Docs build directory, expressed relative to SCRIPT_PATH.
REL_PATH_TO_DOCS = '../../../../out/androidx/docs-tip-of-tree/build'
_BUILD_DIR = os.path.join(SCRIPT_PATH, REL_PATH_TO_DOCS)

# Paths used when --input / --output are not passed on the command line.
DEFAULT_INPUT = os.path.abspath(os.path.join(_BUILD_DIR, 'docs'))
DEFAULT_OUTPUT = os.path.abspath(os.path.join(_BUILD_DIR, 'offlineDocs'))

# Sub-path of the docs root under which a specific library's docs are looked up.
REL_PATH_TO_LIBRARIES = 'reference/kotlin/androidx'

# Stylesheet that is copied from next to this script into the root of the output directory.
STYLE_FILENAME = 'style.css'
CSS_SOURCE_PATH = os.path.join(SCRIPT_PATH, STYLE_FILENAME)

# Prefix for links that are not rewritten to a local relative path.
PUBLISHED_DOCS_URL = 'https://developer.android.com'

# Index pages that are additionally converted (and filtered) when a single library is requested.
INDEX_PAGES = ['classes.html', 'packages.html']

"""
Check environment and args, then create offline docs.
"""
def main():
  """Script entry point.

  Runs the environment check, parses the command line, validates the three arguments (input,
  output, library, in that order) and then converts the docs.
  """
  check_env()

  cli = parse_args()
  docs_dir = check_input_path(cli.input)
  offline_dir = check_output_path(cli.output)
  # None when no --library was given, otherwise the library path relative to the docs root.
  library_subpath = check_library(cli.library, docs_dir, offline_dir)

  process_input(docs_dir, offline_dir, library_subpath)

"""
Error early if any system setup is missing.
"""
def check_env():
  """Exit with an explanatory message when the `bs4` module cannot be imported.

  Returns None when the import succeeds. Otherwise prints install guidance and exits the
  process with status -1.
  """
  try:
    # Imported only to probe availability; the name is not used in this function.
    from bs4 import BeautifulSoup  # noqa: F401
  except ModuleNotFoundError:
    print('ERROR: This script requires beautifulsoup module `bs4` to run.')
    print('Please install with pip or another package manager.')
    exit(-1)

"""
Parses script args.
"""
def parse_args():
  """Build the argument parser and parse the process command line.

  Returns:
    The argparse namespace with `input`, `output` and `library` attributes; each one is None
    when its flag was not passed.
  """
  parser = ArgumentParser(description=(
    'Converts Dackka docs to an offline version by adding CSS, fixing links, and '
    'removing book.yaml templating.'
  ))

  # Every flag is an optional string, so only the name and help text vary. Kept as an ordered
  # tuple so that --help lists the options in this order.
  options = (
    ('--input', (
      'Path to generated Dackka docs. This directory is expected to contain a `reference` '
      f'subdirectory. If no path is provided, {DEFAULT_INPUT} is used by default.'
    )),
    ('--output', (
      'Path to store output offline docs. If a directory already exists at this path, it will'
      f' be deleted. If no path is provided, {DEFAULT_OUTPUT} is used by default.'
    )),
    ('--library', (
      'Specific androidx library to convert docs for. Docs for this library are expected to be'
      f' in a subdirectory of `{REL_PATH_TO_LIBRARIES}` within the input path. '
      'If no library is provided, docs for all libraries are converted to offline mode.'
    )),
  )
  for flag, help_text in options:
    parser.add_argument(flag, required=False, help=help_text)

  return parser.parse_args()

"""
Verify the provided input arg is a valid directory.
"""
def check_input_path(path):
  """Resolve and validate the directory holding the generated docs.

  Args:
    path: Value of the --input argument, or None to use DEFAULT_INPUT.

  Returns:
    DEFAULT_INPUT when `path` is None, otherwise the normalized `path`.

  Exits the process with status -1 (after printing an error) when the default path does not
  exist, or when the provided path does not exist or is not a directory.
  """
  if path is None:
    if not os.path.exists(DEFAULT_INPUT):
      print(f'ERROR: Default input path `{DEFAULT_INPUT}` does not exist. Generate docs by running')
      print('    ./gradlew docs')
      exit(-1)
    return DEFAULT_INPUT

  path = os.path.normpath(path)
  if not os.path.exists(path):
    print(f'ERROR: Provided input path `{path}` does not exist.')
    exit(-1)

  if not os.path.isdir(path):
    # The closing backtick after the path was missing, leaving the message's quoting unbalanced.
    print(f'ERROR: Provided input path `{path}` does not point to a directory.')
    exit(-1)

  return path

"""
Verifies the output arg by creating a directory at the path, removing existing directory if needed.
"""
def check_output_path(path):
  """Prepare an empty output directory and return its path.

  Args:
    path: Value of the --output argument, or None to use DEFAULT_OUTPUT.

  Returns:
    The path of the freshly created directory.

  An existing directory at the path is deleted first. If the path exists but is not a
  directory, an error is printed and the process exits with status -1.
  """
  target = DEFAULT_OUTPUT if path is None else path

  if os.path.exists(target):
    if not os.path.isdir(target):
      print(f'ERROR: output path {target} exists but is not a directory.')
      exit(-1)
    print(f'Removing existing directory at output path {target}')
    rmtree(target)

  os.makedirs(target)
  return target

"""
Verify the library arg by ensuring the input docs directory exists and making output directories.
"""
def check_library(library, input_path, output_path):
  """Validate the --library argument and create its output directory.

  Args:
    library: Library name from the command line, or None when all libraries are converted.
    input_path: Root directory of the generated docs.
    output_path: Root directory of the offline docs.

  Returns:
    None when `library` is None; otherwise the library's docs path relative to the docs root
    (REL_PATH_TO_LIBRARIES joined with `library`).

  Exits the process with status -1 when the library's docs directory is not found under
  `input_path`.
  """
  if library is None:
    return None

  library_subpath = os.path.join(REL_PATH_TO_LIBRARIES, library)
  library_docs_dir = os.path.join(input_path, library_subpath)

  if os.path.exists(library_docs_dir):
    # Mirror the library directory (and its parents) under the output root.
    os.makedirs(os.path.join(output_path, library_subpath))
    return library_subpath

  print(f'ERROR: Docs directory for library {library} could not be found at')
  print(f'    {library_docs_dir}')
  exit(-1)

"""
Fixes all HTML files in the input directory (or just the pages for the library if a specific one is
provided) and puts the new pages in the output directory.
"""
def process_input(input_path, output_path, library):
  """Convert the HTML pages under the input directory and write them to the output directory.

  Args:
    input_path: Root directory of the generated docs.
    output_path: Root directory for the offline docs; must already exist.
    library: Library docs path relative to `input_path`, or None to convert everything.

  The stylesheet is copied to the output root first. The walked directory structure is mirrored
  under `output_path`; only files with the `.html` extension are processed. When a library is
  given, the pages named in INDEX_PAGES are converted as well, if they exist.
  """
  stylesheet_dest = os.path.join(output_path, STYLE_FILENAME)
  copyfile(CSS_SOURCE_PATH, stylesheet_dest)

  # Walk only the library's subdirectory when one was requested, otherwise the whole input.
  if library is None:
    walk_root = input_path
  else:
    walk_root = os.path.join(input_path, library)

  for current_dir, subdir_names, file_names in os.walk(walk_root):
    mirrored_dir = os.path.join(output_path, os.path.relpath(current_dir, start=input_path))

    # Create the child directories now so they exist when os.walk descends into them.
    for subdir in subdir_names:
      os.mkdir(os.path.join(mirrored_dir, subdir))

    # Every page in this directory reaches the stylesheet through the same relative path.
    css_href = os.path.relpath(stylesheet_dest, start=mirrored_dir)
    for file_name in file_names:
      if os.path.splitext(file_name)[1] != '.html':
        continue
      source_page = os.path.join(current_dir, file_name)
      dest_page = os.path.join(mirrored_dir, file_name)
      fix_html_file(source_page, input_path, dest_page, css_href, library, False)

  if library is None:
    return

  # In addition to the library pages, copy over package and class index pages.
  index_output_dir = os.path.join(output_path, REL_PATH_TO_LIBRARIES)
  index_css_href = os.path.relpath(stylesheet_dest, start=index_output_dir)
  for index_name in INDEX_PAGES:
    source_page = os.path.join(input_path, REL_PATH_TO_LIBRARIES, index_name)
    if not os.path.exists(source_page):
      continue
    dest_page = os.path.join(index_output_dir, index_name)
    fix_html_file(source_page, input_path, dest_page, index_css_href, library, True)

"""
Performs all fixes to the input HTML file and saves the resulting HTML at the output path.
"""
def fix_html_file(file_path, root_input_path, output_file_path, css_path, library, index_page):
  """Apply all offline fixes to one HTML file and write the result.

  Args:
    file_path: Path of the HTML file to read.
    root_input_path: Root directory of the generated docs, used to resolve links.
    output_file_path: Path the fixed HTML is written to.
    css_path: Stylesheet path, relative to the output file's directory.
    library: Library docs path relative to the docs root, or None when converting everything.
    index_page: True when the file is an index page whose rows must be filtered to `library`.
  """
  from bs4 import BeautifulSoup

  # Pin the encoding so reading and writing do not depend on the locale's default.
  # NOTE(review): assumes the generated docs are UTF-8 -- confirm against the docs generator.
  with open(file_path, 'r', encoding='utf-8') as f:
    parsed = BeautifulSoup(f, 'html.parser')

  if index_page:
    filter_index(parsed, library)

  remove_book_template_strings(parsed)
  add_css(parsed, css_path)
  fix_links(parsed, file_path, root_input_path, library)

  with open(output_file_path, 'w', encoding='utf-8') as f:
    f.write(str(parsed))

"""
Removes template strings containing book.yaml information for DAC.
"""
def remove_book_template_strings(page):
  """Remove every direct child of `<head>` whose text contains a `{% ... %}` template string.

  Args:
    page: Parsed document; modified in place. A page without a `<head>` is left untouched.
  """
  head = page.head
  if head is None:
    return

  # page.find_all wasn't working here because the template strings are not within HTML tags.
  # Iterate over a snapshot: extract() removes the element from head.contents, and mutating
  # that list while iterating over it would skip the element following each removal.
  for element in list(head.contents):
    if search(r'{%.*%}', element.text):
      element.extract()

"""
Replace any CSS links with a correct link.
"""
def add_css(page, relative_css):
  """Make `relative_css` the page's only stylesheet.

  Args:
    page: Parsed document; modified in place.
    relative_css: Value for the `href` of the new stylesheet link.
  """
  # Drop every stylesheet link that is already on the page.
  stale_links = page.find_all('link', rel='stylesheet')
  for stale in stale_links:
    stale.extract()

  # Append a single link pointing at the given stylesheet to the end of <head>.
  page.head.append(page.new_tag('link', rel='stylesheet', href=relative_css))

"""
Convert links to other pages in the generated docs into relative paths to work offline.
If docs are being converted for just one library, links for docs outside the library are converted
to a link to the published version.
"""
def fix_links(page, page_path, root_input_path, library):
  """Rewrite the page's site-absolute links so they work offline.

  Args:
    page: Parsed document; modified in place.
    page_path: Path of the input file that `page` was parsed from.
    root_input_path: Root directory of the generated docs.
    library: Library docs path relative to the docs root, or None when converting everything.

  Only `href` values that start with `/` are touched. Such a link becomes a path relative to the
  page when it should be linked locally and its directory exists under `root_input_path`;
  otherwise it is pointed at the published docs.
  """
  page_dir = os.path.dirname(page_path)

  for anchor in page.find_all('a'):
    href = anchor.get('href')
    if href is None or not href.startswith('/'):
      continue

    site_relative = href.lstrip('/')

    # Default: the link isn't in this library or doesn't exist locally, use the published page.
    new_href = PUBLISHED_DOCS_URL + href

    if page_should_be_linked(site_relative, library):
      local_target = os.path.join(root_input_path, site_relative)
      # Make sure the link will work -- this uses the directory because the basename of the path
      # might end with something like `Class.html#function`
      if os.path.exists(os.path.dirname(local_target)):
        new_href = os.path.relpath(local_target, start=page_dir)

    anchor['href'] = new_href

"""
Determines whether to link to the local version of the page at path.
"""
def page_should_be_linked(path, library):
  """Decide whether a link should point at the local copy of a page.

  Args:
    path: Link target relative to the docs root (no leading `/`).
    library: Library docs path relative to the docs root, or None when converting everything.

  Returns:
    True when the local page should be linked, False otherwise.
  """
  # All library docs are generated, so all pages are linked.
  if library is None:
    return True

  # The index pages are the only ones outside of the library dir that will exist.
  is_index_page = os.path.basename(path) in INDEX_PAGES

  # Otherwise the page must be inside the library dir.
  return is_index_page or os.path.commonpath([library, path]) == library

"""
For the class and package index pages, removes all rows which link outside the library.
"""
def filter_index(page, library):
  """Remove the table rows of an index page whose link points outside the library.

  Args:
    page: Parsed index page; modified in place.
    library: Library docs path relative to the docs root.

  Rows that contain no link, or whose first link has no `href`, are kept.
  """
  for row in page.find_all('tr'):
    anchor = row.a
    if anchor is None:
      continue

    link = anchor.get('href')
    # Check for a missing href before using it; calling lstrip on None raises AttributeError.
    if link is None:
      continue

    if os.path.commonpath([link.lstrip('/'), library]) != library:
      row.extract()

# Run the conversion only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
  main()