# Copyright 2018 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import re
import subprocess
import threading
import time

# A path of the log file
_PATH_LOG_FILE = '/var/log/messages'
# Time margin as a number of seconds. It is used during parsing of the log file.
# Sometimes, logs are not in strict datetime order. This value defines maximum
# possible datetime error in logs sequence.
_TIME_MARGIN = 2
# Max time in seconds to wait for CUPS logs to appear in the log file.
_TIME_MAX_WAIT = 100
# A list of filters that are installed by loading components.
# Currently, they are Star and Epson drivers.
_COMPONENT_FILTERS = ['rastertostar', 'rastertostarlm', 'epson-escpr-wrapper']


class LogReader():
    """
    An instance of this class is responsible for gathering data about CUPS jobs
    and extracting their results and pipelines by parsing the main log file. It
    can check for internal errors during a job execution and build a bash script
    simulating an executed pipeline. CUPS server must have logging level set to
    'debug' to make a pipeline extraction possible.

    """

    def _read_last_lines(self, lines_count):
        """
        The function reads given number of lines from the end of the log file.
        The last line is omitted if it is not ended with EOL character. If a
        whole file was read, count of returned lines may be smaller than
        requested number. Always at least one line is returned; if the file has
        no EOL characters an exception is raised.

        @param lines_count: a number of lines to read

        @returns a list of lines (as strings)

        @raises Exception if the log file contains no complete lines

        """
        assert lines_count > 0
        # read one extra line so that a possibly incomplete first line
        # returned by tail can be dropped below
        argv = ['tail', '-n', '%d' % (lines_count+1), _PATH_LOG_FILE]
        p1 = subprocess.Popen(argv, stdout=subprocess.PIPE)
        out, err = p1.communicate()

        # It is possible for invalid UTF-8 to appear in the system log
        # (e.g. null bytes on unclean poweroff), but this doesn't
        # concern us, so we elect to ignore it.
        lines = out.decode(errors="ignore").split('\n')
        # drop the element after the final EOL: either an empty string (file
        # ended with a newline) or an unterminated trailing line
        lines.pop()
        # FIX: this check was previously nested under the trimming branch
        # below and therefore unreachable; an empty file (or one with no EOL
        # characters) silently returned an empty list instead of raising, as
        # the docstring promises.
        if len(lines) == 0:
            raise Exception('The file %s is empty' % _PATH_LOG_FILE)
        if len(lines) > lines_count:
            lines = lines[1:]
        return lines


    def _get_datetime_from_line(self, line):
        """
        Parse datetime from the given line read from the log file.

        @param line: a single line from the log file; it must start with a
                timestamp like "2018-06-01T12:30:45." (first 20 characters)

        @returns a datetime as a number of seconds from the epoch (float)

        """
        # the first 20 characters cover the date, 'T', the time and the dot
        # that precedes the fractional seconds
        return time.mktime(time.strptime(line[:20], "%Y-%m-%dT%H:%M:%S."))


    def __init__(self):
        """
        Constructor.

        """
        # the last line read from the log file; used as a bookmark so that
        # subsequent parses only process newer lines
        self._last_line_read = self._read_last_lines(1)[0]
        # logs obtained from the log file after preprocessing
        # key = a tuple (CUPS pid, job id)
        # value = list with logs
        self._logs = dict()
        # data extracted from logs
        # key = a tuple (CUPS pid, job id)
        # value = a tuple (result(bool),envp(dict),argv(list),filters(list))
        self._results = dict()
        # mapping from printer's name to a list of CUPS jobs
        # key = printer name
        # value = a list of tuples (CUPS pid, job id)
        self._printer_name_to_key = dict()
        # a lock for synchronization of access to the dictionaries above
        self._critical_section = threading.Lock()


    def _parse_logs(self):
        """
        This method reads and parses new lines from the log file (all complete
        lines after the line saved in self._last_line_read). The lines are
        filtered and saved in self._logs. Dictionaries self._results and
        self._printer_name_to_key are updated accordingly.

        @raises Exception if the previously read line cannot be located in
                the log file anymore (e.g. the log has been rotated/erased)

        """
        # ==============================================
        # try to read lines from the end of the log file until the line from
        # self._last_line_read is found
        dt_last_line_read = self._get_datetime_from_line(self._last_line_read)
        lines_count = 1024
        pos_last_line_read = None
        while True:
            lines = self._read_last_lines(lines_count)
            # search for self._last_line_read in lines read from the file
            for index, line in enumerate(lines):
                dt_line = self._get_datetime_from_line(line)
                if dt_line > dt_last_line_read + _TIME_MARGIN:
                    # this line is already newer than the searched one (even
                    # allowing for _TIME_MARGIN of clock skew in the log), so
                    # the searched line cannot occur later - exit the loop
                    break
                if line == self._last_line_read:
                    # the line was found, save its index and exit the loop
                    pos_last_line_read = index
                    break
            # if the self._last_line_read was found, exit the 'while' loop
            if pos_last_line_read is not None:
                break
            # check if we can try to read more lines from the file to repeat
            # the search, if not then throw an exception
            if ( len(lines) < lines_count or
                    self._get_datetime_from_line(lines[0]) +
                    _TIME_MARGIN < dt_last_line_read ):
                raise Exception('Cannot find the last read line in the log'
                        ' file (maybe it has been erased?)')
            # increase the number of lines to read
            lines_count = lines_count * 2
        # check if there are any new lines, exit if not
        if pos_last_line_read + 1 == len(lines):
            return
        # update the field with the last line read from the log file
        self._last_line_read = lines[-1]
        # ==============================================
        # parse logs, select these ones from CUPS jobs and save them to
        # self._logs
        updated_jobs = set()
        for line in lines[pos_last_line_read+1:]:
            # expected format:
            # "<datetime> <host> cupsd[<pid>]: [Job <id>] <message>"
            tokens = line.split(' ', 5)
            if ( len(tokens) < 6 or tokens[2][:6] != 'cupsd[' or
                    tokens[3] != '[Job' ):
                continue
            pid = int(tokens[2][6:-2])
            job_id = int(tokens[4][:-1])
            message = tokens[5]
            key = (pid, job_id)
            if key not in self._logs:
                self._logs[key] = []
            self._logs[key].append(message)
            updated_jobs.add(key)
        # ==============================================
        # try to interpret logs
        for key in updated_jobs:
            number_of_processes_to_check = None
            printer_name = None
            result = True
            envp = dict()
            argv = []
            filters = []
            for msg in self._logs[key]:
                if number_of_processes_to_check is None:
                    # phase 1: wait for the message with the number of
                    # filters; one more process (the backend) is also checked
                    regexp = re.match(r'(\d) filters for job:', msg)
                    if regexp is not None:
                        number_of_processes_to_check = int(regexp.group(1)) + 1
                        assert number_of_processes_to_check > 0
                elif printer_name is None:
                    # phase 2: collect filter names until the final target
                    # ("printer/<name>") reveals the printer's name
                    regexp = re.match(r'(\S+) \(\S+ to (\S+), cost \d+\)', msg)
                    if regexp is not None:
                        fltr = regexp.group(1)
                        if fltr == '-':
                            # '-' means a skipped filter; one process less
                            number_of_processes_to_check -= 1
                        else:
                            filters.append(fltr)
                        trg = regexp.group(2)
                        if trg.startswith('printer/') and '/' not in trg[8:]:
                            printer_name = trg[8:]
                else:
                    # phase 3: collect envp/argv entries and the exit status
                    # of every filter/backend process
                    regexp = re.match(r'envp\[\d+\]="(\S+)=(\S+)"', msg)
                    if regexp is not None:
                        envp[regexp.group(1)] = regexp.group(2)
                        continue
                    regexp = re.match(r'argv\[(\d+)\]="(.+)"', msg)
                    if regexp is not None:
                        # argv entries must arrive in order, without gaps
                        if len(argv) != int(regexp.group(1)):
                            raise Exception('Error when parsing argv for %s'
                                    % printer_name)
                        argv.append(regexp.group(2))
                        continue
                    if number_of_processes_to_check > 0:
                        regexp = re.match(r'PID \d+ \(\S+\) ', msg)
                        if regexp is not None:
                            number_of_processes_to_check -= 1
                            if ( not msg.endswith(' exited with no errors.') and
                                    'was terminated normally with signal'
                                    not in msg ):
                                result = False
            # the job is complete only when every process has been checked
            if number_of_processes_to_check == 0:
                # if it is a new job, update self._printer_name_to_key
                if key not in self._results:
                    if printer_name not in self._printer_name_to_key:
                        self._printer_name_to_key[printer_name] = [key]
                    else:
                        self._printer_name_to_key[printer_name].append(key)
                # update results
                self._results[key] = (result, envp, argv, filters)


    def _build_pipeline(
            self, printer_name, envp, argv, filters,
            path_ppd, path_doc, path_workdir):
        """
        This function tries to build a bash script containing a pipeline used
        internally by CUPS. Each filter's stderr is saved to "<i>.err" and its
        output is compressed to "<i>.doc.gz" in the current directory.

        @param printer_name: a printer's name
        @param envp: a dictionary with environment variables for the pipeline;
                PPD, HOME and TMPDIR entries are overwritten in place
        @param argv: a list with command line parameters for filters; the last
                element (an input file) is removed in place
        @param filters: a list of filters; the last element may be replaced in
                place by a full path for component-based drivers
        @param path_ppd: a path to PPD file (string)
        @param path_doc: a path to input document (string)
        @param path_workdir: a path to a directory, that should be used as home
                directory and temporary location by filters

        @returns a pipeline as a bash script (string)

        """
        # update paths set in environment variables
        envp['PPD'] = path_ppd
        envp['HOME'] = path_workdir
        envp['TMPDIR'] = path_workdir
        # drop the last argument, it is an input file
        argv.pop()
        # replace filter names by full path for Star and Epson drivers
        if filters[-1] in _COMPONENT_FILTERS:
            find_cmd = ['find', '/run/imageloader/', '-type', 'f', '-name']
            find_cmd.append(filters[-1])
            # FIX: check_output() returns bytes on Python 3; it must be
            # decoded before being embedded into the script with %s, otherwise
            # the command would contain a literal "b'...'" representation
            filters[-1] = subprocess.check_output(find_cmd).decode().rstrip()
        # build and return the script
        script = '#!/bin/bash\nset -e\nset -o pipefail\n'
        for name, value in envp.items():
            script += ('export %s=%s\n' % (name, value))
        for ind, filt in enumerate(filters):
            # every filter but the first reads the previous filter's output
            if ind > 0:
                script += ('gzip -dc %d.doc.gz | ' % ind)
            # 'exec -a' sets argv[0] as CUPS does for its filters
            script += ('(exec -a "%s" %s' % (argv[0], filt))
            for arg in argv[1:]:
                script += (' "%s"' % arg)
            if ind == 0:
                script += (' "%s"' % path_doc)
            script += (') 2>%d.err | gzip -9 >%d.doc.gz\n' % (ind+1, ind+1))
        return script


    def extract_result(
            self, printer_name,
            path_ppd=None, path_doc=None, path_workdir=None):
        """
        This function extracts from the log file CUPS logs related to printer
        with given name. Extracted logs are also interpreted, the function
        checks for any failures. If all optional parameters are set, a pipeline
        used by CUPS is extracted from the logs. If some of required results
        cannot be extracted, an exception is raised. This function is
        thread-safe, it can be called simultaneously by many Python threads.
        However, all simultaneous calls must have different values of the
        printer_name parameter (calls with the same printer_name must be done
        sequentially).

        @param printer_name: a printer's name
        @param path_ppd: a path to PPD file (string), required for pipeline
                extraction only
        @param path_doc: a path to input document (string), required for
                pipeline extraction only
        @param path_workdir: a path to a directory, that should be used as home
                directory and temporary location by filters, required for
                pipeline extraction only

        @returns a tuple of three variables:
                1. True if no errors were detected, False otherwise
                2. logs extracted from the log file (string)
                3. pipeline as a bash script (string), None if not requested

        @raises Exception in case of any errors

        """
        # time when we started efforts to get logs
        time_start = time.time()
        # all three parameters must be set to extract a pipeline
        pipeline_required = (path_ppd is not None and path_doc is not None
                and path_workdir is not None)
        try:
            # try to get and parse logs
            while True:
                with self._critical_section:
                    # leave the loop if we have everything
                    if printer_name in self._printer_name_to_key:
                        if not pipeline_required:
                            break
                        key = self._printer_name_to_key[printer_name][0]
                        envp = self._results[key][1]
                        argv = self._results[key][2]
                        if len(envp) > 0 and len(argv) > 0:
                            break
                    # save current time and try to parse the log file
                    time_last_read = time.time()
                    self._parse_logs()
                    # leave the loop if we have everything
                    if printer_name in self._printer_name_to_key:
                        if not pipeline_required:
                            break
                        key = self._printer_name_to_key[printer_name][0]
                        envp = self._results[key][1]
                        argv = self._results[key][2]
                        if len(envp) > 0 and len(argv) > 0:
                            break
                # raise an exception if we have been waiting too long
                if time_last_read - time_start > _TIME_MAX_WAIT:
                    raise Exception('Cannot parse logs for %s or they did not'
                            ' appear in %s within %d seconds from printing the'
                            ' document' % (printer_name, _PATH_LOG_FILE,
                            _TIME_MAX_WAIT))
                # wait 1 second and try again to parse acquired logs
                time.sleep(1)
            # retrieve required data
            with self._critical_section:
                if pipeline_required:
                    # key, envp, argv are already set in the loop above
                    filters = self._results[key][3]
                else:
                    key = self._printer_name_to_key[printer_name][0]
                no_errors = self._results[key][0]
                logs = '\n'.join(self._logs[key])
            # build a pipeline script if required
            if pipeline_required:
                pipeline = self._build_pipeline(printer_name, envp, argv,
                        filters, path_ppd, path_doc, path_workdir)
            else:
                pipeline = None
            # return results
            return no_errors, logs, pipeline
        finally:
            # cleanup: drop this job's data; calls with the same printer_name
            # are sequential (see docstring), so the membership test is safe
            if printer_name in self._printer_name_to_key:
                with self._critical_section:
                    key = self._printer_name_to_key[printer_name].pop(0)
                    if len(self._printer_name_to_key[printer_name]) == 0:
                        self._printer_name_to_key.pop(printer_name)
                    self._logs.pop(key)
                    self._results.pop(key)