• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python2
2
3# Copyright 2017 The Chromium OS Authors. All rights reserved.
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
7"""Script to upload metrics from apache access logs to Monarch."""
8
9from __future__ import print_function
10
11import argparse
12import re
13import sys
14
15import common
16
17from chromite.lib import ts_mon_config
18from chromite.lib import metrics
19
20from autotest_lib.site_utils.stats import log_daemon_common
21
22
23"""
24The log format is set to:
25  %v:%p %h %l %u %t \"%r\" %>s %O \"%{Referer}i\" \"%{User-Agent}i\" %T
26
27These are documented as follows:
28  (from https://httpd.apache.org/docs/current/mod/mod_log_config.html)
29
30%h: Remote host
31%l: Remote logname (from identd, if supplied)
32%O: Bytes sent, including headers. May be zero in rare cases such as when a
33    request is aborted before a response is sent. You need to enable mod_logio
34    to use this.
35%p: The canonical Port of the server serving the request
36%r: First line of request
37%s: Status.  For requests that got internally redirected, this is
38    the status of the *original* request --- %...>s for the last.
39%t: Time, in common log format time format (standard english format)
40%T: The time taken to serve the request, in seconds.
41%u: Remote user (from auth; may be bogus if return status (%s) is 401)
42%v: The canonical ServerName of the server serving the request.
43"""
44
# Lemma: a regex to match sections delimited by double-quotes ("), which
# possibly contain escaped quotes (\").
# This works by matching non-quotes or the string r'\"' repeatedly; then it ends
# when finding a quote (") preceded by a character which is not a backslash.
# Note that the pattern consumes at least one character, so it does not match
# an empty quoted section ("").
MATCH_UNTIL_QUOTE = r'([^"]|\\")*[^\\]'

# Matches one line of the apache access log and exposes the interesting parts
# as named groups: request_method, endpoint, response_code, bytes_sent and
# response_seconds.
#
# request_method and endpoint are None when the request line is just "-".
# response_seconds is None when the line carries no trailing %T value.
#
# The response_seconds quantifier must be greedy: a lazy r'\d+?' followed by
# r'.*' stops after the first digit, so "12" would be reported as "1".
ACCESS_MATCHER = re.compile(
    r'^'
    r'\S+ \S+ \S+ \S+ '               # Ignore %v:%p %h %l %u
    r'\[[^]]+\] '                     # Ignore %t
    r'"('                             # Begin %r
    r'(?P<request_method>\S+) '       # e.g. POST
    r'(?P<endpoint>\S+)'              # e.g. /afe/server/noauth/rpc/
    + MATCH_UNTIL_QUOTE +             # Ignore protocol (e.g. HTTP/1.1)
    r'|-'                             # The request data might just be "-"
    r')" '                            # End %r
    r'(?P<response_code>\d+) '        # %>s (e.g. 200)
    r'(?P<bytes_sent>\d+)'            # %O
    r' "' + MATCH_UNTIL_QUOTE + '"'   # Ignore Referer
    r' "' + MATCH_UNTIL_QUOTE + '"'   # Ignore User-Agent
    r' ?(?P<response_seconds>\d+)?'   # %T: whole seconds, optional
    r'.*'                             # Allow adding extra stuff afterward.
)
68
# Metric name under which the per-request serving time (the response_seconds
# group of the access log line) is recorded as a SecondsDistribution.
ACCESS_TIME_METRIC = '/chromeos/autotest/http/server/response_seconds'
# Metric name under which the per-request bytes sent (the bytes_sent group of
# the access log line) is recorded as a CumulativeDistribution.
ACCESS_BYTES_METRIC = '/chromeos/autotest/http/server/response_bytes'
71
72
# TODO(phobbs) use something more systematic than a whitelist.
# Endpoints which may be reported verbatim in the 'endpoint' metric field.
# Anything not listed here is reported as the empty string, which keeps the
# cardinality of the field bounded.
WHITELISTED_ENDPOINTS = frozenset([
    # Special request targets.
    '*',
    '/',
    '/___rPc_sWiTcH___',
    # RPC endpoints.
    '/afe/server/noauth/rpc/',
    '/afe/server/rpc/',
    '/new_tko/server/rpc/',
    # AFE frontend pages and static resources.
    '/afe/',
    '/afe/2371F6F3D4E42171A3563D94B7BF42BF.cache.html',
    '/afe/Open+Sans:300.woff',
    '/afe/afeclient.css',
    '/afe/autotest.AfeClient.nocache.js',
    '/afe/clear.cache.gif',
    '/afe/common.css',
    '/afe/header.png',
    '/afe/spinner.gif',
    '/afe/standard.css',
    # Embedded spreadsheet client.
    '/embedded_spreadsheet/autotest.EmbeddedSpreadsheetClient.nocache.js',
])
93
94
def EmitRequestMetrics(m):
    """Emits metrics for each line in the access log.

    Records the bytes sent for every matched line, and the serving time
    whenever the line carried one. Both are tagged with the request method,
    the sanitized endpoint and the response code.

    @param m: A regex match object with the named groups request_method,
              endpoint, response_code, bytes_sent and response_seconds.
    """
    # TODO(phobbs) use a memory-efficient structure to detect non-unique paths.
    # We can't just include the endpoint because it will cause a cardinality
    # explosion.
    endpoint = SanitizeEndpoint(m.group('endpoint'))
    fields = {
        # A named group that did not participate in the match (the request
        # line was just "-") is still present in groupdict() with the value
        # None, so a .get() default never applies. Normalize it explicitly.
        'request_method': m.group('request_method') or '',
        'endpoint': endpoint,
        'response_code': int(m.group('response_code')),
    }

    # Request seconds and bytes sent are both extremely high cardinality, so
    # they must be the VAL of a metric, not a metric field.
    response_seconds = m.group('response_seconds')
    # The group is None or empty when no serving time was captured. Note that
    # the string '0' is truthy, so a zero-second response is still recorded.
    if response_seconds:
        metrics.SecondsDistribution(ACCESS_TIME_METRIC).add(
            int(response_seconds), fields=fields)

    bytes_sent = int(m.group('bytes_sent'))
    metrics.CumulativeDistribution(ACCESS_BYTES_METRIC).add(
        bytes_sent, fields=fields)
120
121
def SanitizeEndpoint(endpoint):
    """Filters an endpoint through the whitelist.

    @param endpoint: The endpoint to sanitize.

    @returns: The endpoint unchanged when it is in WHITELISTED_ENDPOINTS,
              otherwise the empty string.
    """
    return endpoint if endpoint in WHITELISTED_ENDPOINTS else ''
131
132
# (compiled regex, handler) pairs passed to log_daemon_common.RunMatchers.
# Presumably each handler is called with the match object for lines its regex
# matches -- confirm against RunMatchers.
MATCHERS = [(ACCESS_MATCHER, EmitRequestMetrics)]
136
137
def ParseArgs():
    """Builds the argument parser and parses the command line.

    @returns: The argparse namespace, with the attributes output_logfile and
              debug_metrics_file (each None when the flag is not given).
    """
    parser = argparse.ArgumentParser(
        description='Parses apache logs and emits metrics to Monarch')
    parser.add_argument('--output-logfile')

    debug_file_help = ('Output metrics to the given file instead of sending '
                       'them to production.')
    parser.add_argument('--debug-metrics-file', help=debug_file_help)

    return parser.parse_args()
147
148
def Main():
    """Configures logging and ts_mon, then runs MATCHERS over stdin."""
    options = ParseArgs()
    log_daemon_common.SetupLogging(options)

    # Only forward debug_file when --debug-metrics-file was actually given, so
    # that ts_mon otherwise keeps its default behavior.
    debug_file = options.debug_metrics_file
    ts_mon_kwargs = {'debug_file': debug_file} if debug_file else {}

    ts_mon_state = ts_mon_config.SetupTsMonGlobalState(
        'apache_access_log_metrics', **ts_mon_kwargs)
    with ts_mon_state:
        log_daemon_common.RunMatchers(sys.stdin, MATCHERS)
162
163
# Only run when executed as a script, so importing this module has no side
# effects beyond defining the matchers.
if __name__ == '__main__':
    Main()
166