1#!/usr/bin/env python2 2 3# Copyright 2017 The Chromium OS Authors. All rights reserved. 4# Use of this source code is governed by a BSD-style license that can be 5# found in the LICENSE file. 6 7"""Script to upload metrics from apache access logs to Monarch.""" 8 9from __future__ import print_function 10 11import argparse 12import re 13import sys 14 15import common 16 17from chromite.lib import ts_mon_config 18from chromite.lib import metrics 19 20from autotest_lib.site_utils.stats import log_daemon_common 21 22 23""" 24The log format is set to: 25 %v:%p %h %l %u %t \"%r\" %>s %O \"%{Referer}i\" \"%{User-Agent}i\" %T 26 27These are documented as follows: 28 (from https://httpd.apache.org/docs/current/mod/mod_log_config.html) 29 30%h: Remote host 31%l: Remote logname (from identd, if supplied) 32%O: Bytes sent, including headers. May be zero in rare cases such as when a 33 request is aborted before a response is sent. You need to enable mod_logio 34 to use this. 35%p: The canonical Port of the server serving the request 36%r: First line of request 37%s: Status. For requests that got internally redirected, this is 38 the status of the *original* request --- %...>s for the last. 39%t: Time, in common log format time format (standard english format) 40%T: The time taken to serve the request, in seconds. 41%u: Remote user (from auth; may be bogus if return status (%s) is 401) 42%v: The canonical ServerName of the server serving the request. 43""" 44 45# Lemma: a regex to match sections delimited be double-quotes ("), which 46# possible contained escaped quotes (\"). 47# This works by matching non-quotes or the string r'\"' repeatedly; then it ends 48# when finding a quote (") preceeded by a character which is not a backslash. 49MATCH_UNTIL_QUOTE = r'([^"]|\\")*[^\\]' 50 51ACCESS_MATCHER = re.compile( 52 r'^' 53 r'\S+ \S+ \S+ \S+ ' # Ignore %v:%p %h %l %u 54 r'\[[^]]+\] ' # Ignore %t 55 r'"(' # Begin %r 56 r'(?P<request_method>\S+) ' # e.g. POST 57 r'(?P<endpoint>\S+)' # e.g. /afe/server/noauth/rpc/ 58 + MATCH_UNTIL_QUOTE + # Ignore protocol (e.g. HTTP/1.1) 59 r'|-' # The request data might just be "-" 60 r')" ' # End %r 61 r'(?P<response_code>\d+) ' # %>s (e.g. 200) 62 r'(?P<bytes_sent>\d+)' # %O 63 r' "' + MATCH_UNTIL_QUOTE + '"' # Ignore Referer 64 r' "' + MATCH_UNTIL_QUOTE + '"' # Ignore User-Agent 65 r' ?(?P<response_seconds>\d+?)' # The server time in seconds 66 r'.*' # Allow adding extra stuff afterward. 67) 68 69ACCESS_TIME_METRIC = '/chromeos/autotest/http/server/response_seconds' 70ACCESS_BYTES_METRIC = '/chromeos/autotest/http/server/response_bytes' 71 72 73# TODO(phobbs) use something more systematic than a whitelist. 74WHITELISTED_ENDPOINTS = frozenset(( 75 '/', 76 '/afe/clear.cache.gif', 77 '/afe/Open+Sans:300.woff', 78 '/embedded_spreadsheet/autotest.EmbeddedSpreadsheetClient.nocache.js', 79 '/afe/afeclient.css', 80 '/afe/common.css', 81 '/afe/header.png', 82 '/afe/spinner.gif', 83 '/afe/standard.css', 84 '/afe/2371F6F3D4E42171A3563D94B7BF42BF.cache.html', 85 '/afe/autotest.AfeClient.nocache.js', 86 '/afe/', 87 '/new_tko/server/rpc/', 88 '/afe/server/rpc/', 89 '/___rPc_sWiTcH___', 90 '*', 91 '/afe/server/noauth/rpc/', 92)) 93 94 95def EmitRequestMetrics(m): 96 """Emits metrics for each line in the access log. 97 98 @param m: A regex match object 99 """ 100 # TODO(phobbs) use a memory-efficient structure to detect non-unique paths. 101 # We can't just include the endpoint because it will cause a cardinality 102 # explosion. 103 endpoint = SanitizeEndpoint(m.group('endpoint')) 104 fields = { 105 'request_method': m.groupdict().get('request_method', ''), 106 'endpoint': endpoint, 107 'response_code': int(m.group('response_code')), 108 } 109 110 # Request seconds and bytes sent are both extremely high cardinality, so 111 # they must be the VAL of a metric, not a metric field. 112 if m.group('response_seconds'): 113 response_seconds = int(m.group('response_seconds')) 114 metrics.SecondsDistribution(ACCESS_TIME_METRIC).add( 115 response_seconds, fields=fields) 116 117 bytes_sent = int(m.group('bytes_sent')) 118 metrics.CumulativeDistribution(ACCESS_BYTES_METRIC).add( 119 bytes_sent, fields=fields) 120 121 122def SanitizeEndpoint(endpoint): 123 """Returns empty string if endpoint is not whitelisted. 124 125 @param endpoint: The endpoint to sanitize. 126 """ 127 if endpoint in WHITELISTED_ENDPOINTS: 128 return endpoint 129 else: 130 return '' 131 132 133MATCHERS = [ 134 (ACCESS_MATCHER, EmitRequestMetrics), 135] 136 137 138def ParseArgs(): 139 """Parses the command line arguments.""" 140 p = argparse.ArgumentParser( 141 description='Parses apache logs and emits metrics to Monarch') 142 p.add_argument('--output-logfile') 143 p.add_argument('--debug-metrics-file', 144 help='Output metrics to the given file instead of sending ' 145 'them to production.') 146 return p.parse_args() 147 148 149def Main(): 150 """Sets up logging and runs matchers against stdin.""" 151 args = ParseArgs() 152 log_daemon_common.SetupLogging(args) 153 154 # Set up metrics sending and go. 155 ts_mon_args = {} 156 if args.debug_metrics_file: 157 ts_mon_args['debug_file'] = args.debug_metrics_file 158 159 with ts_mon_config.SetupTsMonGlobalState('apache_access_log_metrics', 160 **ts_mon_args): 161 log_daemon_common.RunMatchers(sys.stdin, MATCHERS) 162 163 164if __name__ == '__main__': 165 Main() 166