1#!/usr/bin/env python2 2 3# Copyright 2017 The Chromium OS Authors. All rights reserved. 4# Use of this source code is governed by a BSD-style license that can be 5# found in the LICENSE file. 6 7"""Script to upload metrics from apache access logs to Monarch.""" 8 9from __future__ import print_function 10 11import argparse 12import re 13import sys 14import urlparse 15 16import common 17 18from chromite.lib import ts_mon_config 19from chromite.lib import metrics 20 21from autotest_lib.site_utils.stats import log_daemon_common 22# Not used, but needed for importing rpc_interface. 23from autotest_lib.frontend import setup_django_environment 24from autotest_lib.frontend.afe import rpc_interface 25from autotest_lib.frontend.afe import moblab_rpc_interface 26 27 28""" 29The log format is set to: 30 %v:%p %h %l %u %t \"%r\" %>s %O \"%{Referer}i\" \"%{User-Agent}i\" %T 31 32These are documented as follows: 33 (from https://httpd.apache.org/docs/current/mod/mod_log_config.html) 34 35%h: Remote host 36%l: Remote logname (from identd, if supplied) 37%O: Bytes sent, including headers. May be zero in rare cases such as when a 38 request is aborted before a response is sent. You need to enable mod_logio 39 to use this. 40%p: The canonical Port of the server serving the request 41%r: First line of request 42%s: Status. For requests that got internally redirected, this is 43 the status of the *original* request --- %...>s for the last. 44%t: Time, in common log format time format (standard english format) 45%T: The time taken to serve the request, in seconds. 46%u: Remote user (from auth; may be bogus if return status (%s) is 401) 47%v: The canonical ServerName of the server serving the request. 48""" 49 50# Lemma: a regex to match sections delimited be double-quotes ("), which 51# possible contained escaped quotes (\"). 52# This works by matching non-quotes or the string r'\"' repeatedly; then it ends 53# when finding a quote (") preceeded by a character which is not a backslash. 54MATCH_UNTIL_QUOTE = r'([^"]|\\")*[^\\]' 55 56ACCESS_MATCHER = re.compile( 57 r'^' 58 r'\S+ \S+ \S+ \S+ ' # Ignore %v:%p %h %l %u 59 r'\[[^]]+\] ' # Ignore %t 60 r'"(' # Begin %r 61 r'(?P<request_method>\S+) ' # e.g. POST 62 r'(?P<endpoint>\S+)' # e.g. /afe/server/noauth/rpc/ 63 + MATCH_UNTIL_QUOTE + # Ignore protocol (e.g. HTTP/1.1) 64 r'|-' # The request data might just be "-" 65 r')" ' # End %r 66 r'(?P<response_code>\d+) ' # %>s (e.g. 200) 67 r'(?P<bytes_sent>\d+)' # %O 68 r' "' + MATCH_UNTIL_QUOTE + '"' # Ignore Referer 69 r' "' + MATCH_UNTIL_QUOTE + '"' # Ignore User-Agent 70 r' ?(?P<response_seconds>\d+?)' # The server time in seconds 71 r'.*' # Allow adding extra stuff afterward. 72) 73 74ACCESS_TIME_METRIC = '/chromeos/autotest/http/server/response_seconds' 75ACCESS_BYTES_METRIC = '/chromeos/autotest/http/server/response_bytes' 76 77RPC_ACCESS_TIME_METRIC = '/chromeos/autotest/http/server/rpc/response_seconds' 78RPC_ACCESS_BYTES_METRIC = '/chromeos/autotest/http/server/rpc/response_bytes' 79RPC_METHOD_ARGUMENT = 'method' 80 81 82# TODO(phobbs) use a memory-efficient structure to detect non-unique paths. 83# We can't just include the endpoint because it will cause a cardinality 84# explosion. 85WHITELISTED_ENDPOINTS = frozenset(( 86 '/', 87 '/afe/', 88 '/new_tko/server/rpc/', 89 '/afe/server/rpc/', 90 '/___rPc_sWiTcH___', 91 '*', 92 '/afe/server/noauth/rpc/', 93)) 94 95 96# A bad actor could DOS Monarch by requesting millions of different RPC methods, 97# each of which would create a different stream. Only allow a whitelist of 98# methods to be recorded in Monarch. 99WHITELISTED_METHODS = (frozenset(dir(rpc_interface)) | 100 frozenset(dir(moblab_rpc_interface))) 101 102 103def EmitRequestMetrics(m): 104 """Emits metrics for each line in the access log. 105 106 @param m: A regex match object 107 """ 108 fields = { 109 'request_method': m.groupdict().get('request_method', ''), 110 'endpoint': SanitizeEndpoint(m.group('endpoint')), 111 'response_code': int(m.group('response_code')), 112 } 113 114 send_rpc_metrics = ( 115 '?' in m.group('endpoint') and '/rpc' in m.group('endpoint')) 116 if send_rpc_metrics: 117 EmitRPCMetrics(m) 118 119 # Request seconds and bytes sent are both extremely high cardinality, so 120 # they must be the VAL of a metric, not a metric field. 121 if m.group('response_seconds'): 122 response_seconds = int(m.group('response_seconds')) 123 metrics.SecondsDistribution(ACCESS_TIME_METRIC).add( 124 response_seconds, fields=fields) 125 126 bytes_sent = int(m.group('bytes_sent')) 127 metrics.CumulativeDistribution(ACCESS_BYTES_METRIC).add( 128 bytes_sent, fields=fields) 129 130 131def EmitRPCMetrics(m): 132 """Emit a special metric including the method when the request was an RPC.""" 133 fields = { 134 'request_method': m.groupdict().get('request_method', ''), 135 'rpc_method': ParseRPCMethod(m.group('endpoint')), 136 'response_code': int(m.group('response_code')), 137 } 138 139 if m.group('response_seconds'): 140 response_seconds = int(m.group('response_seconds')) 141 metrics.SecondsDistribution(RPC_ACCESS_TIME_METRIC).add( 142 response_seconds, fields=fields) 143 144 bytes_sent = int(m.group('bytes_sent')) 145 metrics.CumulativeDistribution(RPC_ACCESS_BYTES_METRIC).add( 146 bytes_sent, fields=fields) 147 148 149def ParseRPCMethod(url): 150 """Parses the RPC method from an RPC query string. 151 152 Args: 153 url: The URL requested. 154 """ 155 query = urlparse.urlparse(url).query 156 return urlparse.parse_qs(query)[RPC_METHOD_ARGUMENT][-1] 157 158 159def SanitizeEndpoint(endpoint): 160 """Returns empty string if endpoint is not whitelisted. 161 162 @param endpoint: The endpoint to sanitize. 163 """ 164 if endpoint in WHITELISTED_ENDPOINTS: 165 return endpoint 166 else: 167 return '' 168 169 170MATCHERS = [ 171 (ACCESS_MATCHER, EmitRequestMetrics), 172] 173 174 175def ParseArgs(): 176 """Parses the command line arguments.""" 177 p = argparse.ArgumentParser( 178 description='Parses apache logs and emits metrics to Monarch') 179 p.add_argument('--output-logfile') 180 p.add_argument('--debug-metrics-file', 181 help='Output metrics to the given file instead of sending ' 182 'them to production.') 183 return p.parse_args() 184 185 186def Main(): 187 """Sets up logging and runs matchers against stdin.""" 188 args = ParseArgs() 189 log_daemon_common.SetupLogging(args) 190 191 # Set up metrics sending and go. 192 ts_mon_args = {} 193 if args.debug_metrics_file: 194 ts_mon_args['debug_file'] = args.debug_metrics_file 195 196 with ts_mon_config.SetupTsMonGlobalState('apache_access_log_metrics', 197 **ts_mon_args): 198 log_daemon_common.RunMatchers(sys.stdin, MATCHERS) 199 200 201if __name__ == '__main__': 202 Main() 203