deep_memory_profiler/lib/policy.py

# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import json
import logging
import os
import re


LOGGER = logging.getLogger('dmprof')

BASE_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
POLICIES_JSON_PATH = os.path.join(BASE_PATH, 'policies.json')

# Heap Profile Policy versions

# POLICY_DEEP_1 DOES NOT include allocation_type columns.
# mmap regions are distincted w/ mmap frames in the pattern column.
POLICY_DEEP_1 = 'POLICY_DEEP_1'

# POLICY_DEEP_2 DOES include allocation_type columns.
# mmap regions are distincted w/ the allocation_type column.
POLICY_DEEP_2 = 'POLICY_DEEP_2'

# POLICY_DEEP_3 is in JSON format.
POLICY_DEEP_3 = 'POLICY_DEEP_3'

# POLICY_DEEP_3 contains typeinfo.
POLICY_DEEP_4 = 'POLICY_DEEP_4'


class Rule(object):
  """Represents one matching rule in a policy file."""

  def __init__(self,
               name,
               allocator_type,
               stackfunction_pattern=None,
               stacksourcefile_pattern=None,
               typeinfo_pattern=None,
               mappedpathname_pattern=None,
               mappedpermission_pattern=None,
               sharedwith=None):
    self._name = name
    self._allocator_type = allocator_type

    self._stackfunction_pattern = None
    if stackfunction_pattern:
      self._stackfunction_pattern = re.compile(
          stackfunction_pattern + r'\Z')

    self._stacksourcefile_pattern = None
    if stacksourcefile_pattern:
      self._stacksourcefile_pattern = re.compile(
          stacksourcefile_pattern + r'\Z')

    self._typeinfo_pattern = None
    if typeinfo_pattern:
      self._typeinfo_pattern = re.compile(typeinfo_pattern + r'\Z')

    self._mappedpathname_pattern = None
    if mappedpathname_pattern:
      self._mappedpathname_pattern = re.compile(mappedpathname_pattern + r'\Z')

    self._mappedpermission_pattern = None
    if mappedpermission_pattern:
      self._mappedpermission_pattern = re.compile(
          mappedpermission_pattern + r'\Z')

    self._sharedwith = []
    if sharedwith:
      self._sharedwith = sharedwith

  @property
  def name(self):
    return self._name

  @property
  def allocator_type(self):
    return self._allocator_type

  @property
  def stackfunction_pattern(self):
    return self._stackfunction_pattern

  @property
  def stacksourcefile_pattern(self):
    return self._stacksourcefile_pattern

  @property
  def typeinfo_pattern(self):
    return self._typeinfo_pattern

  @property
  def mappedpathname_pattern(self):
    return self._mappedpathname_pattern

  @property
  def mappedpermission_pattern(self):
    return self._mappedpermission_pattern

  @property
  def sharedwith(self):
    return self._sharedwith


class Policy(object):
  """Represents a policy, a content of a policy file."""

  def __init__(self, rules, version, components):
    self._rules = rules
    self._version = version
    self._components = components

  @property
  def rules(self):
    return self._rules

  @property
  def version(self):
    return self._version

  @property
  def components(self):
    return self._components

  def find_rule(self, component_name):
    """Finds a rule whose name is |component_name|. """
    for rule in self._rules:
      if rule.name == component_name:
        return rule
    return None

  def find_malloc(self, bucket):
    """Finds a matching component name which a given |bucket| belongs to.

    Args:
        bucket: A Bucket object to be searched for.

    Returns:
        A string representing a component name.
    """
    assert not bucket or bucket.allocator_type == 'malloc'

    if not bucket:
      return 'no-bucket'
    if bucket.component_cache:
      return bucket.component_cache

    stackfunction = bucket.symbolized_joined_stackfunction
    stacksourcefile = bucket.symbolized_joined_stacksourcefile
    typeinfo = bucket.symbolized_typeinfo
    if typeinfo.startswith('0x'):
      typeinfo = bucket.typeinfo_name

    for rule in self._rules:
      if (rule.allocator_type == 'malloc' and
          (not rule.stackfunction_pattern or
           rule.stackfunction_pattern.match(stackfunction)) and
          (not rule.stacksourcefile_pattern or
           rule.stacksourcefile_pattern.match(stacksourcefile)) and
          (not rule.typeinfo_pattern or rule.typeinfo_pattern.match(typeinfo))):
        bucket.component_cache = rule.name
        return rule.name

    assert False

  def find_mmap(self, region, bucket_set,
                pageframe=None, group_pfn_counts=None):
    """Finds a matching component which a given mmap |region| belongs to.

    It uses |bucket_set| to match with backtraces.  If |pageframe| is given,
    it considers memory sharing among processes.

    NOTE: Don't use Bucket's |component_cache| for mmap regions because they're
    classified not only with bucket information (mappedpathname for example).

    Args:
        region: A tuple representing a memory region.
        bucket_set: A BucketSet object to look up backtraces.
        pageframe: A PageFrame object representing a pageframe maybe including
            a pagecount.
        group_pfn_counts: A dict mapping a PFN to the number of times the
            the pageframe is mapped by the known "group (Chrome)" processes.

    Returns:
        A string representing a component name.
    """
    assert region[0] == 'hooked'
    bucket = bucket_set.get(region[1]['bucket_id'])
    assert not bucket or bucket.allocator_type == 'mmap'

    if not bucket:
      return 'no-bucket', None

    stackfunction = bucket.symbolized_joined_stackfunction
    stacksourcefile = bucket.symbolized_joined_stacksourcefile
    sharedwith = self._categorize_pageframe(pageframe, group_pfn_counts)

    for rule in self._rules:
      if (rule.allocator_type == 'mmap' and
          (not rule.stackfunction_pattern or
           rule.stackfunction_pattern.match(stackfunction)) and
          (not rule.stacksourcefile_pattern or
           rule.stacksourcefile_pattern.match(stacksourcefile)) and
          (not rule.mappedpathname_pattern or
           rule.mappedpathname_pattern.match(region[1]['vma']['name'])) and
          (not rule.mappedpermission_pattern or
           rule.mappedpermission_pattern.match(
               region[1]['vma']['readable'] +
               region[1]['vma']['writable'] +
               region[1]['vma']['executable'] +
               region[1]['vma']['private'])) and
          (not rule.sharedwith or
           not pageframe or sharedwith in rule.sharedwith)):
        return rule.name, bucket

    assert False

  def find_unhooked(self, region, pageframe=None, group_pfn_counts=None):
    """Finds a matching component which a given unhooked |region| belongs to.

    If |pageframe| is given, it considers memory sharing among processes.

    Args:
        region: A tuple representing a memory region.
        pageframe: A PageFrame object representing a pageframe maybe including
            a pagecount.
        group_pfn_counts: A dict mapping a PFN to the number of times the
            the pageframe is mapped by the known "group (Chrome)" processes.

    Returns:
        A string representing a component name.
    """
    assert region[0] == 'unhooked'
    sharedwith = self._categorize_pageframe(pageframe, group_pfn_counts)

    for rule in self._rules:
      if (rule.allocator_type == 'unhooked' and
          (not rule.mappedpathname_pattern or
           rule.mappedpathname_pattern.match(region[1]['vma']['name'])) and
          (not rule.mappedpermission_pattern or
           rule.mappedpermission_pattern.match(
               region[1]['vma']['readable'] +
               region[1]['vma']['writable'] +
               region[1]['vma']['executable'] +
               region[1]['vma']['private'])) and
          (not rule.sharedwith or
           not pageframe or sharedwith in rule.sharedwith)):
        return rule.name

    assert False

  @staticmethod
  def load(filename, filetype):
    """Loads a policy file of |filename| in a |format|.

    Args:
        filename: A filename to be loaded.
        filetype: A string to specify a type of the file.  Only 'json' is
            supported for now.

    Returns:
        A loaded Policy object.
    """
    with open(os.path.join(BASE_PATH, filename)) as policy_f:
      return Policy.parse(policy_f, filetype)

  @staticmethod
  def parse(policy_f, filetype):
    """Parses a policy file content in a |format|.

    Args:
        policy_f: An IO object to be loaded.
        filetype: A string to specify a type of the file.  Only 'json' is
            supported for now.

    Returns:
        A loaded Policy object.
    """
    if filetype == 'json':
      return Policy._parse_json(policy_f)
    else:
      return None

  JSON_COMMENT_REGEX = re.compile(r'//.*')

  @staticmethod
  def _parse_json(policy_f):
    """Parses policy file in json format.

    A policy file contains component's names and their stacktrace pattern
    written in regular expression.  Those patterns are matched against each
    symbols of each stacktraces in the order written in the policy file

    Args:
         policy_f: A File/IO object to read.

    Returns:
         A loaded policy object.
    """
    policy_json = policy_f.read()
    policy_json = re.sub(Policy.JSON_COMMENT_REGEX, '', policy_json)
    policy = json.loads(policy_json)

    rules = []
    for rule in policy['rules']:
      stackfunction = rule.get('stackfunction') or rule.get('stacktrace')
      stacksourcefile = rule.get('stacksourcefile')
      rules.append(Rule(
          rule['name'],
          rule['allocator'],  # allocator_type
          stackfunction,
          stacksourcefile,
          rule['typeinfo'] if 'typeinfo' in rule else None,
          rule.get('mappedpathname'),
          rule.get('mappedpermission'),
          rule.get('sharedwith')))

    return Policy(rules, policy['version'], policy['components'])

  @staticmethod
  def _categorize_pageframe(pageframe, group_pfn_counts):
    """Categorizes a pageframe based on its sharing status.

    Returns:
        'private' if |pageframe| is not shared with other processes.  'group'
        if |pageframe| is shared only with group (Chrome-related) processes.
        'others' if |pageframe| is shared with non-group processes.
    """
    if not pageframe:
      return 'private'

    if pageframe.pagecount:
      if pageframe.pagecount == 1:
        return 'private'
      elif pageframe.pagecount <= group_pfn_counts.get(pageframe.pfn, 0) + 1:
        return 'group'
      else:
        return 'others'
    else:
      if pageframe.pfn in group_pfn_counts:
        return 'group'
      else:
        return 'private'


class PolicySet(object):
  """Represents a set of policies."""

  def __init__(self, policy_directory):
    self._policy_directory = policy_directory

  @staticmethod
  def load(labels=None):
    """Loads a set of policies via the "default policy directory".

    The "default policy directory" contains pairs of policies and their labels.
    For example, a policy "policy.l0.json" is labeled "l0" in the default
    policy directory "policies.json".

    All policies in the directory are loaded by default.  Policies can be
    limited by |labels|.

    Args:
        labels: An array that contains policy labels to be loaded.

    Returns:
        A PolicySet object.
    """
    default_policy_directory = PolicySet._load_default_policy_directory()
    if labels:
      specified_policy_directory = {}
      for label in labels:
        if label in default_policy_directory:
          specified_policy_directory[label] = default_policy_directory[label]
        # TODO(dmikurube): Load an un-labeled policy file.
      return PolicySet._load_policies(specified_policy_directory)
    else:
      return PolicySet._load_policies(default_policy_directory)

  def __len__(self):
    return len(self._policy_directory)

  def __iter__(self):
    for label in self._policy_directory:
      yield label

  def __getitem__(self, label):
    return self._policy_directory[label]

  @staticmethod
  def _load_default_policy_directory():
    with open(POLICIES_JSON_PATH, mode='r') as policies_f:
      default_policy_directory = json.load(policies_f)
    return default_policy_directory

  @staticmethod
  def _load_policies(directory):
    LOGGER.info('Loading policy files.')
    policies = {}
    for label in directory:
      LOGGER.info('  %s: %s' % (label, directory[label]['file']))
      loaded = Policy.load(directory[label]['file'], directory[label]['format'])
      if loaded:
        policies[label] = loaded
    return PolicySet(policies)