# Source code for autotest.client.cpuset

# Copyright 2007-2010 Google Inc.  Released under the GPL v2
__author__ = "duanes (Duane Sand), pdahl (Peter Dahl)"

# A basic cpuset/cgroup container manager for limiting memory use during tests
#   for use on kernels not running some site-specific container manager

import fcntl
import glob
import logging
import os
import re

from autotest.client import utils
from autotest.client.shared import error, utils_memory

SUPER_ROOT = ''      # root of all containers or cgroups
NO_LIMIT = (1 << 63) - 1   # containername/memory.limit_in_bytes if no limit

# propio service classes, numbered as the kernel API numbers them
# (the same numbering is spelled out in the comments of set_io_controls):
PROPIO_PRIO = 1      # RT, Real Time
PROPIO_NORMAL = 2    # BE, Best Effort; default class used by set_io_controls
PROPIO_IDLE = 3

# Module-wide container configuration.  These start out as placeholders and
# are filled in by discover_container_style() on its first call.
super_root_path = ''    # usually '/dev/cgroup'; '/dev/cpuset' on 2.6.18
cpuset_prefix = None  # usually 'cpuset.'; '' on 2.6.18
fake_numa_containers = False  # container mem via numa=fake mem nodes, else pages
mem_isolation_on = False
node_mbytes = 0         # mbytes in one typical mem node
root_container_bytes = 0  # squishy limit on effective size of root container

def discover_container_style():
    """
    Detect which container filesystem is active and cache the result.

    Sets the module globals super_root_path, cpuset_prefix,
    mem_isolation_on, fake_numa_containers, and then either node_mbytes
    (fake-numa containers) or root_container_bytes (memcg containers).
    The lookup is done only once: any later call returns immediately
    because super_root_path is no longer ''.
    """
    global super_root_path, cpuset_prefix
    global mem_isolation_on, fake_numa_containers
    global node_mbytes, root_container_bytes

    if super_root_path != '':
        return  # already looked up

    if os.path.exists('/dev/cgroup/tasks'):
        # running on 2.6.26 or later kernel with containers on:
        super_root_path = '/dev/cgroup'
        cpuset_prefix = 'cpuset.'
        fake_numa_containers = bool(get_boot_numa())
        # memcg containers IFF compiled-in & mounted & non-fakenuma boot
        # TODO: handle possibility of where memcg is mounted as its own
        #       cgroup hierarchy, separate from cpuset??
        mem_isolation_on = (
            fake_numa_containers or
            os.path.exists('/dev/cgroup/memory.limit_in_bytes'))
    elif os.path.exists('/dev/cpuset/tasks'):
        # running on 2.6.18 kernel with containers on:
        super_root_path = '/dev/cpuset'
        cpuset_prefix = ''
        booted_with_fake_numa = get_boot_numa() != ''
        mem_isolation_on = booted_with_fake_numa
        fake_numa_containers = booted_with_fake_numa
    else:
        # neither cpuset nor cgroup filesystem active:
        super_root_path = None
        cpuset_prefix = 'no_cpusets_or_cgroups_exist'
        mem_isolation_on = False
        fake_numa_containers = False

    logging.debug('mem_isolation: %s', mem_isolation_on)
    logging.debug('fake_numa_containers: %s', fake_numa_containers)

    if fake_numa_containers:
        node_mbytes = int(mbytes_per_mem_node())
        return
    if not mem_isolation_on:
        return

    # memcg-style containers
    # For now, limit total of all containers to using just 98% of system's
    # visible total ram, to avoid oom events at system level, and avoid
    # page reclaim overhead from going above kswapd highwater mark.
    # NOTE(review): presumably memtotal() is in kbytes, so >> 2 yields
    # 4 KiB pages and << 12 converts pages back to bytes -- confirm.
    visible_pages = utils_memory.memtotal() >> 2
    pages_for_containers = int(visible_pages * 0.98)
    root_container_bytes = pages_for_containers << 12
    logging.debug('root_container_bytes: %s',
                  utils.human_format(root_container_bytes))
def need_mem_containers():
    """
    Insist that memory-isolating containers are available.

    :raise error.AutotestError: if discovery finds mem isolation is off.
    """
    discover_container_style()
    if mem_isolation_on:
        return
    raise error.AutotestError(
        'Mem-isolation containers not enabled by latest reboot')
def need_fake_numa():
    """
    Insist that fake-numa style containers are available.

    :raise error.AutotestError: if discovery finds fake numa is not in use.
    """
    discover_container_style()
    if fake_numa_containers:
        return
    raise error.AutotestError('fake=numa not enabled by latest reboot')
def full_path(container_name):
    """
    Return the filesystem path of a container.

    :param container_name: container name relative to the super root
    :return: super_root_path joined with container_name
    """
    discover_container_style()
    root = super_root_path
    return os.path.join(root, container_name)
def unpath(container_path):
    """
    Drop the super-root prefix from a container's filesystem path.

    :param container_path: path that starts with super_root_path
    :return: the text after super_root_path and one following character
             (the path separator)
    """
    prefix_length = len(super_root_path) + 1  # root path + its separator
    return container_path[prefix_length:]
def cpuset_attr(container_name, attr):
    """
    Return the path of a cpuset attribute file of a container.

    :param container_name: container name relative to the super root
    :param attr: attribute name without the kernel-dependent prefix
    """
    discover_container_style()
    attr_file = cpuset_prefix + attr
    return os.path.join(super_root_path, container_name, attr_file)
def io_attr(container_name, attr):
    """
    Return the path of an 'io.' attribute file of a container.

    :param container_name: container name relative to the super root
    :param attr: attribute name without the 'io.' prefix
    """
    discover_container_style()
    # current version assumes shared cgroup hierarchy
    attr_file = 'io.' + attr
    return os.path.join(super_root_path, container_name, attr_file)
def tasks_path(container_name):
    """Return the path of the 'tasks' file inside a container."""
    container_dir = full_path(container_name)
    return os.path.join(container_dir, 'tasks')
def mems_path(container_name):
    """Return the path of a container's cpuset 'mems' attribute file."""
    return cpuset_attr(container_name, attr='mems')
def memory_path(container_name):
    """
    Return the common path stem of a container's 'memory.*' files.

    Callers append a suffix such as '.limit_in_bytes'.  This function
    reads super_root_path as it currently is; it does not call
    discover_container_style() itself.
    """
    stem = 'memory'
    return os.path.join(super_root_path, container_name, stem)
def cpus_path(container_name):
    """Return the path of a container's cpuset 'cpus' attribute file."""
    return cpuset_attr(container_name, attr='cpus')
def container_exists(name):
    """
    Tell whether a container is present.

    :param name: container name relative to the super root, or None
    :return: False for None, otherwise whether the container's tasks
             file exists
    """
    if name is None:
        return False
    return os.path.exists(tasks_path(name))
def move_tasks_into_container(name, tasks):
    """
    Move each of the given tasks into a container.

    :param name: destination container name
    :param tasks: iterable of task ids, each written to the container's
                  tasks file on its own
    :raise Exception: re-raises the write failure when the task is still
                      alive; failures for dead tasks are ignored
    """
    destination = tasks_path(name)
    for pid in tasks:
        try:
            logging.debug('moving task %s into container "%s"', pid, name)
            utils.write_one_line(destination, pid)
        except Exception:
            if not utils.pid_is_alive(pid):
                # task is gone or zombie so ignore this exception
                continue
            raise   # task exists but couldn't move it
def move_self_into_container(name):
    """Move the current process into the named container and log it."""
    own_pid = str(os.getpid())
    move_tasks_into_container(name, [own_pid])
    logging.debug('running self (pid %s) in container "%s"', own_pid, name)
def _avail_mbytes_via_nodes(parent):
    """
    Total mbytes of mem nodes available for new containers in parent.

    Half a node's worth is subtracted (never going below 0): there is no
    exact model for how the container mgr measures mem space, and it is
    better here to underestimate than overestimate.
    """
    unclaimed_nodes = available_exclusive_mem_nodes(parent)
    total_mbytes = nodes_avail_mbytes(unclaimed_nodes)
    safety_margin = node_mbytes // 2
    return max(total_mbytes - safety_margin, 0)


def _avail_bytes_via_pages(parent):
    """
    Bytes of parent's memory that could still go to new child containers.

    Starts from the parent's own size and subtracts the size of every
    existing direct child that has a memory.limit_in_bytes file.
    """
    # Parent size is read first, before full_path() is consulted.
    parent_budget = container_bytes(parent)
    pattern = os.path.join(full_path(parent), '*', 'memory.limit_in_bytes')
    child_names = [unpath(os.path.dirname(limit_file))
                   for limit_file in glob.glob(pattern)]
    already_claimed = sum(container_bytes(child) for child in child_names)
    return parent_budget - already_claimed
def avail_mbytes(parent=SUPER_ROOT):
    """
    Total mbytes available in parent, for exclusive use in new containers.

    :param parent: container to measure; defaults to the super root
    """
    if not fake_numa_containers:
        return _avail_bytes_via_pages(parent) >> 20
    return _avail_mbytes_via_nodes(parent)
def delete_leftover_test_containers():
    """
    Release every container found directly under the super root.

    Recovers mems and cores tied up by containers of prior failed tests.
    """
    leftovers = inner_containers_of(SUPER_ROOT)
    for container in leftovers:
        _release_container_nest(container)
def my_lock(lockname):
    """
    Take an exclusive flock on a lock file under $AUTODIR.

    :param lockname: suffix of the lock file name; lockname is 'inner'
    :return: the open, locked file object; hand it to my_unlock() to release
    :raise KeyError: if the AUTODIR environment variable is not set
    """
    lockdir = os.environ['AUTODIR']
    lock_path = os.path.join(lockdir, '.cpuset.lock.' + lockname)
    lockfile = open(lock_path, 'w')
    try:
        fcntl.flock(lockfile, fcntl.LOCK_EX)
    except BaseException:
        # Don't leak the descriptor if the blocking lock attempt fails or
        # is interrupted (e.g. KeyboardInterrupt while waiting).
        lockfile.close()
        raise
    return lockfile
def my_unlock(lockfile):
    """
    Release a lock taken by my_lock() and close its file.

    :param lockfile: the file object returned by my_lock()
    """
    try:
        fcntl.flock(lockfile, fcntl.LOCK_UN)
    finally:
        # Close even when the explicit unlock fails, so the file object is
        # never left open.
        lockfile.close()
def rangelist_to_set(rangelist):
    """
    Convert '1-3,7,9-12' to set(1,2,3,7,9,10,11,12).

    :param rangelist: comma-separated items, each either a single
                      non-negative integer or an inclusive 'start-end'
                      range; an empty string or None gives an empty set
    :return: set of ints
    :raise ValueError: if any item is neither a number nor a range
    """
    result = set()
    if not rangelist:
        return result
    for x in rangelist.split(','):
        if re.match(r'^(\d+)$', x):
            result.add(int(x))
            continue
        m = re.match(r'^(\d+)-(\d+)$', x)
        if m:
            start = int(m.group(1))
            end = int(m.group(2))
            # a reversed range such as '3-1' contributes nothing
            result.update(range(start, end + 1))
            continue
        msg = 'Cannot understand data input: %s %s' % (x, rangelist)
        raise ValueError(msg)
    return result
def my_container_name():
    """
    Get current process's inherited or self-built container name
    within /dev/cpuset or /dev/cgroup.  Is '' for root container.
    """
    cpuset_file = '/proc/%i/cpuset' % os.getpid()
    rooted_name = utils.read_one_line(cpuset_file)
    return rooted_name[1:]   # strip leading /
def get_mem_nodes(container_name):
    """
    All mem nodes now available to a container, both exclusive & shared.

    :return: set of node numbers; empty if the container has no mems file
    """
    mems_file = mems_path(container_name)
    if not os.path.exists(mems_file):
        return set()
    return rangelist_to_set(utils.read_one_line(mems_file))
def _busy_mem_nodes(parent_container):
    """
    Get set of numa memory nodes now used (exclusively or shared)
    by existing children of parent container.
    """
    pattern = os.path.join(full_path(parent_container), '*',
                           cpuset_prefix + 'mems')
    busy = set()
    for mems_file in glob.glob(pattern):
        # The child is identified by its directory path exactly as glob
        # returned it (it is not passed through unpath()).
        child_dir = os.path.dirname(mems_file)
        busy.update(get_mem_nodes(child_dir))
    return busy
def available_exclusive_mem_nodes(parent_container):
    """
    Get subset of numa memory nodes of parent container which could
    be allocated exclusively to new child containers.
    This excludes nodes now allocated to existing children.

    :raise error.AutotestError: (via need_fake_numa) if fake numa is off
    """
    need_fake_numa()
    owned_by_parent = get_mem_nodes(parent_container)
    held_by_children = _busy_mem_nodes(parent_container)
    return owned_by_parent - held_by_children
def my_mem_nodes():
    """
    Get set of numa memory nodes owned by current process's container.

    :return: empty set when mem isolation is off
    """
    discover_container_style()
    if mem_isolation_on:
        return get_mem_nodes(my_container_name())
    return set()    # as expected by vmstress
def my_available_exclusive_mem_nodes():
    """
    Get subset of numa memory nodes owned by current process's
    container, which could be allocated exclusively to new child
    containers.  This excludes any nodes now allocated
    to existing children.
    """
    current_container = my_container_name()
    return available_exclusive_mem_nodes(current_container)
def node_avail_kbytes(node):
    """
    Return the size of a mem node in kbytes.

    :param node: node number; not consulted, every node is given the
                 same size
    """
    # crude; fixed numa node size
    kbytes_per_node = node_mbytes << 10
    return kbytes_per_node
def nodes_avail_mbytes(nodes):
    """Return nodes' combined user+avail size, in Mbytes."""
    total_kbytes = 0
    for node in nodes:
        total_kbytes += node_avail_kbytes(node)
    return total_kbytes // 1024
def container_bytes(name):
    """
    Return the memory size of a container in bytes.

    With fake-numa containers this is the combined size of the
    container's mem nodes.  Otherwise the container's
    memory.limit_in_bytes is used; when that is not below NO_LIMIT the
    enclosing containers are tried in turn, ending with
    root_container_bytes for the super root.
    """
    if fake_numa_containers:
        return nodes_avail_mbytes(get_mem_nodes(name)) << 20

    container = name
    while True:
        limit_file = memory_path(container) + '.limit_in_bytes'
        limit = int(utils.read_one_line(limit_file))
        if limit < NO_LIMIT:
            return limit
        if container == SUPER_ROOT:
            return root_container_bytes
        # unlimited here: fall back to the enclosing container's limit
        container = os.path.dirname(container)
def container_mbytes(name):
    """Return the memory size of a container in whole Mbytes."""
    size_in_bytes = container_bytes(name)
    return size_in_bytes >> 20
def mbytes_per_mem_node():
    """
    Get mbyte size of standard numa mem node, as float
    (some nodes are bigger than this).
    Replaces utils_memory.node_size().
    """
    numa = get_boot_numa()
    if numa.endswith('M'):
        return float(numa[:-1])  # mbyte size of fake nodes

    if numa:
        node_count = int(numa)  # fake numa mem nodes for container isolation
    else:
        node_count = len(utils_memory.numa_nodes())  # phys mem-controller nodes

    # Use guessed total physical mem size, not kernel's
    #   lesser 'available memory' after various system tables.
    return utils_memory.rounded_memtotal() / (node_count * 1024.0)
def get_cpus(container_name):
    """
    Return the set of cpus assigned to a container.

    :return: set of cpu numbers; empty if the container has no cpus file
    """
    cpus_file = cpus_path(container_name)
    if not os.path.exists(cpus_file):
        return set()
    return rangelist_to_set(utils.read_one_line(cpus_file))
def get_tasks(container_name):
    """
    Return the task ids currently listed in a container.

    :param container_name: container name relative to the super root
    :return: list of the lines of the container's tasks file with trailing
             whitespace removed; [] if the tasks file does not exist
    :raise IOError: if the tasks file exists but could not be read
    """
    file_name = tasks_path(container_name)
    try:
        # 'with' guarantees the handle is closed, also on a failed read
        with open(file_name) as tasks_file:
            tasks = [x.rstrip() for x in tasks_file.readlines()]
    except IOError:
        if os.path.exists(file_name):
            raise
        tasks = []   # container doesn't exist anymore
    return tasks
def inner_containers_of(parent):
    """
    List the containers nested directly inside parent.

    :return: names (relative to the super root) of every direct
             subdirectory of parent that holds a 'tasks' file
    """
    pattern = os.path.join(full_path(parent), '*/tasks')
    children = []
    for task_file in glob.glob(pattern):
        children.append(unpath(os.path.dirname(task_file)))
    return children
def _release_container_nest(nest):
    """Destroy a container, and any nested sub-containers."""
    nest_path = full_path(nest)
    if not os.path.exists(nest_path):
        return

    # bottom-up walk of tree, releasing all nested sub-containers
    for inner in inner_containers_of(nest):
        _release_container_nest(inner)

    logging.debug("releasing container %s", nest)

    # Transfer any survivor tasks (e.g. self) to parent container
    survivors = get_tasks(nest)
    move_tasks_into_container(os.path.dirname(nest), survivors)

    # remove the now-empty outermost container of this nest
    if os.path.exists(nest_path):
        os.rmdir(nest_path)  # nested, or dead manager
def release_container(container_name=None):
    """
    Destroy a container.

    :param container_name: container to destroy; None means the container
                           the current process is running in
    """
    container_before = my_container_name()
    target = container_before if container_name is None else container_name
    _release_container_nest(target)
    container_after = my_container_name()
    if container_after != container_before:
        logging.debug('now running self (pid %d) in container "%s"',
                      os.getpid(), container_after)
def remove_empty_prio_classes(prios):
    """
    Remove prio classes whose set of allowed priorities is empty.

    e.g 'no:3;rt:;be:3;id:' --> 'no:3;be:3'

    :param prios: ';'-separated entries of the form 'class:priorities'
    :return: the entries whose second ':'-separated field is non-empty,
             re-joined with ';'.  Entries without any ':' (an empty
             string, or the empty piece after a trailing ';') are dropped
             rather than raising IndexError.
    """
    kept = []
    for entry in prios.split(';'):
        fields = entry.split(':')
        if len(fields) > 1 and fields[1]:
            kept.append(entry)
    return ';'.join(kept)
def all_drive_names():
    """
    List of all disk drives sda,sdb,...

    Falls back to the /sys/block/hd* entries when there is no
    /sys/block/sd* entry.
    """
    drive_paths = glob.glob('/sys/block/sd*') or glob.glob('/sys/block/hd*')
    return [os.path.basename(drive_path) for drive_path in drive_paths]
def set_io_controls(container_name, disks=[], ioprio_classes=[PROPIO_NORMAL],
                    io_shares=[95], io_limits=[0]):
    """
    Set the propio controls for one container, for selected disks,
    writing directly to /dev/cgroup/container_name/io.io_service_level
    without using containerd.
    See wiki ProportionalIOScheduler for definitions.

    :param container_name: container whose io controls are set
    :param disks: list of disk names; empty means all drives, in which
                  case the first element of each other list is applied
                  to every drive
    :param ioprio_classes: list of service classes, one per disk,
            using numeric propio service classes as used by kernel API,
            namely
              1: RT, Real Time, aka PROPIO_PRIO
              2: BE, Best Effort, aka PROPIO_NORMAL
              3: PROPIO_IDLE
            defaults to best-effort
    :param io_shares: list of disk-time-fractions, one per disk,
            as percentage integer 0..100
    :param io_limits: list of limit on/off, one per disk
              0: no limit, shares use of other containers' unused disk time
              1: limited, container's use of disk time is capped to given DTF
            defaults to no limit, use slack time
    :raise error.AutotestError: if the four lists differ in length
    """
    if not disks:  # defaults to all drives
        disks = all_drive_names()
        drive_count = len(disks)
        io_shares = [io_shares[0]] * drive_count
        ioprio_classes = [ioprio_classes[0]] * drive_count
        io_limits = [io_limits[0]] * drive_count

    lengths_agree = (len(disks) == len(ioprio_classes) and
                     len(disks) == len(io_shares) and
                     len(disks) == len(io_limits))
    if not lengths_agree:
        raise error.AutotestError('Unequal number of values for io controls')

    service_level = io_attr(container_name, 'io_service_level')
    if not os.path.exists(service_level):
        return  # kernel predates propio features
        # or io cgroup is mounted separately from cpusets

    written = []
    per_disk = zip(disks, ioprio_classes, io_limits, io_shares)
    for disk, ioclass, limit, share in per_disk:
        # one line per disk: "<disk> <class> <limit> <share>"
        line = ' '.join([disk, str(ioclass), str(limit), str(share)])
        utils.write_one_line(service_level, line)
        written.append(line)
    logging.debug('set_io_controls of %s to %s',
                  container_name, ', '.join(written))
def abbrev_list(vals):
    """
    Condense unsigned (0,4,5,6,7,10) to '0,4-7,10'.

    :param vals: iterable of non-negative ints, in any order; repeated
                 values are counted once
    :return: comma-separated string of single values and 'low-high' runs
             of consecutive values; '' for empty input
    """
    ranges = []
    lower = 0
    upper = -2  # placeholder "no run yet": lower > upper, so nothing is emitted
    # Deduplicate first: a repeated value would otherwise end the current
    # run early and be emitted twice.  The trailing -1 is a sentinel that
    # flushes the last run.
    for val in sorted(set(vals)) + [-1]:
        if val != upper + 1:
            if lower == upper:
                ranges.append(str(lower))
            elif lower <= upper:
                ranges.append('%d-%d' % (lower, upper))
            lower = val
        upper = val
    return ','.join(ranges)
def create_container_with_specific_mems_cpus(name, mems, cpus):
    """
    Create a fake-numa container owning exactly the given nodes and cpus.

    :param name: container name relative to the super root
    :param mems: iterable of mem node numbers
    :param cpus: sized collection of cpu numbers
    :raise error.AutotestError: (via need_fake_numa) if fake numa is off
    """
    need_fake_numa()
    os.mkdir(full_path(name))
    utils.write_one_line(cpuset_attr(name, 'mem_hardwall'), '1')
    mems_list = ','.join(str(node) for node in mems)
    utils.write_one_line(mems_path(name), mems_list)
    cpus_list = ','.join(str(cpu) for cpu in cpus)
    utils.write_one_line(cpus_path(name), cpus_list)
    logging.debug('container %s has %d cpus and %d nodes totalling %s bytes',
                  name, len(cpus), len(get_mem_nodes(name)),
                  utils.human_format(container_bytes(name)))
def create_container_via_memcg(name, parent, bytes, cpus):
    """
    Create container via direct memcg cgroup writes.

    :param name: container name relative to the super root
    :param parent: container whose mem nodes the new one inherits
    :param bytes: value written to the container's memory.limit_in_bytes
    :param cpus: sized collection of cpu numbers
    """
    os.mkdir(full_path(name))
    parent_nodes = utils.read_one_line(mems_path(parent))
    utils.write_one_line(mems_path(name), parent_nodes)  # inherit parent's nodes
    limit_file = memory_path(name) + '.limit_in_bytes'
    utils.write_one_line(limit_file, str(bytes))
    cpus_list = ','.join(str(cpu) for cpu in cpus)
    utils.write_one_line(cpus_path(name), cpus_list)
    logging.debug('Created container %s directly via memcg,'
                  ' has %d cpus and %s bytes',
                  name, len(cpus), utils.human_format(container_bytes(name)))
def _create_fake_numa_container_directly(name, parent, mbytes, cpus):
    """
    Create a fake-numa container of at least mbytes under parent.

    Picks specific mem nodes for the new cpuset's exclusive use, for now
    arbitrarily the highest available node numbers, while holding the
    'inner' lock.

    :raise error.AutotestError: if parent is too small, or its free nodes
                                do not add up to the requested size
    """
    need_fake_numa()
    lockfile = my_lock('inner')   # serialize race between parallel tests
    try:
        needed_kbytes = mbytes * 1024
        free_nodes = sorted(available_exclusive_mem_nodes(parent))
        gathered_kbytes = 0
        picked = 0
        # take nodes from the top of the sorted list until there is enough
        for node in reversed(free_nodes):
            if gathered_kbytes >= needed_kbytes:
                break
            picked += 1
            gathered_kbytes += node_avail_kbytes(node)

        if gathered_kbytes < needed_kbytes:
            parent_mbytes = container_mbytes(parent)
            if mbytes > parent_mbytes:
                raise error.AutotestError(
                    "New container's %d Mbytes exceeds "
                    "parent container's %d Mbyte size"
                    % (mbytes, parent_mbytes))
            raise error.AutotestError(
                "Existing sibling containers hold "
                "%d Mbytes needed by new container"
                % ((needed_kbytes - gathered_kbytes) // 1024))

        # NOTE(review): when no node was picked the slice [-0:] selects
        # every free node -- confirm whether that is intended.
        mems = free_nodes[-picked:]
        create_container_with_specific_mems_cpus(name, mems, cpus)
    finally:
        my_unlock(lockfile)
def create_container_directly(name, mbytes, cpus):
    """
    Create a container with whichever mechanism is in use.

    :param name: container name relative to the super root; its dirname
                 is taken as the parent container
    :param mbytes: requested memory in megabytes
    :param cpus: cpus to give the container
    """
    parent = os.path.dirname(name)
    if not fake_numa_containers:
        create_container_via_memcg(name, parent, mbytes << 20, cpus)
        return
    _create_fake_numa_container_directly(name, parent, mbytes, cpus)
def create_container_with_mbytes_and_specific_cpus(name, mbytes,
                                                   cpus=None, root=SUPER_ROOT,
                                                   io={}, move_in=True,
                                                   timeout=0):
    """
    Create a cpuset container and move job's current pid into it
    Allocate the list "cpus" of cpus to that container

    :param name: arbitrary string tag
    :param mbytes: requested memory for job in megabytes
    :param cpus (None): list of cpu indices to associate with the cpuset
                        defaults to all cpus avail with given root
    :param root: the parent cpuset to nest this new set within,
                 '' unnested top-level container
    :param io: arguments for proportional IO containers, passed on to
               set_io_controls as keyword arguments
    :param move_in (True): Move current process into the new container now.
    :param timeout (must be 0): persist until explicitly deleted.
                                Not read by this function.
    :return: name of the new container, relative to the super root
    :raise error.AutotestError: if mem containers are unavailable, root
                                does not exist, no cpus result, or the
                                container already exists
    """
    need_mem_containers()
    if not container_exists(root):
        raise error.AutotestError('Parent container "%s" does not exist'
                                  % root)

    # default to biggest container we can make under root;
    # the interface uses a list, internally a set is used
    cpus = get_cpus(root) if cpus is None else set(cpus)
    if not cpus:
        raise error.AutotestError('Creating container with no cpus')

    name = os.path.join(root, name)  # path relative to super_root
    if os.path.exists(full_path(name)):
        raise error.AutotestError('Container %s already exists' % name)

    create_container_directly(name, mbytes, cpus)
    set_io_controls(name, **io)
    if move_in:
        move_self_into_container(name)
    return name
def get_boot_numa():
    """
    Get boot-time numa=fake=xyz option for current boot,
    eg numa=fake=nnn, numa=fake=nnnM, or nothing.

    :return: the text after 'numa=fake=' of the first such argument in
             /proc/cmdline, or '' when there is none
    """
    label = 'numa=fake='
    boot_args = utils.read_one_line('/proc/cmdline').split()
    fake_settings = (arg[len(label):] for arg in boot_args
                     if arg.startswith(label))
    return next(fake_settings, '')