# Source code for executor.concurrent

# Programmer friendly subprocess wrapper.
#
# Author: Peter Odding <peter@peterodding.com>
# Last Change: May 14, 2020
# URL: https://executor.readthedocs.io

"""
Support for concurrent external command execution.

The :mod:`executor.concurrent` module defines the :class:`CommandPool` class
which makes it easy to prepare a large number of external commands, group them
together in a pool, start executing a configurable number of external commands
simultaneously and wait for all external commands to finish. For fine grained
concurrency control please refer to the :attr:`~.ExternalCommand.dependencies`
and :attr:`~.ExternalCommand.group_by` properties of the
:class:`.ExternalCommand` class.
"""

# Standard library modules.
import logging
import multiprocessing
import os

# External dependencies.
from executor import ExternalCommandFailed
from executor import logger as parent_logger
from humanfriendly import Timer
from humanfriendly.terminal.spinners import Spinner
from humanfriendly.text import format, pluralize
from property_manager import PropertyManager, mutable_property

# Initialize a module-level logger named after this module. It is the
# default value of the :attr:`CommandPool.logger` property defined below.
logger = logging.getLogger(__name__)


class CommandPool(PropertyManager):

    """
    Execute multiple external commands concurrently.

    After constructing a :class:`CommandPool` instance you add commands to it
    using :func:`add()` and when you're ready to run the commands you call
    :func:`run()`.

    Internally the pool keeps two pieces of bookkeeping:

    - ``commands`` is a list of ``(identifier, command)`` tuples in the order
      in which the commands were added.
    - ``collected`` is a set with the identifiers of the commands whose
      results have been collected for good (no further retries).
    """

    def __init__(self, concurrency=None, **options):
        """
        Initialize a :class:`CommandPool` object.

        :param concurrency: Override the value of :attr:`concurrency`. A false
                            value (:data:`None` or zero) is ignored, in which
                            case the default of :attr:`concurrency` applies.
        :param logs_directory: Override the value of :attr:`logs_directory`.

        Any other keyword arguments are passed on to the superclass
        initializer (to set writable properties).
        """
        # Initialize instance variables.
        self.collected = set()
        self.commands = []
        # Transform `concurrency' from a positional into a keyword argument.
        if concurrency:
            options['concurrency'] = concurrency
        # Set writable properties based on keyword arguments.
        super(CommandPool, self).__init__(**options)

    @mutable_property
    def concurrency(self):
        """
        The number of external commands that the pool is allowed to run simultaneously.

        This is a positive integer number. It defaults to the return value of
        :func:`multiprocessing.cpu_count()` (which may not make much sense if
        your commands are I/O bound instead of CPU bound).

        Setting :attr:`concurrency` to one is a supported use case intended to
        make it easier for users of the :mod:`executor.concurrent` module to
        reuse the code they've built on top of command pools (if only for
        debugging, but there are lots of use cases :-).
        """
        return multiprocessing.cpu_count()

    @mutable_property
    def delay_checks(self):
        """
        Whether to postpone raising an exception until all commands have run (a boolean).

        If this option is :data:`True` (not the default) and a command with
        :attr:`.check` set to :data:`True` fails the command pool's execution
        is not aborted, instead all commands will be allowed to run. After all
        commands have finished a :exc:`CommandPoolFailed` exception will be
        raised that tells you which command(s) failed.
        """
        return False

    @mutable_property
    def logger(self):
        """
        The :class:`logging.Logger` object to use.

        If you are using Python's :mod:`logging` module and you find it
        confusing that command pool execution is logged under the
        :mod:`executor.concurrent` name space instead of the name space of the
        application or library using :mod:`executor` you can set this attribute
        to inject a custom (and more appropriate) logger.

        The logger is used for the messages emitted by :func:`run()`,
        :func:`spawn()`, :func:`collect()` and :func:`terminate()` and it is
        handed to commands added via :func:`add()` that still use the default
        logger of the :mod:`executor` package.
        """
        return logger

    @mutable_property
    def logs_directory(self):
        """
        The pathname of a directory where captured output is stored (a string).

        If this property is set to the pathname of a directory (before any
        external commands have been started) the merged output of each external
        command is captured and stored in a log file in this directory. The
        directory will be created if it doesn't exist yet.

        Output will start appearing in the log files before the external
        commands are finished, this enables `tail -f`_ to inspect the progress
        of commands that are still running and emitting output.

        .. _tail -f: https://en.wikipedia.org/wiki/Tail_(Unix)#File_monitoring
        """

    @property
    def is_finished(self):
        """:data:`True` if all commands in the pool have finished (including retries), :data:`False` otherwise."""
        return self.num_finished == self.num_commands

    @property
    def num_commands(self):
        """The number of commands in the pool (an integer)."""
        return len(self.commands)

    @property
    def num_finished(self):
        """The number of commands in the pool that have already finished, including retries (an integer)."""
        return sum(cmd.is_finished_with_retries for _, cmd in self.commands)

    @property
    def num_failed(self):
        """The number of commands in the pool that failed (an integer)."""
        return sum(cmd.failed for _, cmd in self.commands)

    @property
    def num_running(self):
        """The number of currently running commands in the pool (an integer)."""
        return sum(cmd.is_running for _, cmd in self.commands)

    @property
    def running_groups(self):
        """
        A set of running command groups.

        The value of :attr:`running_groups` is a :class:`set` with the
        :attr:`~.ExternalCommand.group_by` values of all currently running
        commands (:data:`None` is never included in the set).
        """
        return set(
            cmd.group_by for _, cmd in self.commands
            if cmd.is_running and cmd.group_by is not None
        )

    @property
    def results(self):
        """
        A mapping of identifiers to external command objects.

        This is a dictionary with external command identifiers as keys (refer
        to :func:`add()`) and :class:`.ExternalCommand` objects as values. The
        :class:`.ExternalCommand` objects provide access to the return codes
        and/or output of the finished commands.

        Because the mapping is built from the ``(identifier, command)`` pairs
        registered by :func:`add()`, commands that share an identifier
        collapse into a single entry (the one that was added last).
        """
        return dict(self.commands)

    @mutable_property
    def spinner(self):
        """
        Control if and how an animated spinner is shown when the command pool is active.

        The following values are supported:

        - The default value :data:`None` means "auto detect", which means the
          spinner is shown only when the process is connected to a terminal.

        - The value :data:`True` unconditionally enables the spinner.

        - The value :data:`False` unconditionally disables the spinner.

        - A :class:`.Spinner` object can be provided by the caller, giving them
          the chance to configure how the spinner behaves.
        """
        return None

    @property
    def unexpected_failures(self):
        """
        A list of :class:`~executor.ExternalCommand` objects that *failed unexpectedly*.

        The resulting list includes only commands where :attr:`.check` and
        :attr:`.failed` are both :data:`True`.
        """
        return [cmd for _, cmd in self.commands if cmd.check and cmd.failed]

    def add(self, command, identifier=None, log_file=None):
        """
        Add an external command to the pool of commands.

        :param command: The external command to add to the pool (an
                        :class:`.ExternalCommand` object).
        :param identifier: A unique identifier for the external command (any
                           value). When this parameter is not provided the
                           identifier is set to the number of commands in the
                           pool plus one (i.e. the first command gets id 1).
        :param log_file: Override the default log file name for the command
                         (the identifier with ``.log`` appended) in case
                         :attr:`logs_directory` is set.

        When a command is added to a command pool the following options are
        changed automatically:

        - The :attr:`~executor.ExternalCommand.asynchronous` property is set to
          :data:`True`. If you want the commands to execute with a concurrency
          of one then you should set :attr:`concurrency` to one.

        - The :attr:`~executor.ExternalCommand.tty` property is set to
          :data:`False` when :attr:`concurrency` is higher than one because
          interaction with multiple concurrent subprocesses in a single
          terminal is prone to serious miscommunication (when multiple
          subprocesses present an interactive prompt at the same time and the
          user tries to answer one of the prompts it will be impossible to tell
          which of the subprocesses will receive the user's reply).

        - The command's logger is replaced by :attr:`logger` when the command
          still uses the default logger of the :mod:`executor` package.

        - When :attr:`logs_directory` is set the command's standard output and
          standard error streams are both redirected to the same log file
          (opened in binary append mode).
        """
        # Configure the command to run asynchronously.
        command.asynchronous = True
        # Configure the command to run without a controlling terminal?
        if self.concurrency > 1:
            command.tty = False
        # Override the command's default logger?
        if command.logger == parent_logger:
            command.logger = self.logger
        # Pick a default identifier for the command?
        if identifier is None:
            identifier = len(self.commands) + 1
        # Configure logging of command output?
        if self.logs_directory:
            if log_file is None:
                log_file = '%s.log' % identifier
            pathname = os.path.join(self.logs_directory, log_file)
            # The log file name may contain subdirectories, which is why the
            # directory of the final pathname is created (not just the
            # configured logs directory).
            directory = os.path.dirname(pathname)
            if not os.path.isdir(directory):
                os.makedirs(directory)
            # NOTE(review): the handle is deliberately left open because the
            # command writes to it while running; nothing in this class closes
            # it afterwards - confirm whether the command object takes care of
            # that.
            handle = open(pathname, 'ab')
            command.stdout_file = handle
            command.stderr_file = handle
        # Add the command to the pool.
        self.commands.append((identifier, command))

    def get_spinner(self, timer):
        """
        Get a :class:`.Spinner` to be used by :func:`run()`.

        :param timer: The timer object that is passed on to a newly created
                      spinner.
        :returns: The value of :attr:`spinner` when that is already a
                  :class:`.Spinner` object, otherwise a new :class:`.Spinner`
                  whose ``interactive`` option is set to :attr:`spinner`.
        """
        if isinstance(self.spinner, Spinner):
            return self.spinner
        else:
            return Spinner(interactive=self.spinner, timer=timer)

    def run(self):
        """
        Keep spawning commands and collecting results until all commands have run.

        :returns: The value of :attr:`results`.
        :raises: Any exceptions raised by :func:`collect()`.

        This method calls :func:`spawn()` and :func:`collect()` in a loop until
        all commands registered using :func:`add()` have run and finished. If
        :func:`collect()` raises an exception any running commands are
        terminated before the exception is propagated to the caller.

        If you're writing code where you want to own the main loop then
        consider calling :func:`spawn()` and :func:`collect()` directly instead
        of using :func:`run()`.

        When :attr:`concurrency` is set to one, specific care is taken to make
        sure that the callbacks configured by :attr:`.start_event` and
        :attr:`.finish_event` are called in the expected (intuitive) order.
        """
        # Start spawning processes to execute the commands.
        timer = Timer()
        self.logger.debug("Preparing to run %s with a concurrency of %i ..",
                          pluralize(self.num_commands, "command"),
                          self.concurrency)
        try:
            with self.get_spinner(timer) as spinner:
                num_started = 0
                num_collected = 0
                while not self.is_finished:
                    # When concurrency is set to one (I know, initially it
                    # sounds like a silly use case, bear with me) I want the
                    # start_event and finish_event callbacks of external
                    # commands to fire in the right order. The following
                    # conditional is intended to accomplish this goal: new
                    # commands are only spawned while the number of started
                    # but not yet collected commands is below the limit.
                    if self.concurrency > (num_started - num_collected):
                        num_started += self.spawn()
                    num_collected += self.collect()
                    spinner.step(label=format(
                        "Waiting for %i/%i %s",
                        self.num_commands - self.num_finished, self.num_commands,
                        "command" if self.num_commands == 1 else "commands",
                    ))
                    spinner.sleep()
        except Exception:
            if self.num_running > 0:
                self.logger.warning("Command pool raised exception, terminating running commands!")
            # Terminate commands that are still running.
            self.terminate()
            # Re-raise the exception to the caller.
            raise
        # Collect the output and return code of any commands not yet collected.
        self.collect()
        self.logger.debug("Finished running %s in %s.",
                          pluralize(self.num_commands, "command"),
                          timer)
        # Report the results to the caller.
        return self.results

    def spawn(self):
        """
        Spawn additional external commands up to the :attr:`concurrency` level.

        :returns: The number of external commands that were spawned by this
                  invocation of :func:`spawn()` (an integer).

        The commands to start are picked according to three criteria:

        1. The command's :attr:`~.ExternalCommand.was_started` property is
           :data:`False` (or the command may be retried and is not currently
           running).
        2. The command's :attr:`~.ExternalCommand.group_by` value is not
           present in :attr:`running_groups`.
        3. The :attr:`~.ExternalCommand.is_finished_with_retries`
           properties of all of the command's :attr:`~.ExternalCommand.dependencies`
           are :data:`True`.
        """
        num_started = 0
        # The number of free "slots" that this invocation is allowed to fill.
        limit = self.concurrency - self.num_running
        if limit > 0:
            # Take a snapshot of the running groups and keep it up to date
            # locally while commands are being started in the loop below.
            running_groups = self.running_groups
            for _, cmd in self.commands:
                # Skip commands that have already been started and cannot be retried.
                if (not cmd.was_started) or (cmd.retry_allowed and not cmd.is_running):
                    # If command groups are being used we'll only
                    # allow one running command per command group.
                    if cmd.group_by not in running_groups:
                        # If a command has any dependencies we won't allow it
                        # to start until all of its dependencies have finished.
                        if all(d.is_finished_with_retries for d in cmd.dependencies):
                            cmd.start()
                            num_started += 1
                            if cmd.group_by is not None:
                                running_groups.add(cmd.group_by)
                            if num_started == limit:
                                break
        if num_started > 0:
            self.logger.debug("Spawned %s ..", pluralize(num_started, "external command"))
        return num_started

    def collect(self):
        """
        Collect the exit codes and output of finished commands.

        :returns: The number of external commands that were collected by this
                  invocation of :func:`collect()` (an integer).
        :raises: If :attr:`delay_checks` is :data:`True`:
                  After all external commands have started and finished, if any
                  commands that have :attr:`~.ExternalCommand.check` set to
                  :data:`True` failed :exc:`CommandPoolFailed` is raised.
                 If :attr:`delay_checks` is :data:`False`:
                  The exceptions :exc:`.ExternalCommandFailed`,
                  :exc:`.RemoteCommandFailed` and :exc:`.RemoteConnectFailed`
                  can be raised if a command in the pool that has
                  :attr:`~.ExternalCommand.check` set to :data:`True` fails.
                  The :attr:`~.ExternalCommandFailed.pool` attribute of the
                  exception will be set to the pool.

        .. warning:: If an exception is raised, commands that are still running
                     will not be terminated! If this concerns you then consider
                     calling :func:`terminate()` when that happens
                     (:func:`run()` does this from its exception handler).
        """
        num_collected = 0
        for identifier, command in self.commands:
            if identifier not in self.collected and command.is_finished:
                try:
                    # When checks are delayed the command's own error checking
                    # is disabled (failures are reported at the end instead).
                    command.wait(check=False if self.delay_checks else None)
                except ExternalCommandFailed as e:
                    if not command.retry_allowed:
                        # Propagate exceptions that can't be retried.
                        e.pool = self
                        raise
                finally:
                    # Update our bookkeeping even if wait() raised an exception.
                    if not command.retry_allowed:
                        self.collected.add(identifier)
                # We count retries as collected commands in order to
                # preserve the symmetry between the return values of
                # spawn() and collect() because run() depends on it.
                num_collected += 1
        if num_collected > 0:
            self.logger.debug("Collected %s ..", pluralize(num_collected, "external command"))
        # Check if delayed error checking was requested and is applicable.
        if self.delay_checks and self.is_finished and self.unexpected_failures:
            raise CommandPoolFailed(pool=self)
        return num_collected

    def terminate(self):
        """
        Terminate any commands that are currently running.

        :returns: The number of commands that were terminated (an integer).

        If :func:`terminate()` successfully terminates commands, you then call
        :func:`collect()` and the :attr:`.check` property of a terminated
        command is :data:`True` you will get an exception because terminated
        commands (by definition) report a nonzero
        :attr:`~executor.ExternalCommand.returncode`.
        """
        num_terminated = 0
        for _, command in self.commands:
            # Only commands for which terminate() reports success are counted.
            if command.terminate():
                num_terminated += 1
        if num_terminated > 0:
            self.logger.warning("Terminated %s ..", pluralize(num_terminated, "external command"))
        return num_terminated


class CommandPoolFailed(Exception):

    """
    Raised by :func:`~CommandPool.collect()` when not all commands succeeded.

    This exception is only raised when :attr:`~CommandPool.delay_checks` is
    :data:`True`.
    """

    def __init__(self, pool):
        """
        Initialize a :class:`CommandPoolFailed` object.

        :param pool: The :class:`CommandPool` object that triggered the
                     exception.
        """
        # The pool must be stored first because `error_message' reads it.
        self.pool = pool
        super(CommandPoolFailed, self).__init__(self.error_message)

    @property
    def commands(self):
        """A shortcut for :attr:`.unexpected_failures`."""
        return self.pool.unexpected_failures

    @property
    def error_message(self):
        """
        An error message that explains which commands *failed unexpectedly* (a string).

        The message consists of a summary line with the number of unexpected
        failures and the total number of commands in the pool, followed by one
        line with details for each command that failed unexpectedly.
        """
        # Compute the list once so that the number in the summary always
        # matches the number of detail lines (commands that failed with
        # `check' disabled are not unexpected failures and are not counted).
        failures = self.commands
        summary = format("%i out of %s failed unexpectedly:",
                         len(failures),
                         pluralize(self.pool.num_commands, "command"))
        details = "\n".join(" - %s" % cmd.error_message for cmd in failures)
        return summary + "\n\n" + details