Source code for matador.workflows.workflows

# coding: utf-8
# Distributed under the terms of the MIT License.

""" This module implements various workflows, ways
of chaining up different calculations at high-throughput.

"""

import abc
import os
import logging
from matador.utils.print_utils import dumps

# Module-level logger used by every Workflow and WorkflowStep defined below;
# all messages are emitted under the "run3" logger name.
LOG = logging.getLogger("run3")


class Workflow:
    """Workflow objects are bundles of calculations defined as
    :obj:`WorkflowStep` objects. Each :obj:`WorkflowStep` takes three
    arguments: the :obj:`matador.compute.ComputeTask` object used to run
    the calculations, the calculation parameters (which can be modified
    by each step), the seed name.

    Constructing a Workflow runs it: `__init__` calls `preprocess`,
    `run_steps`, `postprocess` (on success only) and `_clean_up`.

    Subclasses populate the workflow by overriding `preprocess`, which
    must add at least one step (otherwise `run_steps` raises), and may
    override `postprocess`. Note that this class does not use
    :obj:`abc.ABCMeta`, so the `abc.abstractmethod` markers on those two
    methods are not enforced when a subclass is instantiated.

    Attributes:
        computer (:obj:`matador.compute.ComputeTask`): the object that will
            be running the computation.
        calc_doc (dict): the interim dictionary of structural and
            calculation parameters.
        seed (str): the root seed for the calculation.
        label (str): the name of the type of the Workflow object.
        compute_dir (str): copied from `computer.compute_dir`; when truthy,
            results are also collected from this directory during clean up.
        success (bool): the status of the workflow. This is only set to
            True after final step completes, but BEFORE post-processing.
        steps (:obj:`list` of :obj:`WorkflowStep`): list of steps to be
            completed.
        clean_after_step (:obj:`list` of :obj:`bool`): one flag per step,
            stating whether `_clean_up` is called right after that step.
        workflow_params (dict): any extra keyword arguments passed to the
            constructor.

    """

    def __init__(self, computer, calc_doc, seed, **workflow_kwargs):
        """Initialise the Workflow object from a
        :obj:`matador.compute.ComputeTask`, calculation parameters and the
        seed name, then run it to completion.

        Parameters:
            computer (:obj:`matador.compute.ComputeTask`): the object that
                will be running the computation.
            calc_doc (dict): dictionary of structure and calculation
                parameters.
            seed (str): root seed for the calculation.

        Keyword arguments:
            workflow_kwargs (dict): stored as `workflow_params` for use by
                subclasses.

        Raises:
            RuntimeError: if any part of the calculation fails.

        """
        self.computer = computer
        self.calc_doc = calc_doc
        self.seed = seed
        self.label = self.__class__.__name__
        self.compute_dir = computer.compute_dir
        self.success = None
        self.steps = []
        self.clean_after_step = []
        self.workflow_params = workflow_kwargs

        LOG.info("Performing Workflow of type {} on {}".format(self.label, self.seed))

        # Read the settings once, from the same place they are used, so the
        # None-guard and the lookups below cannot disagree.
        settings = self.computer.kwargs.get("run3_settings")
        if settings is not None:
            # check that computer.exec was not overriden at cmd-line, then check settings file
            if (
                settings.get("castep_executable") is not None
                and self.computer.executable == "castep"
            ):
                self.castep_executable = settings.get("castep_executable", "castep")
                self.computer.executable = self.castep_executable
            if settings.get("optados_executable") is not None:
                self.optados_executable = settings.get("optados_executable", "optados")
                self.computer.optados_executable = self.optados_executable

        self.preprocess()
        LOG.info(
            "Preprocessing completed, steps to perform: {}".format(
                [step.name for step in self.steps]
            )
        )

        try:
            self.run_steps()
        except RuntimeError:
            # postprocess() is deliberately skipped on failure; only the
            # file clean up is performed before re-raising.
            LOG.critical("Workflow failed: cleaning up.")
            self._clean_up()
            raise

        if self.success:
            self.postprocess()

        self._clean_up()

    @abc.abstractmethod
    def preprocess(self):
        """This function is run at the start of the workflow, and is
        responsible for adding WorkflowStep objects to the Workflow
        (e.g. via `add_step`).

        """

    @abc.abstractmethod
    def postprocess(self):
        """This OPTIONAL function is run upon successful completion of all
        steps of the workflow and can be overloaded by the subclass to
        perform any postprocessing steps. This occurs *before* cleaning up
        the directory (i.e. moving to completed/bad_castep).

        """

    def _move_results(self, success, origin, skip_existing):
        """Hand the files for the computer's current seed to the computer's
        `mv_to_completed` or `mv_to_bad` method, logging which one was chosen.

        Parameters:
            success: truthy to move to the completed folder, falsy to move
                to the bad_castep folder.
            origin (str): phrase spliced into the log message to say where
                the files are being collected from.
            skip_existing (bool): forwarded to `computer.mv_to_completed`.

        """
        if success:
            LOG.info(
                "Writing results {} Workflow {} run to completed folder and tidying up.".format(
                    origin, self.label
                )
            )
            self.computer.mv_to_completed(
                self.computer.seed, keep=True, skip_existing=skip_existing
            )
        else:
            LOG.info(
                "Writing results {} failed Workflow {} run to bad_castep folder and tidying up.".format(
                    origin, self.label
                )
            )
            self.computer.mv_to_bad(self.computer.seed)

    def _clean_up(self, success=None):
        """This method moves files to `completed/` or `bad_castep/`
        depending on the status of the workflow. It will use the current
        seed of the computer, so this function can be called at
        intermediate steps if this seed changes.

        Files are collected twice: first from inside `compute_dir` (if one
        is set), then from the current working directory.

        Parameters:
            success: overrides the workflow status; if None, `self.success`
                is used instead. Any falsy value (including a status that
                is still None) sends the files to `bad_castep/`.

        """
        if success is None:
            success = self.success

        if self.compute_dir:
            cwd = os.getcwd()
            os.chdir(self.compute_dir)
            try:
                self._move_results(success, "from compute dir of", skip_existing=False)
            finally:
                # Always return to the starting directory, even if the move
                # fails, so later steps do not run from the wrong place.
                os.chdir(cwd)

        self._move_results(success, "of", skip_existing=True)

    def add_step(
        self,
        function,
        name,
        input_exts=None,
        output_exts=None,
        clean_after=False,
        **func_kwargs
    ):
        """Add a step to the workflow.

        Parameters:
            function (Function): the function to run in the step; must
                accept arguments of (self.computer, self.calc_doc, self.seed).
            name (str): the desired name for the step (human-readable).

        Keyword arguments:
            input_exts (list): input file extensions for the step to cache,
                passed through to :obj:`WorkflowStep`.
            output_exts (list): output file extensions for the step to
                cache, passed through to :obj:`WorkflowStep`.
            clean_after (bool): whether or not to clean up after this step
                is called.
            func_kwargs (dict): any arguments to pass to function when called.

        """
        self.steps.append(
            WorkflowStep(
                function,
                name,
                self.compute_dir,
                input_exts,
                output_exts,
                **func_kwargs
            )
        )
        # Kept in lockstep with self.steps: run_steps indexes both lists
        # with the same position.
        self.clean_after_step.append(clean_after)

    def run_steps(self):
        """Loop over steps and run them, setting `self.success` to True once
        the loop finishes or to False if a step raises.

        Raises:
            RuntimeError: if no steps have been added, or if any step
                raises a RuntimeError.

        """
        try:
            if not self.steps:
                msg = "No steps added to Workflow!"
                LOG.error(msg)
                raise RuntimeError(msg)

            for ind, step in enumerate(self.steps):
                LOG.info("Running step {step.name}: {step.function}".format(step=step))
                LOG.debug("Current state: " + dumps(self.calc_doc, indent=None))
                success = step.run_step(self.computer, self.calc_doc, self.seed)
                if self.clean_after_step[ind]:
                    self._clean_up(success=success)

            self.success = True

        except RuntimeError as exc:
            self.success = False
            msg = "{} workflow exiting...".format(self.label)
            LOG.error(msg)
            raise RuntimeError(msg) from exc
class WorkflowStep:
    """An individual step in a Workflow, defined by a Python function
    and a name. The function will be called with arguments
    (computer, calc_doc, seed) with the run_step method.

    Attributes:
        function (function): the function to call.
        name (str): the human-readable name of the step.
        compute_dir (str): the folder that computer will perform the
            calculation in.
        func_kwargs (dict): any extra kwargs to pass to the function.
        input_exts (list): list of input file extensions to cache after
            running.
        output_exts (list): list of output file extensions to cache after
            running.
        success: the value returned by the last call of the function
            (False until the step has been run, and after a failure).

    """

    success = False

    def __init__(
        self,
        function,
        name,
        compute_dir=None,
        input_exts=None,
        output_exts=None,
        **func_kwargs
    ):
        """Construct a WorkflowStep from a function.

        Parameters:
            function (function): the function to call in `run_step`.
            name (str): the human-readable name of the step; also used as
                the suffix for cached files.

        Keyword arguments:
            compute_dir (str): directory to change into while caching files.
            input_exts (list): input file extensions to cache.
            output_exts (list): output file extensions to cache.
            func_kwargs (dict): extra keyword arguments passed to function.

        """
        LOG.debug("Constructing WorkflowStep: {}".format(name))
        self.function = function
        self.name = name
        self.compute_dir = compute_dir
        self.func_kwargs = func_kwargs
        self.input_exts = input_exts
        self.output_exts = output_exts

    def _cache_files(self, seed, exts, mode, directory=None):
        """Copy any files <seed>.<ext> for ext in exts to
        <seed>.<ext>_<name>, relative to the current working directory.

        Parameters:
            seed (str): seed for the workflow step.
            exts (:obj:`list` of :obj:`str`): list of file extensions,
                including '.'. An extension containing '*' is expanded
                with glob.
            mode (str): either 'in' (warning printed if file missing)
                or 'out' (no warning).

        Keyword arguments:
            directory: currently unused; kept so existing callers that
                pass it continue to work.

        """
        import shutil
        import glob

        for ext in exts:
            pattern = "{}{}".format(seed, ext)
            srcs = glob.glob(pattern) if "*" in ext else [pattern]
            for src in srcs:
                dst = src + "_{}".format(self.name)
                if os.path.isfile(src):
                    shutil.copy2(src, dst, follow_symlinks=True)
                    LOG.info("Backed up {} file {} to {}.".format(mode, src, dst))
                elif mode == "in":
                    error = "Failed to cache input file {} for step {}.".format(
                        src, self.name
                    )
                    LOG.warning(error)

    def _cache_inputs(self, seed):
        """Save any input files for the WorkflowStep with appropriate
        suffix as determined by the WorkflowStep name.

        All files with <seed>.<ext> will be copied to <seed>.<ext>_<name>,
        for any <ext> inside the `input_exts` attribute. This is called
        after the WorkflowStep has finished, even if it does not succeed...

        Parameters:
            seed (str): seed for the workflow step.

        """
        if self.input_exts is not None:
            self._cache_files(seed, self.input_exts, "in")

    def _cache_outputs(self, seed):
        """Save any output files for the WorkflowStep with appropriate
        suffix as determined by the WorkflowStep name.

        All files with <seed>.<ext> will be copied to <seed>.<ext>_<name>,
        for any <ext> inside the `output_exts` attribute.

        Parameters:
            seed (str): seed for the workflow step.

        """
        if self.output_exts is not None:
            self._cache_files(seed, self.output_exts, "out")

    def cache_files(self, seed):
        """Wrapper for calling both _cache_inputs and _cache_outputs from
        inside `compute_dir` (when one is set).

        Missing files are not an error, but any exception raised while
        copying is propagated to the caller. The working directory is
        restored in either case.

        Parameters:
            seed (str): seed for the workflow step.

        """
        if self.compute_dir is None:
            self._cache_inputs(seed)
            self._cache_outputs(seed)
            return

        cwd = os.getcwd()
        os.chdir(self.compute_dir)
        try:
            self._cache_inputs(seed)
            self._cache_outputs(seed)
        finally:
            # Never leave the process stranded inside compute_dir.
            os.chdir(cwd)

    def run_step(self, computer, calc_doc, seed):
        """Run the workflow step.

        Parameters:
            computer (:obj:`matador.compute.ComputeTask`): the object that
                will be running the computation.
            calc_doc (dict): dictionary of structure and calculation
                parameters.
            seed (str): root seed for the calculation.

        Returns:
            whatever the step function returned: None means the step was
            skipped (no files are cached), otherwise its truthiness says
            whether the step succeeded.

        Raises:
            RuntimeError: if any step fails.

        """
        try:
            LOG.info("WorkflowStep {} starting...".format(self.name))
            self.success = self.function(computer, calc_doc, seed, **self.func_kwargs)
        except RuntimeError as exc:
            msg = "WorkflowStep {} failed with error {}.".format(self.name, exc)
            LOG.error(msg)
            self.success = False
            # Keep a copy of the inputs/outputs of the failed step before
            # handing the error back to the Workflow.
            self.cache_files(seed)
            raise

        if self.success is None:
            LOG.info(
                "WorkflowStep {} skipped, did you provide all the input files?".format(
                    self.name
                )
            )
            return self.success

        if self.success:
            LOG.info("WorkflowStep {} completed successfully.".format(self.name))
        else:
            LOG.warning("WorkflowStep {} was unsuccessful.".format(self.name))

        self.cache_files(seed)

        return self.success