Source code for idmtools_platform_comps.comps_operations.simulation_operations

"""idmtools comps simulation operations.

Copyright 2021, Bill & Melinda Gates Foundation. All rights reserved.
"""
from concurrent.futures.thread import ThreadPoolExecutor
from contextlib import suppress
from threading import Lock
import time
import json
import uuid
from dataclasses import dataclass, field
from COMPS.Data.Simulation import SimulationState
from functools import partial
from logging import getLogger, DEBUG
from typing import Any, List, Dict, Type, Optional, TYPE_CHECKING
from uuid import UUID
from COMPS.Data import Simulation as COMPSSimulation, QueryCriteria, Experiment as COMPSExperiment, SimulationFile, \
    Configuration
from idmtools.assets import AssetCollection, Asset
from idmtools.core import ItemType, EntityStatus
from idmtools.core.task_factory import TaskFactory
from idmtools.entities import CommandLine
from idmtools.entities.command_task import CommandTask
from idmtools.entities.experiment import Experiment
from idmtools.entities.iplatform_ops.iplatform_simulation_operations import IPlatformSimulationOperations
from idmtools.entities.iplatform_ops.utils import batch_create_items
from idmtools.entities.simulation import Simulation
from idmtools.utils.json import IDMJSONEncoder
from idmtools_platform_comps.utils.general import convert_comps_status, get_asset_for_comps_item, clean_experiment_name
from idmtools_platform_comps.utils.scheduling import scheduled

if TYPE_CHECKING:  # pragma: no cover
    from idmtools_platform_comps.comps_platform import COMPSPlatform

logger = getLogger(__name__)
user_logger = getLogger('user')

# Track commissioning in batches
COMPS_EXPERIMENT_BATCH_COMMISSION_LOCK = Lock()
COMPS_EXPERIMENT_BATCH_COMMISSION_TIMESTAMP = 0


def comps_batch_worker(simulations: List[Simulation], interface: 'CompsPlatformSimulationOperations', executor,
                       num_cores: Optional[int] = None, priority: Optional[str] = None,
                       asset_collection_id: str = None, min_time_between_commissions: int = 10,
                       **kwargs) -> List[COMPSSimulation]:
    """
    Run batch worker.

    Args:
        simulations: Batch of simulations to process
        interface: SimulationOperations interface
        executor: Thread/process executor
        num_cores: Optional number of cores to allocate for MPI
        priority: Optional priority to set
        asset_collection_id: Override asset collection id
        min_time_between_commissions: Minimum amount of time (in seconds) between calls to commission on an experiment
        **kwargs: Extra arguments passed through to to_comps_sim

    Returns:
        List of COMPS Simulations
    """
    global COMPS_EXPERIMENT_BATCH_COMMISSION_LOCK, COMPS_EXPERIMENT_BATCH_COMMISSION_TIMESTAMP
    if logger.isEnabledFor(DEBUG):
        logger.debug(f'Converting {len(simulations)} to COMPS')
    created_simulations = []
    new_sims = 0
    for simulation in simulations:
        if simulation.status is None:
            interface.pre_create(simulation)
            new_sims += 1
            simulation.platform = interface.platform
            simulation._platform_object = interface.to_comps_sim(simulation, num_cores=num_cores, priority=priority,
                                                                 asset_collection_id=asset_collection_id, **kwargs)
            created_simulations.append(simulation)

    if logger.isEnabledFor(DEBUG):
        logger.debug(f'Finished converting to COMPS. Starting saving of {len(simulations)}')
    COMPSSimulation.save_all(None, save_semaphore=COMPSSimulation.get_save_semaphore())
    if logger.isEnabledFor(DEBUG):
        logger.debug(f'Finished saving of {len(simulations)}. Starting post_create')
    for simulation in simulations:
        simulation.uid = simulation.get_platform_object().id
        simulation.status = convert_comps_status(simulation.get_platform_object().state)
        interface.post_create(simulation)
    if logger.isEnabledFor(DEBUG):
        logger.debug(f'Finished post-create of {len(simulations)}')

    current_time = time.time()
    # check current commission queue and last commission call
    if new_sims > 0 and current_time - COMPS_EXPERIMENT_BATCH_COMMISSION_TIMESTAMP > min_time_between_commissions:
        # be aggressive in waiting on the lock. Worst case, another thread triggers this near the same time
        locked = COMPS_EXPERIMENT_BATCH_COMMISSION_LOCK.acquire(timeout=0.015)
        if locked:
            # do commission asynchronously. It is fine if we happen to miss a commission
            def do_commission():
                with suppress(RuntimeError):
                    simulations[0].experiment.get_platform_object().commission()
            executor.submit(do_commission)
            COMPS_EXPERIMENT_BATCH_COMMISSION_TIMESTAMP = current_time
            COMPS_EXPERIMENT_BATCH_COMMISSION_LOCK.release()
    for simulation in simulations:
        simulation.status = EntityStatus.RUNNING
    return simulations
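# A hedged usage note (illustrative, not part of this module): the commission throttle above is
# driven by the platform's min_time_between_commissions option, which batch_create below reads
# from the platform object. Assuming the standard idmtools platform factory, tuning it when
# creating the platform might look like:
#
#     from idmtools.core.platform_factory import Platform
#     platform = Platform("COMPS", min_time_between_commissions=30)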
@dataclass
class CompsPlatformSimulationOperations(IPlatformSimulationOperations):
    """Provides simulation operations to COMPSPlatform."""
    platform: 'COMPSPlatform'  # noqa F821
    platform_type: Type = field(default=COMPSSimulation)
    def get(self, simulation_id: UUID, columns: Optional[List[str]] = None, load_children: Optional[List[str]] = None,
            query_criteria: Optional[QueryCriteria] = None, **kwargs) -> COMPSSimulation:
        """
        Get Simulation from COMPS.

        Args:
            simulation_id: Simulation ID
            columns: Optional list of columns to load. Defaults to "id", "name", "experiment_id", "state"
            load_children: Optional children to load. Defaults to "tags", "configuration", "files"
            query_criteria: Optional QueryCriteria object to use your own custom criteria
            **kwargs:

        Returns:
            COMPSSimulation
        """
        columns = columns or ["id", "name", "experiment_id", "state"]
        children = load_children if load_children is not None else ["tags", "configuration", "files"]
        query_criteria = query_criteria or QueryCriteria().select(columns).select_children(children)
        return COMPSSimulation.get(
            id=simulation_id,
            query_criteria=query_criteria
        )
    def platform_create(self, simulation: Simulation, num_cores: int = None, priority: str = None,
                        enable_platform_task_hooks: bool = True, asset_collection_id: str = None,
                        **kwargs) -> COMPSSimulation:
        """
        Create Simulation on COMPS.

        Args:
            simulation: Simulation to create
            num_cores: Optional number of MPI cores to allocate
            priority: Optional priority to set
            enable_platform_task_hooks: Should platform task hooks be run
            asset_collection_id: Override for the asset collection id on the simulation
            **kwargs: Expansion fields

        Returns:
            COMPS Simulation
        """
        from idmtools_platform_comps.utils.python_version import platform_task_hooks
        if enable_platform_task_hooks:
            simulation.task = platform_task_hooks(simulation.task, self.platform)
        s = self.to_comps_sim(simulation, num_cores=num_cores, priority=priority,
                              asset_collection_id=asset_collection_id, **kwargs)
        COMPSSimulation.save(s, save_semaphore=COMPSSimulation.get_save_semaphore())
        return s
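    # Minimal sketch (assumptions: "platform" is an already-configured COMPSPlatform and
    # "simulation" an idmtools Simulation with a task; names are illustrative). Simulations are
    # normally created through the higher-level experiment workflow, but calling the operations
    # interface directly would look like:
    #
    #     comps_sim = platform._simulations.platform_create(simulation, num_cores=4, priority="Highest")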
    def to_comps_sim(self, simulation: Simulation, num_cores: int = None, priority: str = None,
                     config: Configuration = None, asset_collection_id: str = None, **kwargs):
        """
        Convert an idmtools object to a COMPS object.

        Args:
            simulation: Simulation object to convert
            num_cores: Optional number of MPI cores to allocate
            priority: Optional priority
            config: Optional Configuration object
            asset_collection_id: Override for the simulation asset collection id
            **kwargs: Additional options for COMPS

        Returns:
            COMPS Simulation
        """
        if config is None:
            if asset_collection_id and isinstance(asset_collection_id, str):
                asset_collection_id = uuid.UUID(asset_collection_id)
            kwargs['num_cores'] = num_cores
            kwargs['priority'] = priority
            kwargs['asset_collection_id'] = asset_collection_id
            kwargs.update(simulation._platform_kwargs)
            config = self.get_simulation_config_from_simulation(simulation, **kwargs)
        if simulation.name:
            simulation.name = clean_experiment_name(simulation.name)

        s = COMPSSimulation(
            name=clean_experiment_name(simulation.experiment.name if not simulation.name else simulation.name),
            experiment_id=simulation.parent_id,
            configuration=config
        )

        self.send_assets(simulation, s, **kwargs)
        s.set_tags(simulation.tags)
        simulation._platform_object = s
        return s
    def get_simulation_config_from_simulation(self, simulation: Simulation, num_cores: int = None,
                                              priority: str = None, asset_collection_id: str = None,
                                              **kwargs) -> Configuration:
        """
        Get the COMPS configuration for a Simulation object.

        Args:
            simulation: Simulation
            num_cores: Optional number of cores for MPI
            priority: Optional priority
            asset_collection_id: Override simulation asset_collection_id
            **kwargs: Additional options for COMPS

        Returns:
            Configuration
        """
        global_scheduling = kwargs.get("scheduling", False)
        sim_scheduling = getattr(simulation, 'scheduling', False)
        scheduling = global_scheduling and sim_scheduling
        comps_configuration = dict()
        if global_scheduling:
            config = getattr(self.platform, 'comps_config', {})
            comps_exp_config = Configuration(**config)
        else:
            comps_exp: COMPSExperiment = simulation.parent.get_platform_object()
            comps_exp_config: Configuration = comps_exp.configuration
        if asset_collection_id:
            comps_configuration['asset_collection_id'] = asset_collection_id
        if num_cores is not None and num_cores != comps_exp_config.max_cores:
            logger.info(f'Overriding cores for sim to {num_cores}')
            comps_configuration['max_cores'] = num_cores
            comps_configuration['min_cores'] = num_cores
        if priority is not None and priority != comps_exp_config.priority:
            logger.info(f'Overriding priority for sim to {priority}')
            comps_configuration['priority'] = priority
        if comps_exp_config.executable_path != simulation.task.command.executable:
            logger.info(f'Overriding executable_path for sim to {simulation.task.command.executable}')
            from idmtools_platform_comps.utils.python_version import platform_task_hooks
            platform_task_hooks(simulation.task, self.platform)
            comps_configuration['executable_path'] = simulation.task.command.executable
        sim_task = simulation.task.command.arguments + " " + simulation.task.command.options
        sim_task = sim_task.strip()
        if comps_exp_config.simulation_input_args != sim_task:
            logger.info(f'Overriding simulation_input_args for sim to {sim_task}')
            comps_configuration['simulation_input_args'] = sim_task
        if logger.isEnabledFor(DEBUG):
            logger.debug(f'Simulation config: {str(comps_configuration)}')
        if scheduling:
            comps_configuration.update(executable_path=None, node_group_name=None, min_cores=None, max_cores=None,
                                       exclusive=None, simulation_input_args=None)
        return Configuration(**comps_configuration)
    def batch_create(self, simulations: List[Simulation], num_cores: int = None, priority: str = None,
                     asset_collection_id: str = None, **kwargs) -> List[COMPSSimulation]:
        """
        Perform batch creation of simulations.

        Args:
            simulations: Simulations to create
            num_cores: Optional MPI cores to allocate per simulation
            priority: Optional priority
            asset_collection_id: Asset collection id for the simulation (overrides the experiment)
            **kwargs: Future expansion

        Returns:
            List of COMPSSimulations that were created
        """
        global COMPS_EXPERIMENT_BATCH_COMMISSION_TIMESTAMP
        executor = ThreadPoolExecutor()
        thread_func = partial(comps_batch_worker, interface=self, num_cores=num_cores, priority=priority,
                              asset_collection_id=asset_collection_id,
                              min_time_between_commissions=self.platform.min_time_between_commissions,
                              executor=executor, **kwargs)
        results = batch_create_items(
            simulations,
            batch_worker_thread_func=thread_func,
            progress_description="Creating Simulations on Comps",
            unit="simulation"
        )
        # Always commission again
        try:
            results[0].parent.get_platform_object().commission()
        except RuntimeError as ex:
            # occasionally we hit this because of double commissioning. It's ok to ignore though, because it
            # means we have already commissioned this experiment
            if logger.isEnabledFor(DEBUG):
                logger.debug(f"COMMISSION Response: {ex.args}")
        COMPS_EXPERIMENT_BATCH_COMMISSION_TIMESTAMP = 0
        # set commission here in comps objects to prevent commission in Experiment when batching
        for sim in results:
            sim.status = EntityStatus.RUNNING
            sim.get_platform_object()._state = SimulationState.CommissionRequested
        return results
    def get_parent(self, simulation: Any, **kwargs) -> COMPSExperiment:
        """
        Get the parent of the simulation.

        Args:
            simulation: Simulation to load parent for
            **kwargs:

        Returns:
            COMPSExperiment
        """
        return self.platform._experiments.get(simulation.experiment_id, **kwargs) if simulation.experiment_id else None
    def platform_run_item(self, simulation: Simulation, **kwargs):
        """For simulations, there is no running for COMPS."""
        pass
    def send_assets(self, simulation: Simulation, comps_sim: Optional[COMPSSimulation] = None,
                    add_metadata: bool = False, **kwargs):
        """
        Send assets to the simulation.

        Args:
            simulation: Simulation to send assets for
            comps_sim: Optional COMPSSimulation object to prevent reloading it
            add_metadata: Add idmtools metadata object
            **kwargs:

        Returns:
            None
        """
        scheduling = kwargs.get("scheduling", False) and scheduled(simulation)
        if comps_sim is None:
            comps_sim = simulation.get_platform_object()
        for asset in simulation.assets:
            if asset.filename.lower() == 'workorder.json' and scheduling:
                comps_sim.add_file(simulationfile=SimulationFile(asset.filename, 'WorkOrder'), data=asset.bytes)
            else:
                comps_sim.add_file(simulationfile=SimulationFile(asset.filename, 'input'), data=asset.bytes)

        # add metadata
        if add_metadata:
            if logger.isEnabledFor(DEBUG):
                logger.debug("Creating idmtools metadata for simulation and task on COMPS")
            # later we should add some filtering for passwords and such here in case anything weird happens
            metadata = json.dumps(simulation.task.to_dict(), cls=IDMJSONEncoder)
            from idmtools import __version__
            comps_sim.add_file(
                SimulationFile("idmtools_metadata.json", 'input', description=f'IDMTools {__version__}'),
                data=metadata.encode()
            )
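    # Hedged example of the WorkOrder branch above: with COMPS scheduling enabled, a workorder.json
    # asset is uploaded with file type "WorkOrder" instead of "input". The scheduling helpers in
    # idmtools_platform_comps.utils.scheduling are typically used to attach it; the argument names
    # below are assumptions, so check that module for the exact signature:
    #
    #     from idmtools_platform_comps.utils.scheduling import add_work_order
    #     add_work_order(simulation, file_path="./WorkOrder.json")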
    def refresh_status(self, simulation: Simulation, additional_columns: Optional[List[str]] = None, **kwargs):
        """
        Refresh status of a simulation.

        Args:
            simulation: Simulation to refresh
            additional_columns: Optional additional columns to load from COMPS
            **kwargs:

        Returns:
            None
        """
        cols = ['state']
        if additional_columns:
            cols.extend(additional_columns)
        s = COMPSSimulation.get(id=simulation.uid, query_criteria=QueryCriteria().select(cols))
        simulation.status = convert_comps_status(s.state)
    def to_entity(self, simulation: COMPSSimulation, load_task: bool = False, parent: Optional[Experiment] = None,
                  load_parent: bool = False, load_metadata: bool = False, load_cli_from_workorder: bool = False,
                  **kwargs) -> Simulation:
        """
        Convert a COMPS simulation object to an idmtools simulation object.

        Args:
            simulation: Simulation object
            load_task: Should we load the task? Defaults to False. This can increase the number of items loaded on fetches
            parent: Optional parent object to prevent reloads
            load_parent: Force load of the parent (beware, this could cause loading loops)
            load_metadata: Should we load metadata by default? If load_task is enabled, this is also enabled
            load_cli_from_workorder: Used with COMPS scheduling where the CLI is defined in our workorder
            **kwargs:

        Returns:
            Simulation object
        """
        # Recreate the experiment if needed
        if parent is None or load_parent:
            if logger.isEnabledFor(DEBUG):
                logger.debug(f'Loading parent {simulation.experiment_id}')
            parent = self.platform.get_item(simulation.experiment_id, item_type=ItemType.EXPERIMENT)
        # Get a simulation
        obj = Simulation()
        obj.platform = self.platform
        obj._platform_object = simulation
        obj.parent = parent
        obj.experiment = parent
        # Set its correct attributes
        obj.uid = simulation.id
        obj.tags = simulation.tags
        obj.status = convert_comps_status(simulation.state)
        if simulation.files:
            obj.assets = self.platform._assets.to_entity(simulation.files)

        # should we load metadata
        metadata = self.__load_metadata_from_simulation(obj) if load_metadata else None
        if load_task:
            self._load_task_from_simulation(obj, simulation, metadata)
        else:
            obj.task = None
        self.__extract_cli(simulation, parent, obj, load_cli_from_workorder)

        # call task load options (load configs from files, etc)
        obj.task.reload_from_simulation(obj)
        return obj
    def get_asset_collection_from_comps_simulation(self, simulation: COMPSSimulation) -> Optional[AssetCollection]:
        """
        Get assets from a COMPS simulation.

        Args:
            simulation: Simulation to get assets from

        Returns:
            Simulation asset collection, if any.
        """
        if simulation.configuration and simulation.configuration.asset_collection_id:
            return self.platform.get_item(simulation.configuration.asset_collection_id, ItemType.ASSETCOLLECTION)
        return None
    def _load_task_from_simulation(self, simulation: Simulation, comps_sim: COMPSSimulation, metadata: Dict = None):
        """
        Load the task from the simulation object.

        Args:
            simulation: Simulation to populate with the task
            comps_sim: COMPS simulation object
            metadata: Metadata loaded to be used in the task object

        Returns:
            None
        """
        simulation.task = None
        if comps_sim.tags and 'task_type' in comps_sim.tags:
            # check for metadata
            if not metadata:
                metadata = self.__load_metadata_from_simulation(simulation)
            try:
                if logger.isEnabledFor(DEBUG):
                    logger.debug(f"Metadata: {metadata}")
                simulation.task = TaskFactory().create(comps_sim.tags['task_type'], **metadata)
            except Exception as e:
                user_logger.warning(f"Could not load task of type {comps_sim.tags['task_type']}. "
                                    f"Received error {str(e)}")
                logger.exception(e)
        # ensure we have loaded the configuration
        if comps_sim.configuration is None:
            comps_sim.refresh(QueryCriteria().select_children('configuration'))

    def __extract_cli(self, comps_sim, parent, simulation, load_cli_from_workorder):
        cli = self._detect_command_line_from_simulation(parent, comps_sim, simulation, load_cli_from_workorder)
        # if we could not find a task, set it now, otherwise rebuild the cli
        if simulation.task is None:
            simulation.task = CommandTask(CommandLine.from_string(cli))
        else:
            simulation.task.command = CommandLine.from_string(cli)

    @staticmethod
    def __load_metadata_from_simulation(simulation) -> Dict[str, Any]:
        """
        Load idmtools metadata from a simulation.

        Args:
            simulation: Simulation to load metadata from

        Returns:
            Metadata if found

        Raises:
            FileNotFoundError: If the metadata is not found
        """
        metadata = None
        for file in simulation.assets:
            if file.filename == "idmtools_metadata.json":
                # load the asset
                metadata = json.loads(file.content.decode('utf-8'))
                return metadata
        raise FileNotFoundError(f"Cannot find idmtools_metadata.json on the simulation {simulation.uid}")

    @staticmethod
    def _detect_command_line_from_simulation(experiment, comps_sim, simulation, load_cli_from_workorder=False):
        """
        Detect the command line from the Experiment/Simulation objects.

        Args:
            experiment: Experiment object
            comps_sim: COMPS simulation object
            simulation: Simulation (idmtools) object
            load_cli_from_workorder: Should we load the CLI from the workorder.json? Used with COMPS scheduling

        Returns:
            CommandLine

        Raises:
            ValueError: When the command cannot be detected
        """
        cli = None
        # do we have a configuration?
        po: COMPSExperiment = experiment.get_platform_object()
        if po.configuration is None:
            po.refresh(QueryCriteria().select_children('configuration'))
        # simulation configuration for executable?
        if comps_sim.configuration and comps_sim.configuration.executable_path:
            cli = f'{comps_sim.configuration.executable_path}'
            if comps_sim.configuration.simulation_input_args:
                cli += " " + comps_sim.configuration.simulation_input_args.strip()
        elif po.configuration and po.configuration.executable_path:
            cli = f'{po.configuration.executable_path} {po.configuration.simulation_input_args.strip()}'
        if cli is None:
            # check if we should try to load our workorder
            if load_cli_from_workorder:
                # filter for workorder
                workorder_obj = None
                for a in simulation.assets:
                    if getattr(a, '_platform_object', None) and isinstance(a._platform_object, SimulationFile) \
                            and a._platform_object.file_type == "WorkOrder":
                        workorder_obj = a
                        break
                # if assets
                if workorder_obj:
                    asset: Asset = workorder_obj
                    wo = json.loads(asset.content.decode('utf-8'))
                    cli = wo['Command']
                else:
                    raise ValueError("Could not detect cli")
            else:
                # if the user doesn't care about metadata, don't error
                logger.debug("Could not load the cli from the simulation. "
                             "This is usually due to the use of a scheduling config.")
                cli = "WARNING_CouldNotLoadCLI"
        elif logger.isEnabledFor(DEBUG):
            logger.debug(f"Detected task CLI {cli}")
        return cli
    def get_assets(self, simulation: Simulation, files: List[str], include_experiment_assets: bool = True,
                   **kwargs) -> Dict[str, bytearray]:
        """
        Fetch the files associated with a simulation.

        Args:
            simulation: Simulation
            files: List of files to download
            include_experiment_assets: Should we also load experiment assets?
            **kwargs:

        Returns:
            Dictionary of filename -> ByteArray
        """
        # since assets could be in the common assets, we should check that first
        # load comps config first
        comps_sim: COMPSSimulation = simulation.get_platform_object(load_children=["files", "configuration"])
        if include_experiment_assets and (
                comps_sim.configuration is None or comps_sim.configuration.asset_collection_id is None):
            if logger.isEnabledFor(DEBUG):
                logger.debug("Gathering assets from experiment first")
            exp_assets = get_asset_for_comps_item(self.platform, simulation.experiment, files, self.cache,
                                                  load_children=["configuration"])
            if exp_assets is None:
                exp_assets = dict()
        else:
            exp_assets = dict()
        exp_assets.update(get_asset_for_comps_item(self.platform, simulation, files, self.cache, comps_item=comps_sim))
        return exp_assets
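    # Illustrative usage (assumes "sim" is an idmtools Simulation fetched from a COMPSPlatform
    # named "platform"; names are placeholders):
    #
    #     files = platform._simulations.get_assets(sim, ["stdout.txt", "config.json"])
    #     # "files" maps each requested filename to its byte content, falling back to experiment
    #     # assets when the simulation has no asset collection of its own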
    def list_assets(self, simulation: Simulation, common_assets: bool = False, **kwargs) -> List[Asset]:
        """
        List assets for a simulation.

        Args:
            simulation: Simulation to load data for
            common_assets: Should we load the common (experiment) assets as well?
            **kwargs:

        Returns:
            List of assets
        """
        comps_sim: COMPSSimulation = simulation.get_platform_object(load_children=["files", "configuration"])
        assets = []
        # load non comps objects
        if comps_sim.files:
            assets = self.platform._assets.to_entity(comps_sim.files).assets
        if common_assets:
            # here we are loading the simulation assets
            sa = self.get_asset_collection_from_comps_simulation(comps_sim)
            if sa:
                assets.extend(sa.assets)
        return assets
    def retrieve_output_files(self, simulation: Simulation):
        """
        Retrieve the output files for a simulation.

        Args:
            simulation: Simulation to fetch files for

        Returns:
            List of output files for the simulation
        """
        metadata = simulation.get_platform_object().retrieve_output_file_info([])
        assets = self.platform._assets.to_entity(metadata).assets
        return assets
    def all_files(self, simulation: Simulation, common_assets: bool = False, outfiles: bool = True,
                  **kwargs) -> List[Asset]:
        """
        Return all files for a simulation, optionally including experiment assets and output files.

        Args:
            simulation: Simulation to load files for
            common_assets: Include experiment assets
            outfiles: Include output files
            **kwargs:

        Returns:
            List of assets
        """
        ac = AssetCollection()
        ac.add_assets(self.list_assets(simulation, **kwargs))
        if common_assets:
            ac.add_assets(simulation.parent.assets)
        if outfiles:
            ac.add_assets(self.retrieve_output_files(simulation))
        return ac.assets
    def create_sim_directory_map(self, simulation_id: str) -> Dict:
        """
        Build a simulation working directory mapping.

        Args:
            simulation_id: Simulation id

        Returns:
            Dict with the simulation id as key and the working directory as value
        """
        from idmtools_platform_comps.utils.linux_mounts import set_linux_mounts, clear_linux_mounts
        set_linux_mounts(self.platform)
        comps_sim = self.platform.get_item(simulation_id, ItemType.SIMULATION,
                                           query_criteria=QueryCriteria().select(['id', 'state']).select_children(
                                               'hpc_jobs'), raw=True)
        sim_map = {str(comps_sim.id): comps_sim.hpc_jobs[-1].working_directory}
        clear_linux_mounts(self.platform)
        return sim_map
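    # Illustrative usage (assumes "platform" is a configured COMPSPlatform and "simulation_id" is
    # the GUID string of an existing COMPS simulation; no literal id is implied here):
    #
    #     mapping = platform._simulations.create_sim_directory_map(simulation_id)
    #     # {'<simulation id>': '<HPC working directory of the last HPC job>'}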