Source code for idmtools_platform_container.container_platform

"""
Here we implement the ContainerPlatform object.

Copyright 2021, Bill & Melinda Gates Foundation. All rights reserved.
"""
import os
import docker
import platform
import subprocess
from uuid import uuid4
from docker.models.containers import Container
from typing import Union, NoReturn, List, Dict
from dataclasses import dataclass, field
from idmtools.core.interfaces.ientity import IEntity
from idmtools.entities import Suite
from idmtools.entities.experiment import Experiment
from idmtools.entities.simulation import Simulation
from idmtools_platform_container.container_operations.docker_operations import validate_container_running, \
    find_container_by_image, compare_mounts, find_running_job, get_container, CONTAINER_STATUS, restart_container, \
    is_docker_installed, is_docker_daemon_running
from idmtools_platform_container.platform_operations.simulation_operations import ContainerPlatformSimulationOperations
from idmtools_platform_container.utils.general import map_container_path
from idmtools_platform_container.utils.job_history import JobHistory
from idmtools_platform_file.file_platform import FilePlatform
from idmtools_platform_container.platform_operations.experiment_operations import ContainerPlatformExperimentOperations
from logging import getLogger, DEBUG

logger = getLogger(__name__)
user_logger = getLogger('user')


[docs]@dataclass(repr=False) class ContainerPlatform(FilePlatform): """ Container Platform definition. """ __CONTAINER_IMAGE = "docker-production-public.packages.idmod.org/idmtools/container-rocky-runtime:0.0.3" __CONTAINER_MOUNT = "/home/container_data" docker_image: str = field(default=None, metadata=dict(help="Docker image to run the container")) data_mount: str = field(default=None, metadata=dict(help="Data mount point in the container")) user_mounts: dict = field(default=None, metadata=dict(help="User-defined mounts")) container_prefix: str = field(default=None, metadata=dict(help="Container name prefix")) force_start: bool = field(default=False, metadata=dict(help="Force start a new container")) new_container: bool = field(default=False, metadata=dict(help="Start a new container")) include_stopped: bool = field(default=False, metadata=dict(help="Include stopped containers")) debug: bool = field(default=False, metadata=dict(help="Debug mode")) container_id: str = field(default=None, metadata=dict(help="Container Id")) def __post_init__(self): super().__post_init__() self._experiments = ContainerPlatformExperimentOperations(platform=self) self._simulations = ContainerPlatformSimulationOperations(platform=self) self.job_directory = os.path.abspath(self.job_directory) self.sym_link = False self.run_sequence = False if self.docker_image is None: self.docker_image = self.__CONTAINER_IMAGE if self.data_mount is None: self.data_mount = self.__CONTAINER_MOUNT if self.debug: root_logger = getLogger() root_logger.setLevel(DEBUG) # Check if Docker is installed and running if not is_docker_installed(): user_logger.error("Docker is not installed.") exit(-1) if not is_docker_daemon_running(): user_logger.error("Docker daemon is not running.") exit(-1)
[docs] def validate_container(self, container_id: str) -> str: """ Validate the container. Args: container_id: container id Returns: Container short id """ # Check if the container exists container = get_container(container_id) if not container: user_logger.warning(f"Container {container_id} is not found.") exit(-1) # Check if the container is in the right status if container.status not in CONTAINER_STATUS: user_logger.warning( f"Container {container_id} is in {container.status} status, but we only support status: {CONTAINER_STATUS}.") exit(-1) # Check if the container is running if we do not include stopped containers if not self.include_stopped and container.status != 'running': user_logger.warning(f"Container {container_id} is not running.") exit(-1) # Check if the container matches the platform mounts if not self.validate_mount(container): user_logger.warning(f"Container {container_id} does not match the platform mounts.") exit(-1) # Restart the container if it is not running if container.status != 'running': restart_container(container) return container.short_id
[docs] def run_items(self, items: Union[IEntity, List[IEntity]], **kwargs): """ Run items on the platform. Args: items: Runnable items kwargs: additional arguments Returns: None """ if self.container_id is not None: self.container_id = self.validate_container(self.container_id) super().run_items(items, **kwargs)
[docs] def submit_job(self, item: Union[Experiment, Simulation], dry_run: bool = False, **kwargs) -> NoReturn: """ Submit a Process job in a docker container. Args: item: Experiment or Simulation dry_run: True/False kwargs: keyword arguments used to expand functionality Returns: Any """ if dry_run: user_logger.info(f'\nDry run: {dry_run}') return if isinstance(item, Experiment): if logger.isEnabledFor(DEBUG): logger.debug("Run experiment on container!") # Check if the experiment is already running his_job = JobHistory.get_job(item.id) if his_job: job = find_running_job(item.id, his_job['CONTAINER']) if job: user_logger.warning(f"Experiment {item.id} is already running on Container {job.container_id}.") exit(-1) # Start the container if self.container_id is None: if logger.isEnabledFor(DEBUG): logger.debug("Check provided container!") self.container_id = self.check_container(**kwargs) # If the platform is Windows, convert the scripts to Linux format if platform.system() in ["Windows"]: if logger.isEnabledFor(DEBUG): logger.debug("Script runs on Windows!") self.convert_scripts_to_linux(item, **kwargs) # Submit the experiment/simulations if logger.isEnabledFor(DEBUG): logger.debug(f"Submit experiment/simulations to container: {self.container_id}") self.submit_experiment(item, **kwargs) # Save the job to history JobHistory.save_job(self.job_directory, self.container_id, item, self) elif isinstance(item, Simulation): raise NotImplementedError("submit_job directly for simulation is not implemented on ContainerPlatform.") else: raise NotImplementedError( f"Submit job is not implemented for {item.__class__.__name__} on ContainerPlatform.")
[docs] def check_container(self, **kwargs) -> str: """ Check the container status. Args: kwargs: keyword arguments used to expand functionality Returns: container id """ container_id = validate_container_running(self, **kwargs) return container_id
[docs] def start_container(self, **kwargs) -> str: """ Execute a command in a container. Args: kwargs: keyword arguments used to expand functionality Returns: container id """ # Create a Docker client client = docker.from_env() volumes = self.build_binding_volumes() # Run the container container = client.containers.run( self.docker_image, command="bash", volumes=volumes, stdin_open=True, tty=True, detach=True, name=f"{self.container_prefix}_{str(uuid4())}" if self.container_prefix else None ) return container.short_id
[docs] def convert_scripts_to_linux(self, experiment: Experiment, **kwargs) -> NoReturn: """ Convert the scripts to Linux format. Args: experiment: Experiment kwargs: keyword arguments used to expand functionality Returns: No return """ directory = self.get_container_directory(experiment) try: commands = [ f"cd {directory}", r"sed -i 's/\r//g' batch.sh;sed -i 's/\r//g' run_simulation.sh" ] # Constructing the overall command full_command = ["docker", "exec", self.container_id, "bash", "-c", ";".join(commands)] # Execute the command subprocess.run(full_command, stdout=subprocess.PIPE) except subprocess.CalledProcessError as e: user_logger.warning(f"Failed to convert script: {e}") except Exception as ex: user_logger.warning(f"Failed to convert script to Linux: {ex}")
[docs] def submit_experiment(self, experiment: Experiment, **kwargs) -> NoReturn: """ Submit an experiment to the container. Args: experiment: Experiment kwargs: keyword arguments used to expand functionality Returns: No return """ directory = self.get_container_directory(experiment) if logger.isEnabledFor(DEBUG): logger.debug(f"Directory: {directory}") logger.debug(f"container_id: {self.container_id}") try: # Commands to change directory and run the script command = f'exec -a "EXPERIMENT:{experiment.id}" bash batch.sh &' # Constructing the overall command full_command = ["docker", "exec", "--workdir", directory, self.container_id, "bash", "-c", command] # Execute the command using Popen for handling background processes subprocess.Popen(full_command) # Optionally, you can wait for a short period to ensure the command starts # process = subprocess.Popen(full_command) # process.wait(timeout=5) logger.debug(f"Submit experiment {experiment.id} successfully") except subprocess.TimeoutExpired: user_logger.error(f"Submit experiment {experiment.id} timed out") exit(-1) except Exception as ex: user_logger.error(f"Submit experiment {experiment.id} encounter Error: {ex}") exit(-1)
[docs] def build_binding_volumes(self) -> Dict: """ Build the binding volumes for the container. Returns: bindings in dict format """ volumes = { self.job_directory: {"bind": self.data_mount, "mode": "rw"} } # Add user-defined volume mappings if self.user_mounts is not None: for key, value in self.user_mounts.items(): volumes[key] = {"bind": value, "mode": "rw"} return volumes
[docs] def get_mounts(self) -> List: """ Build the mounts of the container. Returns: List of mounts (Dict) """ mounts = [] mount = {'Type': 'bind', 'Source': self.job_directory, 'Destination': self.data_mount, 'Mode': 'rw'} mounts.append(mount) # Add user-defined volume mappings if self.user_mounts is not None: for key, value in self.user_mounts.items(): mount = {'Type': 'bind', 'Source': key, 'Destination': value, 'Mode': 'rw'} mounts.append(mount) return mounts
[docs] def validate_mount(self, container: Union[str, Container]) -> bool: """ Compare the mounts of the container with the platform. Args: container: a container object or id. Returns: True/False """ if isinstance(container, str): ct = get_container(container) else: ct = container if ct is None: logger.warning(f"Container {container} is not found.") return False mounts1 = self.get_mounts() mounts2 = ct.attrs['Mounts'] return compare_mounts(mounts1, mounts2)
[docs] def get_container_directory(self, item: Union[Suite, Experiment, Simulation]) -> str: """ Get the container corresponding directory of an item. Args: item: Suite, Experiment or Simulation Returns: string Path """ item_dir = self.get_directory(item) item_container_dir = map_container_path(self.job_directory, self.data_mount, str(item_dir)) return item_container_dir
[docs] def retrieve_match_containers(self, image: str = None) -> List: """ Find the containers that match math the image. Args: image: docker image Returns: list of containers """ if image is None: image = self.docker_image container_found = find_container_by_image(image, self.include_stopped) container_match = [] if len(container_found) > 0: for status, containers in container_found.items(): for container in containers: if self.validate_mount(container): container_match.append((status, container)) if len(container_match) == 0: if logger.isEnabledFor(DEBUG): logger.debug(f"Found container with image {image}, but no one match platform mounts.") else: if logger.isEnabledFor(DEBUG): logger.debug(f"Not found container matching image {image}.") return container_match