# Source code for idmtools.analysis.csv_analyzer

"""idmtools CSVAnalyzer.

Example of a csv analyzer to concatenate csv results into one csv from your experiment's simulations.

Copyright 2021, Bill & Melinda Gates Foundation. All rights reserved.
"""
import os
from typing import Dict
import pandas as pd
from idmtools.entities import IAnalyzer
from idmtools.entities.ianalyzer import ANALYSIS_ITEM_MAP_DATA_TYPE, ANALYZABLE_ITEM


class CSVAnalyzer(IAnalyzer):
    """
    Provides an analyzer for CSV output.

    The requested csv files are stacked row-wise into one dataframe per item in :meth:`map`, and :meth:`reduce`
    combines every item's dataframe and writes the result to a single csv file.

    Examples:
        .. _simple-csv-example:

        Simple Example
          This example covers the basic usage of the CSVAnalyzer

          .. literalinclude:: ../../examples/analyzers/example_analysis_CSVAnalyzer.py

        .. _multiple-csvs:

        Multiple CSVs
            This example covers analyzing multiple CSVs

            .. literalinclude:: ../../examples/analyzers/example_analysis_MultiCSVAnalyzer.py
    """

    # Arg option for analyzer init are uid, working_dir, parse (True to leverage the :class:`OutputParser`;
    # False to get the raw data in the :meth:`select_simulation_data`), and filenames
    # In this case, we want parse=True, and the filename(s) to analyze
    def __init__(self, filenames, output_path="output_csv"):
        """
        Initialize our analyzer.

        Args:
            filenames: Filenames we want to pull
            output_path: Output path to write the csv. It is joined onto the working directory in
                :meth:`initialize`.

        Raises:
            ValueError: If any of the filenames does not have a csv extension
        """
        super().__init__(parse=True, filenames=filenames)
        # Raise exception early if files are not csv files. The whole extension is compared (case-insensitively)
        # instead of searching for the substring 'csv', so names such as "data.csvx" are rejected too.
        if not all(os.path.splitext(f)[1].lower() == '.csv' for f in self.filenames):
            raise ValueError('Please ensure all filenames provided to CSVAnalyzer have a csv extension.')

        self.output_path = output_path

    def initialize(self):
        """
        Initialize on run. Create an output directory.

        The configured output path is resolved against ``self.working_dir`` and stored back on
        ``self.output_path``.

        Returns:
            None
        """
        self.output_path = os.path.join(self.working_dir, self.output_path)

        # Create the output path. exist_ok avoids the check-then-create race and matches how reduce
        # creates its folder.
        os.makedirs(self.output_path, exist_ok=True)

    # Map is called to get for each simulation a data object (all the metadata of the simulations) and simulation object
    def map(self, data: ANALYSIS_ITEM_MAP_DATA_TYPE, simulation: ANALYZABLE_ITEM) -> pd.DataFrame:
        """
        Map each simulation/workitem data here.

        The data is a mapping of files -> content(in this case, dataframes since it is csvs parsed).

        Args:
            data: Data mapping of files -> content
            simulation: Simulation/Workitem we are mapping

        Returns:
            Items joined together into a dataframe.
        """
        # If there are 1 to many csv files, stack their rows into one dataframe. The original row indexes are
        # discarded (ignore_index=True) and the columns are sorted (sort=True).
        concatenated_df = pd.concat(list(data.values()), axis=0, ignore_index=True, sort=True)
        return concatenated_df

    # In reduce, we combine the per-item dataframes produced by map and write them to one csv
    def reduce(self, all_data: Dict[ANALYZABLE_ITEM, pd.DataFrame]):
        """
        Reduce(combine) all the data from our mapping.

        The combined csv is written to ``<output_path>/<experiment id>/<class name>.csv``, indexed by ``SimId``.

        Args:
            all_data: Mapping of our data in form Item(Simulation/Workitem) -> Mapped dataframe

        Returns:
            None

        Raises:
            ValueError: If all_data is empty, since there is nothing to combine or write
        """
        # pandas raises a ValueError for an empty list as well; fail here with a message that names the cause.
        if not all_data:
            raise ValueError('No data was provided to CSVAnalyzer.reduce; there is nothing to write.')

        results = pd.concat(list(all_data.values()), axis=0,  # Combine a list of all the sims csv data column values
                            keys=[str(item.uid) for item in all_data],  # Add a hierarchical index with the keys option
                            names=['SimId'])  # Label the index keys you create with the names option
        results.index = results.index.droplevel(1)  # Remove default index

        # Make a directory labeled the exp id to write the csv results to
        first_sim = next(iter(all_data))  # get first Simulation
        # NOTE(review): this reads first_sim.experiment, so items without an experiment attribute will fail here.
        # Coerce to str (as done for the uids above) so os.path.join also accepts non-string ids.
        exp_id = str(first_sim.experiment.id)
        output_folder = os.path.join(self.output_path, exp_id)
        os.makedirs(output_folder, exist_ok=True)
        results.to_csv(os.path.join(output_folder, self.__class__.__name__ + '.csv'))