import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error as sk_mse
from idmtools_calibra.analyzers.base_calibration_analyzer import BaseCalibrationAnalyzer
class RMSEAnalyzer(BaseCalibrationAnalyzer):
    _user_cost_fn = None

    # Setting up the reference data for use in the analyzer and identifying the model output file to compare against
    def __init__(
        self,
        site,
        dependent_column,
        independent_column,
        output_filename="output.csv",
    ):
        self.model_output_filename = '/'.join(['output', f'{output_filename}'])
        self.independent_column = independent_column
        self.dependent_column = dependent_column
        # Note: ideally no reference type would need to be passed here at all; it should default
        # to the dependent column.
        self.reference = site.get_reference_data()

        # Rename the reference data column so it does NOT conflict with the model data column;
        # in this example they happen to share the same name.
        self.reference_column = f"{self.dependent_column}_reference"
        self.reference.rename(
            columns={self.dependent_column: self.reference_column}, inplace=True
        )

        super().__init__(filenames=[str(self.model_output_filename)])
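    # Illustrative construction (a sketch; `my_site` and the column names are placeholders,
    # assuming the site object exposes get_reference_data() as used above):
    #
    #     analyzer = RMSEAnalyzer(
    #         site=my_site,
    #         dependent_column="infected",
    #         independent_column="Time",
    #         output_filename="output.csv",
    #     )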
    # Here we do what needs to be done once per simulation. Often, as below, the main goal is to
    # line up comparable reference and model data.
    def map(self, data, item):
        sim_df = data[str(self.model_output_filename)]

        # merge reference and model data
        merged = self.reference.merge(sim_df, on=self.independent_column)
        merged.index.name = "Index"

        # each thing returned by 'map' will be available for use in 'reduce'
        result = {
            "df": merged,
            "reference_column": self.reference_column,
            "data_column": self.dependent_column,
        }
        return result
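    # For illustration (made-up values), with independent_column="Time" and
    # dependent_column="infected", each mapped frame looks roughly like:
    #
    #     Time  infected_reference  infected
    #        0                10.0       9.0
    #        1                12.0      13.5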
    @staticmethod
    def set_custom_cost_fn(user_cost_fn):
        RMSEAnalyzer._user_cost_fn = user_cost_fn
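    # Illustrative custom cost function (a sketch, not part of the analyzer): it must accept the
    # model series, the reference series, and a weights series, and return a scalar where lower
    # means a better fit, matching the contract of _rmse below.
    #
    #     def weighted_mae(model, reference, weights):
    #         return np.average(np.abs(model - reference), weights=weights)
    #
    #     RMSEAnalyzer.set_custom_cost_fn(weighted_mae)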
    @staticmethod
    def _rmse(series1, series2, series3):
        # series1: model data, series2: reference data, series3: per-point weights.
        # Weighted RMSE: sqrt( sum(w * (model - reference)**2) / sum(w) )
        if RMSEAnalyzer._user_cost_fn:
            return RMSEAnalyzer._user_cost_fn(series1, series2, series3)
        else:
            # A NumPy-only alternative, which would avoid introducing an sklearn dependency to
            # calibra, is: np.sqrt(np.average((series1 - series2) ** 2, weights=series3))
            return sk_mse(series1, series2, sample_weight=series3, squared=False)
    def rmse(self, df, data_column, reference_column):
        # Optional 'weights' column for per-point weighting; default to equal weights.
        weights = np.ones_like(df[data_column])
        if 'weights' in df.columns:
            weights = df['weights']
        return self._rmse(series1=df[data_column], series2=df[reference_column], series3=weights)
    def compare(self, sample, data_column, reference_column):
        # Group by Sim_Id within this sample so that scores can be computed on a per-replicate basis.
        replicate_groups = sample.groupby(["Sim_Id"])

        # one RMSE per replicate
        rmses = replicate_groups.apply(
            self.rmse, data_column=data_column, reference_column=reference_column
        )

        # Low RMSE is good, so invert it to get a 'likelihood score' where higher is better,
        # which is what calibra assumes.
        scores = 1 / rmses

        # The replicate-averaged score is the score for the provided parameterization sample.
        score = scores.mean()
        return score
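    # Worked example (illustrative numbers): replicate RMSEs of 2.0 and 4.0 give scores of
    # 0.5 and 0.25, so the sample's score is their mean, 0.375.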
    def reduce(self, all_data):
        """
        Combine the simulation data into a single table for all analyzed simulations.
        """
        data = {}
        reference_column = None
        data_column = None

        # Obtain and set the sample index on each mapped result so that we can properly identify
        # which result belongs to which sample and to support replicate handling. Also identify
        # which data columns are to be used in the scoring calculation, as reported by the mapped
        # results.
        for simulation, mapping_dict in all_data.items():
            sample_index = int(simulation.tags.get("__sample_index__"))
            key = (sample_index, simulation.id)
            data[key] = mapping_dict["df"]
            reference_column = reference_column or mapping_dict["reference_column"]
            data_column = data_column or mapping_dict["data_column"]

        # Concatenate the per-simulation frames into one table with the index names calibra
        # expects ("Sample", "Sim_Id") and drop the now-redundant per-row "Index" level.
        data = pd.concat(
            list(data.values()),
            axis=0,
            keys=list(data.keys()),
            names=["Sample", "Sim_Id"],
        )
        data.reset_index(level="Index", drop=True, inplace=True)
        # Compare sim data to reference data and determine a match likelihood/score per sample.
        # Higher is better in calibra.
        results = (
            data.reset_index()
            .groupby(["Sample"])
            .apply(
                self.compare, reference_column=reference_column, data_column=data_column
            )
        )

        # The return of 'reduce' is a Series of N scores, where N is the number of samples run.
        return results
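
if __name__ == "__main__":
    # Minimal smoke test (added for illustration; the toy numbers are made up): exercise the
    # weighted-RMSE helper directly, without any simulations or a reference site.
    model = pd.Series([1.0, 2.0, 3.0])
    reference = pd.Series([1.5, 2.0, 2.0])
    weights = np.ones(len(model))
    print("weighted RMSE:", RMSEAnalyzer._rmse(model, reference, weights))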