Source code for idmtools_calibra.analyzers.rmse_analyzer

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error as sk_mse

from idmtools_calibra.analyzers.base_calibration_analyzer import BaseCalibrationAnalyzer


class RMSEAnalyzer(BaseCalibrationAnalyzer):
    _user_cost_fn = None

    # Set up the reference data for use in the analyzer and identify the model output file to compare against.
    def __init__(
        self, site, dependent_column, independent_column, output_filename="output.csv"
    ):
        self.model_output_filename = '/'.join(['output', f'{output_filename}'])
        self.independent_column = independent_column
        self.dependent_column = dependent_column
        # Ideally the reference type would not need to be specified here at all; it should default
        # to the dependent column automatically.
        self.reference = site.get_reference_data()

        # Rename the reference data column so it does NOT conflict with the model data column; in this example
        # the two happen to share the same name.
        self.reference_column = f"{self.dependent_column}_reference"
        self.reference.rename(
            columns={self.dependent_column: self.reference_column}, inplace=True
        )
        super().__init__(filenames=[str(self.model_output_filename)])

    # 'map' does whatever needs to be done once per simulation. Often, as below, the main goal is to line up
    # comparable reference and model data.
    def map(self, data, item):
        sim_df = data[str(self.model_output_filename)]

        # merge reference and model data on the independent column
        merged = self.reference.merge(sim_df, on=self.independent_column)
        merged.index.name = "Index"

        # everything returned by 'map' will be available for use in 'reduce'
        result = {
            "df": merged,
            "reference_column": self.reference_column,
            "data_column": self.dependent_column,
        }
        return result

    @staticmethod
    def set_custom_cost_fn(user_cost_fn):
        RMSEAnalyzer._user_cost_fn = user_cost_fn

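    # A minimal sketch of plugging in a custom cost function; 'weighted_mae' is a hypothetical
    # user-defined callable, not part of this module. It receives the model series, the reference
    # series, and the per-point weights, and should return a single error value (lower is better):
    #
    #     def weighted_mae(model, reference, weights):
    #         return float(np.average(np.abs(model - reference), weights=weights))
    #
    #     RMSEAnalyzer.set_custom_cost_fn(weighted_mae)
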
    @staticmethod
    def _rmse(series1, series2, series3):
        # series3 holds the per-point weights
        if RMSEAnalyzer._user_cost_fn:
            return RMSEAnalyzer._user_cost_fn(series1, series2, series3)
        else:
            # numpy-only alternative that would avoid introducing an sklearn dependency to calibra
            # for the first time:
            # return np.sqrt(np.average((series1 - series2) ** 2, weights=series3))
            return sk_mse(series1, series2, sample_weight=series3, squared=False)

    def rmse(cls, df, data_column, reference_column):
        # Invoked via groupby(...).apply(self.rmse, ...), so the group DataFrame arrives as 'df'.
        # An optional 'weights' column provides per-point weighting; otherwise all points weigh equally.
        weights = np.ones_like(df[data_column])
        if 'weights' in df.columns:
            weights = df['weights']
        return cls._rmse(series1=df[data_column], series2=df[reference_column], series3=weights)

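    # Per-point weighting can therefore be enabled simply by including a 'weights' column in the
    # reference data; columns other than the merge key survive the merge in 'map' and reach '_rmse'.
    # A sketch with hypothetical column names and values:
    #
    #     reference = pd.DataFrame({
    #         "time": [0, 1, 2],
    #         "prevalence_reference": [0.10, 0.15, 0.20],
    #         "weights": [1.0, 1.0, 2.0],   # the last point counts double in the RMSE
    #     })
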
    def compare(self, sample, data_column, reference_column):
        # group by Sim_Id within the sample, which lets us compute scores on a per-replicate basis
        replicate_groups = sample.groupby(["Sim_Id"])

        # one rmse per replicate
        rmses = replicate_groups.apply(
            self.rmse, data_column=data_column, reference_column=reference_column
        )

        # low rmse is good, so we invert rmse to get a 'likelihood score' (higher is better). This is what
        # calibra assumes: higher scores are better.
        scores = 1 / rmses

        # the replicate-averaged score is the score for the provided parameterization sample
        score = scores.mean()
        return score

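    # For example, if a sample's two replicates yield RMSEs of 0.5 and 0.25, their scores are
    # 1 / 0.5 = 2.0 and 1 / 0.25 = 4.0, and the sample's overall score is their mean, 3.0.
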
    def reduce(self, all_data):
        """
        Combine the simulation data into a single table for all analyzed simulations.
        """
        data = {}
        reference_column = None
        data_column = None

        # Obtain and set the sample index on each mapped result so that we can properly identify which result
        # belongs to which sample and to support replicate handling.
        # Also identify which data columns are to be used in the scoring calculation, as reported by
        # the mapped results.
        for simulation, mapping_dict in all_data.items():
            sample_index = int(simulation.tags.get("__sample_index__"))
            key = (sample_index, simulation.id)
            data[key] = mapping_dict["df"]
            reference_column = reference_column or mapping_dict["reference_column"]
            data_column = data_column or mapping_dict["data_column"]

        # a bit of index manipulation for calibra's sake
        data = pd.concat(
            list(data.values()),
            axis=0,
            keys=list(data.keys()),
            names=["Sample", "Sim_Id"],
        )
        data.reset_index(level="Index", drop=True, inplace=True)

        # compare sim data to reference data and determine a match likelihood/score. Higher is better in calibra.
        results = (
            data.reset_index()
            .groupby(["Sample"])
            .apply(
                self.compare, reference_column=reference_column, data_column=data_column
            )
        )

        # the return of 'reduce' is a Series of N items, where N is the number of samples run
        return results
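
# A minimal end-to-end usage sketch (the site object and column names below are hypothetical, not
# part of this module); the site must expose get_reference_data() returning a DataFrame that shares
# the independent column with each simulation's output/output.csv:
#
#     from idmtools_calibra.analyzers.rmse_analyzer import RMSEAnalyzer
#
#     analyzer = RMSEAnalyzer(
#         site=my_site,                    # hypothetical site providing the reference data
#         dependent_column="prevalence",   # value column compared between model and reference
#         independent_column="time",       # join key present in both model output and reference
#         output_filename="output.csv",    # read from each simulation's 'output' directory
#     )
#
# calibra then calls map() once per simulation and reduce() once over all mapped results; reduce()
# returns one score per sample, where higher scores indicate a better fit to the reference data.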