Source code for idmtools_calibra.algorithms.separatrix_bhm

import logging

import numpy as np
import pandas as pd

from idmtools_calibra.algorithms.next_point_algorithm import NextPointAlgorithm

logger = logging.getLogger(__name__)
user_logger = logging.getLogger('user')
try:
    from history_matching.gpc import GPC
except ImportError as ex:
    user_logger.error("Separatrix_BHM requires history matching")
    raise ex


[docs]class SeparatrixBHM(NextPointAlgorithm):
    """
    Separatrix using Bayesian History Matching

    The basic idea of Separatrix is that each simulation results in a success (+1) or a failure (-1), and the success probability varies as a function of the input parameters.  We seek an isocline of the the latent success probability function.
    """

    def __init__(self,
                 params,
                 constrain_sample_fn=lambda s: s,  # Allows the user to move samples in order to meet a constraint
                 implausibility_threshold=3,
                 # Essentially the risk tolerance.  Higher numbers will be more careful to not reject potentially good regions of parameter space, but the result is that more iteration/simulations will be required
                 target_success_probability=0.7,  # This is the success probability isocline that we seek to identify
                 num_past_iterations_to_include_in_metamodel=3,
                 # When emulating the latent success probability function, include simulation results from this many previous iterations.  NOTE: Set <0 to include all previous simulation.
                 samples_per_iteration=32,  # Number of samples to simulate on a typical iteration
                 samples_final_iteration=128,  # Number of samples in simulate on the final iteration
                 max_iterations=10,  # Number of iterations (determines when to use samples_final_iteration)
                 training_frac=0.8
                 # Fraction of simulations to use a training data (will plot in cyan instead of magenta)
                 ):

        super(SeparatrixBHM, self).__init__()
        self.args = locals()  # Store inputs in case set_state is called later and we want to override with new (user) args
        del self.args['self']
        self.need_resolve = False

        self.constrain_sample_fn = constrain_sample_fn
        self.max_iterations = max_iterations

        self.emulation = pd.DataFrame(columns=['Iteration', 'Sample', 'Mean', 'Var'])
        self.for_plotting = pd.DataFrame()

        self.param_info = pd.DataFrame(params).reset_index(drop=True).set_index('Name')
        self.param_names = self.param_info.index.values.tolist()

        self.implausibility_threshold = implausibility_threshold  # Check that does not go up over iterations!
        self.target_success_probability = target_success_probability  # This cannot change, make sure it doesn't!

        # These can change on an iteration-by-iteration basis
        self.num_past_iterations_to_include_in_metamodel = int(num_past_iterations_to_include_in_metamodel)
        self.samples_per_iteration = int(samples_per_iteration)
        self.samples_final_iteration = int(samples_final_iteration)
        self.training_frac = training_frac

        self.n_dimensions = len(self.param_info)
        self.x_min = {k: v['Min'] for k, v in self.param_info.iterrows()}
        self.x_max = {k: v['Max'] for k, v in self.param_info.iterrows()}

        # GPC hyperparameter guess and range bounds
        # TODO: Resolve these
        # TODO: Allow user to enter guess values and bounds!
        self.theta_guess = np.array(
            [20] + [0.5] * self.n_dimensions)  # sigma^2 followed by squared lengthscales for each dimension
        self.bounds = ((0.005, 100),) + tuple(self.n_dimensions * ((0.001, 2.0),))
        self.hyperparameters = pd.DataFrame(
            columns=['sigma^2'] + ['lengthscale^2_%d' % i for i in range(self.n_dimensions)] + ['fun',
                                                                                                'accepted percent',
                                                                                                'success'])
        self.hyperparameters.loc[0] = np.append([np.NaN] * (self.n_dimensions + 2), [100, np.NaN])
        self.gpc_vec = []

        self.data = pd.DataFrame()

[docs]    def cleanup(self):
        pass

[docs]    def resolve_args(self, iteration):
        # Have args from user and from set_state.
        # Note this is called only right before commissioning a new iteration, likely from 'resume'

        # TODO: be more sensitive with params, user could have added or removed variables, need to adjust
        # TODO: Check min <=  max for params
        # TODO: could clean this up with a helper function
        if 'params' in self.args['params']:
            self.param_info = pd.DataFrame(self.args['params']).reset_index(drop=True).set_index('Name')

        self.implausibility_threshold = self.args[
            'implausibility_threshold'] if 'implausibility_threshold' in self.args else self.implausibility_threshold
        self.target_success_probability = self.args[
            'target_success_probability'] if 'target_success_probability' in self.args else self.target_success_probability
        self.num_past_iterations_to_include_in_metamodel = self.args[
            'num_past_iterations_to_include_in_metamodel'] if 'num_past_iterations_to_include_in_metamodel' in self.args else self.num_past_iterations_to_include_in_metamodel
        self.training_frac = self.args['training_frac'] if 'training_frac' in self.args else self.training_frac
        self.samples_per_iteration = self.args[
            'samples_per_iteration'] if 'samples_per_iteration' in self.args else self.samples_per_iteration
        self.samples_final_iteration = self.args[
            'samples_final_iteration'] if 'samples_final_iteration' in self.args else self.samples_final_iteration

        self.n_dimensions = len(self.param_info)
        self.x_min = {k: v['Min'] for k, v in self.param_info.iterrows()}
        self.x_max = {k: v['Max'] for k, v in self.param_info.iterrows()}

        self.need_resolve = False

[docs]    def add_samples(self, samples, iteration):
        samples_cpy = samples.copy()
        samples_cpy.index.name = '__sample_index__'
        samples_cpy['Iteration'] = iteration
        samples_cpy.reset_index(inplace=True)

        self.data = pd.concat([self.data, samples_cpy], ignore_index=True)
        self.data['__sample_index__'] = self.data['__sample_index__'].astype(int)

[docs]    def get_samples_for_iteration(self, iteration):
        # Update args
        if self.need_resolve:
            self.resolve_args(iteration)

        if iteration == 0:
            # Choose initial samples by LHS
            samples = self.choose_initial_samples()
        else:
            # Choose initial samples by History Matching
            samples = self.choose_samples_via_history_matching(iteration)

        samples.reset_index(drop=True, inplace=True)
        return self.generate_samples_from_df(samples)

[docs]    def set_results_for_iteration(self, iteration, results):
        results = results.total.tolist()
        logger.info('%s: Choosing samples at iteration %d:', self.__class__.__name__, iteration)
        logger.debug('Results:\n%s', results)

        data_by_iter = self.data.set_index('Iteration')
        if iteration + 1 in data_by_iter.index.unique():
            # Been here before, reset
            data_by_iter = data_by_iter.loc[:iteration]

            emulation_by_iter = self.emulation_index('Iteration')
            self.emulation = emulation_by_iter.loc[:iteration - 1].reset_index()

        # Store results ... even if changed
        data_by_iter.loc[iteration, 'Results'] = results
        self.data = data_by_iter.reset_index()

[docs]    def set_gpc_vec(self, iteration, gpc):
        if len(self.gpc_vec) == iteration:
            self.gpc_vec.append(gpc)
        else:
            assert (len(self.gpc_vec) >= iteration)
            self.gpc_vec[iteration] = gpc

[docs]    def choose_initial_samples(self):
        self.data = pd.DataFrame(columns=['Iteration', '__sample_index__', 'Train', 'Results', *self.get_param_names()])
        self.data['Iteration'] = self.data['Iteration'].astype(int)
        self.data['__sample_index__'] = self.data['__sample_index__'].astype(int)

        self.n_dimensions = len(self.param_info)

        iteration = 0

        N = self.samples_per_iteration if self.max_iterations > 1 else self.samples_final_iteration
        initial_samples = pd.DataFrame(np.random.rand(N, self.n_dimensions), columns=self.param_names)

        # Scale
        for param_name, v in self.param_info.iterrows():
            initial_samples[param_name] = v['Min'] + initial_samples[param_name] * (v['Max'] - v['Min'])
        initial_samples.index.name = 'Sample'
        initial_samples.reset_index(inplace=True)
        initial_samples['Train'] = False
        initial_samples.loc[
            np.random.binomial(n=1, p=self.training_frac, size=initial_samples.shape[0]) == 1, 'Train'] = True
        initial_samples = initial_samples.apply(self.constrain_sample_fn, axis=1)

        self.add_samples(initial_samples, iteration)

        self.set_gpc_vec(iteration, None)  # No GPC for iter 0

        return initial_samples[self.param_names]

[docs]    def choose_samples_via_history_matching(self, iteration):
        train = self.data.loc[self.data['Train']]
        if self.num_past_iterations_to_include_in_metamodel > 0:
            train = train.loc[train['Iteration'] >= iteration - self.num_past_iterations_to_include_in_metamodel]

        # Fit the emulator (GPC)
        gpc = GPC(self.param_names, 'Results', train, self.param_info,
                  kernel_mode='RBF',
                  kernel_params=self.theta_guess,  # Sigma_f^2, lengthscale_x^2, lengthscale_y^2, ...
                  verbose=False,
                  debug=False
                  )

        self.set_gpc_vec(iteration, gpc)

        # Optimize hyperparameters
        optim = self.gpc_vec[iteration].optimize_hyperparameters(
            x0=self.theta_guess,
            bounds=self.bounds,
            eps=1e-3,
            disp=False,
            maxiter=15000
        )
        self.theta_guess = optim.x  # Set result as guess for nex titeration

        emulation_results = self.gpc_vec[iteration].evaluate(self.data)
        emulation_results['Iteration'] = iteration - 1
        emulation_results['Sample'] = self.data['Sample']
        self.emulation = pd.concat([self.emulation, emulation_results])

        # Refocusing
        next_samples = pd.DataFrame(columns=self.param_names + ['Implausible'])
        accepted = 0
        tried = 0
        self.for_plotting = pd.DataFrame(columns=self.param_names)

        N = self.samples_per_iteration if iteration < self.max_iterations - 1 else self.samples_final_iteration
        while next_samples.shape[0] < N:
            n = N - next_samples.shape[0]
            if tried > 0 and accepted > 0:
                n = int(np.ceil(n / (accepted / tried)))
            proposal = pd.DataFrame(np.random.rand(n, 2), columns=self.param_names)

            # Scale
            for param_name, v in self.param_info.iterrows():
                proposal[param_name] = v['Min'] + proposal[param_name] * (v['Max'] - v['Min'])

            # User constraint fn
            proposal = proposal.apply(self.constrain_sample_fn, axis=1)

            proposal['Implausible'] = False
            proposal['Max_Implausibility'] = -1  # For plotting
            for it in reversed(range(1, iteration + 1)):
                # TODO: Only evaluate non-implausible points to save time, although will degrade plotting
                ret = self.gpc_vec[it].evaluate(proposal)
                proposal['Implausibility_%d' % it] = np.sqrt(
                    (ret['Mean'] - self.target_success_probability) ** 2 / ret['Var'])
                proposal['Implausibile_%d' % it] = proposal['Implausibility_%d' % it] > self.implausibility_threshold
                proposal['Implausible'] = proposal['Implausible'] | proposal['Implausibile_%d' % it]
                proposal['Max_Implausibility'] = pd.concat(
                    [proposal['Max_Implausibility'], proposal['Implausibility_%d' % it]], axis=1).max(
                    axis=1)  # Better way?

            self.for_plotting = self.for_plotting.append(proposal[self.param_names + ['Max_Implausibility']],
                                                         ignore_index=True)
            new_samples = proposal.loc[~proposal['Implausible']]
            next_samples = next_samples.append(proposal.loc[~proposal['Implausible']])
            tried = tried + n
            accepted = accepted + new_samples.shape[0]
            accepted_percent = 100 * accepted / tried
            user_logger.info('Found %d new samples, now have %d of %d. Acceptance rate is %.0f%%' % (
                new_samples.shape[0], next_samples.shape[0], N, accepted_percent))

        self.hyperparameters.loc[iteration] = np.append(self.gpc_vec[iteration].theta,
                                                        [optim['fun'], accepted_percent, optim['success']])

        next_samples = next_samples.iloc[:N]  # Trim if needed
        next_samples['Iteration'] = iteration
        next_samples['Train'] = False
        next_samples.loc[np.random.binomial(n=1, p=self.training_frac, size=next_samples.shape[0]) == 1, 'Train'] = True
        n = self.data.shape[0]
        next_samples['Sample'] = list(range(n + 1, n + next_samples.shape[0] + 1))
        self.add_samples(next_samples, iteration)

        return next_samples[self.param_names]

[docs]    def end_condition(self):
        # TODO: Stopping condition ... possibly based on reductions in grown of implausible volume
        logger.info('Continuing iterations ...')
        return False

[docs]    def get_final_samples(self):
        """
        Return some number of samples from the residual non-implausible area:
        """

        return {'final_samples': self.prep_for_dict(self.data)}

[docs]    def prep_for_dict(self, df):
        return df.where(~df.isnull(), other=None).to_dict(orient='list')

[docs]    def get_state(self):

        separatrix_state = dict(
            implausibility_threshold=self.implausibility_threshold,
            target_success_probability=self.target_success_probability,
            num_past_iterations_to_include_in_metamodel=self.num_past_iterations_to_include_in_metamodel,
            training_frac=self.training_frac,
            samples_per_iteration=self.samples_per_iteration,
            samples_final_iteration=self.samples_final_iteration,
            n_dimensions=self.n_dimensions,
            params=self.prep_for_dict(self.param_info.reset_index()),

            hyperparameters=self.prep_for_dict(self.hyperparameters),
            data=self.prep_for_dict(self.data),
            data_dtypes={name: str(data.dtype) for name, data in self.data.items()},

            emulation=self.prep_for_dict(self.emulation),
            emulation_dtypes={name: str(data.dtype) for name, data in self.emulation.items()},

            for_plotting=self.prep_for_dict(self.for_plotting),
            max_iterations=self.max_iterations,

            gpc_vec=[g.save() if g else None for g in self.gpc_vec]
        )
        return separatrix_state

[docs]    def set_state(self, state, iteration):

        self.implausibility_threshold = state['implausibility_threshold']
        self.target_success_probability = state['target_success_probability']
        self.num_past_iterations_to_include_in_metamodel = state['num_past_iterations_to_include_in_metamodel']
        self.training_frac = state['training_frac']
        self.samples_per_iteration = state['samples_per_iteration']
        self.samples_final_iteration = state['samples_final_iteration']
        self.n_dimensions = state['n_dimensions']

        self.param_info = pd.DataFrame.from_dict(state['params'], orient='columns').set_index('Name')

        self.max_iterations = state['max_iterations']

        self.hyperparameters = pd.DataFrame.from_dict(state['hyperparameters'], orient='columns')
        self.emulation = pd.DataFrame.from_dict(state['emulation'], orient='columns')

        data_dtypes = state['data_dtypes']
        self.data = pd.DataFrame.from_dict(state['data'], orient='columns')
        for c in self.data.columns:  # Argh
            self.data[c] = self.data[c].astype(data_dtypes[c])

        self.for_plotting = pd.DataFrame.from_dict(state['for_plotting'], orient='columns')

        self.gpc_vec = [GPC.from_dict(config) if config else None for config in state['gpc_vec']]

        self.need_resolve = True

[docs]    def get_param_names(self):
        return self.param_names