Source code for idmtools_calibra.resamplers.cramer_rao_resampler

import itertools
import os
from logging import getLogger

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.patches import Ellipse
from sklearn.decomposition import PCA

from idmtools_calibra.algorithms.fisher_inf_matrix import trunc_gauss, compute_fisher_inf_matrix
from idmtools_calibra.resamplers.base_resampler import BaseResampler
from idmtools_calibra.resamplers.calibration_point import CalibrationPoint

logger = getLogger(__name__)
user_logger = getLogger('user')


[docs]class CramerRaoResampler(BaseResampler):
    def __init__(self, calib_manager=None, **kwargs):
        """
        Args:
            n_resampling_points: The number of resampled points to generate
            kwargs: These are arguments passed directly to the underlying resampling routine.
        """
        super().__init__(calib_manager)
        # self.n_resampling_points = n_resampling_points # the number of points to resample/generate
        self.resample_kwargs = kwargs

[docs]    def resample(self, calibrated_points, selection_values, initial_calibration_points):
        """
        Takes in a list of 1+ Point objects and returns method-specific resampled points as a list of Point objects
        The resultant Point objects should be copies of the input Points BUT with Value overridden on each, e.g.::

            new_point = Point.copy(one_of_the_input_calibrated_points)
            for param in new_point.list_params():
              new_point.set_param_value(param, value=SOME_NEW_VALUE)

        Args:
            calibrated_points: input points for this resampling method

        Returns: 
            A list of resampled Point objects
        """

        # selection_values: a DataFrame with columns relevant to selection of calibrated_points

        center_point = initial_calibration_points[0]

        # convert input points to DataFrames
        calibrated_points_df = []
        for i in range(len(calibrated_points)):
            new_item = calibrated_points[i].to_value_dict(parameter_type=CalibrationPoint.DYNAMIC)
            calibrated_points_df.append(new_item)
        calibrated_points_df = pd.DataFrame(calibrated_points_df)
        original_column_names = calibrated_points_df.columns
        calibrated_points_df = selection_values.join(calibrated_points_df)

        # ck4, debugging only, this block
        filename = os.path.join(self.output_location, 'cr-calibrated-points.csv')  # C
        calibrated_points_df.to_csv(filename)

        # temporary, generic column names for the actual parameter names
        # theta_column_names = ['theta%d'%i for i in range(len(original_column_names))]
        # temp_columns = list(selection_values.columns) + theta_column_names
        # calibrated_points_df.columns = temp_columns

        # same as calibrated_points_df but with a LL column on the end
        likelihood_df = pd.DataFrame([{'LL': point.likelihood} for point in calibrated_points])
        likelihood_df = calibrated_points_df.join(likelihood_df)

        # ck4, debugging only, this block
        filename = os.path.join(self.output_location, 'cr-calibrated-points-ll.csv')  # E
        likelihood_df.to_csv(filename)

        # Do the resampling

        # center_point is a list of param values at the center point, must be ordered exactly as calibrated_points_df column-wise
        # obtain min/max of parameter ranges to force results to be within them
        minimums = center_point.get_attribute(key='Min', parameter_type=CalibrationPoint.DYNAMIC)
        maximums = center_point.get_attribute(key='Max', parameter_type=CalibrationPoint.DYNAMIC)
        names = center_point.get_attribute(key='Name', parameter_type=CalibrationPoint.DYNAMIC)

        center_point_as_list = list(
            pd.DataFrame([center_point.to_value_dict(parameter_type=CalibrationPoint.DYNAMIC)]).as_matrix()[0])

        fisher_inf_matrix = compute_fisher_inf_matrix(center_point_as_list, likelihood_df, names)
        covariance = np.linalg.pinv(fisher_inf_matrix)

        # the 2 principal components using PCA
        scaled_covariance = np.dot(
            np.dot(np.asmatrix(np.diag(1 / (np.array(maximums) - np.array(minimums)))), covariance),
            np.asmatrix(np.diag(1 / (np.array(maximums) - np.array(minimums)))))
        pca = PCA(n_components=2)
        pca.fit(scaled_covariance)
        user_logger.info('covariance (scaled): \n', scaled_covariance)
        user_logger.info('principal axes: \n', pca.components_)
        user_logger.info('variance of first 2 principal components: \n', pca.explained_variance_)

        resampled_points_list = trunc_gauss(center_point_as_list, covariance, minimums, maximums,
                                            **self.resample_kwargs)

        # convert resampled points to a list of CalibrationPoint objects (multiple steps here)
        resampled_points_df = pd.DataFrame(data=resampled_points_list, columns=original_column_names)

        # attach static parameters
        names = center_point.get_attribute('Name', parameter_type=CalibrationPoint.STATIC)
        values = center_point.get_attribute('Value', parameter_type=CalibrationPoint.STATIC)
        for i in range(len(names)):
            resampled_points_df = resampled_points_df.assign(**{str(names[i]): values[i]})
        resampled_points_df.sort_index(axis=1, inplace=True)

        # ck4, debugging only
        filename = os.path.join(self.output_location, 'cr-resampled-points.csv')  # J
        resampled_points_df.to_csv(filename)

        resampled_points = self._transform_df_points_to_calibrated_points(center_point,
                                                                          resampled_points_df)

        # ck4, debugging only
        from idmtools_calibra.resamplers.calibration_points import CalibrationPoints
        filename = os.path.join(self.output_location, 'cr-resampled-points-transformed.csv')  # K
        points = CalibrationPoints(resampled_points)
        points.write(filename)

        # this can be anything; will be made available for use in post_analysis() method in the from_resample argument
        for_post_analysis = np.concatenate((covariance, np.array(center_point_as_list)[None, :]), axis=0)

        # ck4, Verification, for review
        # J is the same as K, except for an additional indexing column in J
        # D is the same as E except for the LL column and end-of-line whitespace chars
        # E is the same as LLdata.csv from the RandomPerturbationResampler EXCEPT different param/LL column order (same values)
        #    -- This should be fine, as order preservation is only needed across (in/out) of CR resampling due to column renaming.
        # Parameter number ranges for input (cr-calibrated-points.csv) and output (cr-resampled-points.csv) are similar,
        #    e.g. col0 ~ 1.25, col1 ~ 2000, col2 ~ 0.65, col3 ~ 25

        # return reampled points
        return resampled_points, for_post_analysis

[docs]    def post_analysis(self, resampled_points, analyzer_results, from_resample):
        super().post_analysis(resampled_points, analyzer_results, from_resample=from_resample)
        output_filename = os.path.join(self.output_location, 'cr-resampled-points-ll.csv')
        resampled_points_ll_df = pd.DataFrame([rp.to_value_dict(include_likelihood=True) for rp in resampled_points])
        resampled_points_ll_df.to_csv(output_filename)

        covariance_matrix = from_resample[0:-1]
        center_calibrated = from_resample[-1]

        """ plotting """

        def plot_cov_ellipse(cov, pos, nstd=2, ax=None, **kwargs):
            """
            Plots an `nstd` sigma error ellipse based on the specified covariance
            matrix (`cov`). Additional keyword arguments are passed on to the
            ellipse patch artist.

            Args:
                cov : The 2x2 covariance matrix to base the ellipse on
                pos : The location of the center of the ellipse. Expects a 2-element
                    sequence of [x0, y0].
                nstd : The radius of the ellipse in numbers of standard deviations.
                    Defaults to 2 standard deviations.
                ax : The axis that the ellipse will be plotted on. Defaults to the
                    current axis.

                Additional keyword arguments are pass on to the ellipse patch.

            Returns:
                A matplotlib ellipse artist
            """

            def eigsorted(cov):
                vals, vecs = np.linalg.eigh(cov)
                order = vals.argsort()[::-1]
                return vals[order], vecs[:, order]

            if ax is None:
                ax = plt.gca()

            vals, vecs = eigsorted(cov)
            theta = np.degrees(np.arctan2(*vecs[:, 0][::-1]))

            # Width and height are "full" widths, not radius
            width, height = 2 * nstd * np.sqrt(vals)
            ellip = Ellipse(xy=pos, width=width, height=height, angle=theta, **kwargs)

            ax.add_artist(ellip)
            return ellip

        def scatterplot_matrix(samples, names, range_min, range_max, Covariance, center, **kwargs):
            """Plots a scatterplot matrix of subplots.  Each row of "data" is plotted
            against other rows, resulting in a nrows by nrows grid of subplots with the
            diagonal subplots labeled with "names".  Additional keyword arguments are
            passed on to matplotlib's "plot" command. Returns the matplotlib figure
            object containg the subplot grid."""
            numvars, numdata = samples.shape
            fig, axes = plt.subplots(nrows=numvars, ncols=numvars, figsize=(8, 8))
            fig.subplots_adjust(right=0.8, hspace=0.05, wspace=0.05)

            for ax in axes.flat:
                # Hide all ticks and labels
                ax.xaxis.set_visible(False)
                ax.yaxis.set_visible(False)

                # Set up ticks only on one side for the "edge" subplots...
                if ax.is_first_col():
                    ax.yaxis.set_ticks_position('left')
                if ax.is_last_col():
                    ax.yaxis.set_ticks_position('right')
                if ax.is_first_row():
                    ax.xaxis.set_ticks_position('top')
                if ax.is_last_row():
                    ax.xaxis.set_ticks_position('bottom')

            # Plot the data.
            for i, j in zip(*np.triu_indices_from(axes, k=1)):
                for x, y in [(i, j), (j, i)]:
                    scatterPlot = axes[x, y].scatter(samples[y], samples[x], **kwargs)
                    row_idx = np.array([y, x])
                    col_idx = np.array([y, x])
                    plot_cov_ellipse(Covariance[row_idx[:, None], col_idx], center[[y, x]], nstd=2, ax=axes[x, y],
                                     alpha=0.3, color='red')
                    if not (range_min is None):
                        axes[x, y].set_xlim(range_min[y], range_max[y])
                    if not (range_max is None):
                        axes[x, y].set_ylim(range_min[x], range_max[x])

            cax = fig.add_axes([0.85, 0.2, 0.05, 0.6])
            fig.colorbar(scatterPlot, cax=cax)
            plt.suptitle('Cramer Rao samples')

            # Label the diagonal subplots...
            for i, label in enumerate(names):
                axes[i, i].annotate(label, (0.5, 0.5), xycoords='axes fraction',
                                    ha='center', va='center')

            # Turn on the proper x or y axes ticks.
            for i, j in zip(range(numvars), itertools.cycle((-1, 0))):
                axes[j, i].xaxis.set_visible(True)
                axes[i, j].yaxis.set_visible(True)

            return fig

        resampled_points_dynamic = pd.DataFrame([rp.to_value_dict(parameter_type=CalibrationPoint.DYNAMIC,
                                                                  include_likelihood=True) for rp in resampled_points])

        pp = sns.pairplot(resampled_points_dynamic[resampled_points_dynamic.columns[0:-1]], size=1.8, aspect=1.8,
                          plot_kws=dict(edgecolor="k", linewidth=0.5), diag_kind="kde", diag_kws=dict(shade=True))

        fig = pp.fig
        fig.subplots_adjust(top=0.93, wspace=0.3)
        fig.savefig(os.path.join(self.output_location, 'samples_pairs.pdf'))
        fig.clf()
        plt.close(fig)
        del pp, fig

        # just to test plot the first two dynamic parameters
        fig, ax = plt.subplots()
        x = resampled_points_dynamic[resampled_points_dynamic.columns[0]]
        y = resampled_points_dynamic[resampled_points_dynamic.columns[1]]
        colors = resampled_points_dynamic[resampled_points_dynamic.columns[-1]]
        plt.scatter(x, y, c=colors, alpha=0.3, cmap='viridis')
        plt.xlabel(resampled_points_dynamic.columns.values[0])
        plt.ylabel(resampled_points_dynamic.columns.values[1])
        plt.title('log-likelihood of the samples')
        plt.xlim([rp._filter_parameters(parameter_type=CalibrationPoint.DYNAMIC) for rp in resampled_points][0][0].min,
                 [rp._filter_parameters(parameter_type=CalibrationPoint.DYNAMIC) for rp in resampled_points][0][0].max)
        plt.ylim([rp._filter_parameters(parameter_type=CalibrationPoint.DYNAMIC) for rp in resampled_points][0][1].min,
                 [rp._filter_parameters(parameter_type=CalibrationPoint.DYNAMIC) for rp in resampled_points][0][1].max)
        plt.colorbar()
        plt.savefig(os.path.join(self.output_location, 'samples_pairs_LL.pdf'))
        fig.clf()
        plt.close(fig)
        del ax, fig

        # scatter plot matrix
        minimums = resampled_points[0].get_attribute(key='Min', parameter_type=CalibrationPoint.DYNAMIC)
        maximums = resampled_points[0].get_attribute(key='Max', parameter_type=CalibrationPoint.DYNAMIC)
        fig = scatterplot_matrix(np.array(resampled_points_dynamic[resampled_points_dynamic.columns[0:-1]]).T,
                                 list(resampled_points_dynamic.columns.values[0:-1]), minimums, maximums,
                                 covariance_matrix, center_calibrated, c=colors, alpha=0.3, cmap='Greens')
        fig.savefig(os.path.join(self.output_location, 'scatterplot_matrix_entireRange.pdf'))
        fig.clf
        del fig

        fig = scatterplot_matrix(np.array(resampled_points_dynamic[resampled_points_dynamic.columns[0:-1]]).T,
                                 list(resampled_points_dynamic.columns.values[0:-1]), range_min=None, range_max=None,
                                 Covariance=covariance_matrix, center=center_calibrated,
                                 c=colors, alpha=0.3, cmap='Greens')
        fig.savefig(os.path.join(self.output_location, 'scatterplot_matrix_NoScale.pdf'))
        fig.clf
        del fig