import math
import numpy as np
from scipy.special import gammaln
"""
Functions below compare "ref" and "sim" columns of a pandas.DataFrame
"""
[docs]def dirichlet_multinomial_pandas(df):
# Get the number of total observations aggregated over the last level in MultiIndex
# e.g. if levels are ['Channel', 'Season', 'Age Bin', 'PfPR Bin']
# keep first three in index, while summing over the last level.
sum_levels = df.index.names[:-1]
n_obs = df.sum(level=sum_levels)
n_categories = len(df.index.levels[-1])
n_obs['LL'] = gammaln(n_obs.ref + 1) + gammaln(n_obs.sim) - gammaln(n_obs.ref + n_obs.sim + n_categories) + gammaln(
df.ref + df.sim + 1).sum(level=sum_levels) - gammaln(df.sim + 1).sum(level=sum_levels) - gammaln(df.ref + 1).sum(level=sum_levels)
return n_obs.LL.mean() / n_categories
[docs]def gamma_poisson_pandas(df):
ll = gammaln(df.ref.Observations + df.sim.Observations + 1) - gammaln(df.ref.Observations + 1) - gammaln(df.sim.Observations + 1)
ix = df.ref.Trials > 0
ll.loc[ix] += (df.loc[ix].ref.Observations + 1) * np.log(df.loc[ix].ref.Trials)
ix = df.sim.Trials > 0
ll.loc[ix] += (df.loc[ix].sim.Observations + 1) * np.log(df.loc[ix].sim.Trials)
ix = (df.ref.Trials > 0) & (df.sim.Trials > 0)
ll.loc[ix] -= (df.loc[ix].ref.Observations + df.loc[ix].sim.Observations + 1) * np.log(df.loc[ix].ref.Trials + df.loc[ix].sim.Trials)
return ll.mean()
[docs]def beta_binomial_pandas(df):
ll = gammaln(df.ref.Trials + 1) + gammaln(df.sim.Trials + 2) - gammaln(df.ref.Trials + df.sim.Trials + 2) + gammaln(
df.ref.Observations + df.sim.Observations + 1) + gammaln(
df.ref.Trials - df.ref.Observations + df.sim.Trials - df.sim.Observations + 1) - gammaln(df.ref.Observations + 1) - gammaln(
df.ref.Trials - df.ref.Observations + 1) - gammaln(df.sim.Observations + 1) - gammaln(df.sim.Trials - df.sim.Observations + 1)
return ll.mean()
"""
Functions below were ported by J.Gerardin
from K.McCarthy Matlab CalibTool versions
"""
[docs]def dirichlet_multinomial(raw_data, sim_data):
num_age_bins = len(raw_data[:, 0])
num_cat_bins = len(raw_data[0, :])
raw_nobs = [sum(raw_data[i, :]) for i in range(num_age_bins)]
sim_nobs = [sum(sim_data[i, :]) for i in range(num_age_bins)]
LL = 0.
for agebin in range(num_age_bins):
LL += gammaln(raw_nobs[agebin] + 1)
LL += gammaln(sim_nobs[agebin])
LL -= gammaln(raw_nobs[agebin] + sim_nobs[agebin] + num_cat_bins)
for catbin in range(num_cat_bins):
LL += gammaln(raw_data[agebin, catbin] + sim_data[agebin, catbin] + 1)
LL -= gammaln(sim_data[agebin, catbin] + 1)
LL -= gammaln(raw_data[agebin, catbin] + 1)
LL /= (num_age_bins * num_cat_bins)
return LL
[docs]def dirichlet_single(raw_data, sim_data):
num_cat_bins = len(raw_data)
raw_nobs = sum(raw_data)
sim_nobs = sum(sim_data)
ll = 0.
ll += gammaln(raw_nobs + 1)
ll += gammaln(sim_nobs + num_cat_bins)
ll -= gammaln(raw_nobs + sim_nobs + num_cat_bins)
for catbin in range(num_cat_bins):
ll += gammaln(raw_data[catbin] + sim_data[catbin] + 1)
ll -= gammaln(sim_data[catbin] + 1)
ll -= gammaln(raw_data[catbin] + 1)
ll /= num_cat_bins
return ll
[docs]def beta_binomial(raw_nobs, sim_nobs, raw_data, sim_data, return_mean=True):
num_bins = len(raw_data)
ll = 0.
for this_bin in range(num_bins):
ll += gammaln(raw_nobs[this_bin] + 1)
ll += gammaln(sim_nobs[this_bin] + 2)
ll -= gammaln(raw_nobs[this_bin] + sim_nobs[this_bin] + 2)
ll += gammaln(raw_data[this_bin] + sim_data[this_bin] + 1)
ll += gammaln(raw_nobs[this_bin] - raw_data[this_bin] + sim_nobs[this_bin] - sim_data[this_bin] + 1)
ll -= gammaln(raw_data[this_bin] + 1)
ll -= gammaln(raw_nobs[this_bin] - raw_data[this_bin] + 1)
ll -= gammaln(sim_data[this_bin] + 1)
ll -= gammaln(sim_nobs[this_bin] - sim_data[this_bin] + 1)
if num_bins != 0 and return_mean:
ll /= num_bins
return ll
[docs]def gamma_poisson(raw_nobs, sim_nobs, raw_data, sim_data, return_mean=True):
num_bins = len(raw_data)
ll = 0.
for this_bin in range(num_bins):
if raw_nobs[this_bin] > 0:
ll += (raw_data[this_bin] + 1) * math.log(raw_nobs[this_bin])
if sim_nobs[this_bin] > 0:
ll += (sim_data[this_bin] + 1) * math.log(sim_nobs[this_bin])
if raw_nobs[this_bin] + sim_nobs[this_bin] > 0:
ll -= (raw_data[this_bin] + sim_data[this_bin] + 1) * math.log(raw_nobs[this_bin] + sim_nobs[this_bin])
ll += gammaln(raw_data[this_bin] + sim_data[this_bin] + 1)
ll -= gammaln(raw_data[this_bin] + 1)
ll -= gammaln(sim_data[this_bin] + 1)
if num_bins != 0 and return_mean:
ll /= num_bins
return ll
"""
Other weighting functions
"""
[docs]def euclidean_distance(raw_data, sim_data):
num_obs = len(raw_data)
return math.sqrt(sum([(raw_data[x] - sim_data[x]) ** 2 for x in range(num_obs)])) * -1
[docs]def euclidean_distance_pandas(df):
return math.sqrt(np.sum((np.array(df.ref) - np.array(df.sim)) ** 2)) * -1
[docs]def weighted_squares(raw_data, sim_data):
num_obs = len(raw_data)
return math.sqrt(sum([(raw_data[x] - sim_data[x]) ** 2 / raw_data[x] for x in range(num_obs)])) * -1