'''
Define classes and functions for the Experiment class (running sims and comparing them to data)
'''
import math
import yaml
import numpy as np
import pylab as pl
import pandas as pd
import sciris as sc
from .settings import options as fpo
from . import defaults as fpd
from . import parameters as fpp
from . import sim as fps
__all__ = ['Experiment', 'Fit', 'compute_gof', 'diff_summaries']
# Additional settings for the analyses below
min_age = 15
max_age = 50
bin_size = 5
first_birth_age = 25 # Minimum age at which to assess age at first birth in the model
mpy = 12 # Months per year
# Flags for what to run
default_flags = sc.objdict(
popsize = 1, # Population size and growth over time on whole years, scaled by the number of agents; 'pop_size'
ageparity = 1, # Population distribution of agents in each age/parity bin (age-parity plot); 'ageparity'
first_birth = 1, # Age at first birth mean with standard deviation; 'age_first_birth'
birth_space = 1, # Birth spacing both in bins and mean with standard deviation; 'spacing'
mcpr = 1, # Modern contraceptive prevalence; 'mcpr'
methods = 1, # Overall percentage of method use and method use among users; 'methods'
mmr = 1, # Maternal mortality ratio at end of sim in model vs data; 'maternal_mortality_ratio'
infant_m = 1, # Infant mortality rate at end of sim in model vs data; 'infant_mortality_rate'
cdr = 1, # Crude death rate at end of sim in model vs data; 'crude_death_rate'
cbr = 1, # Crude birth rate (per 1000 inhabitants); 'crude_birth_rate'
tfr = 1, # Total fertility rate
asfr = 1, # Age-specific fertility rate
empowerment = 0, # Empowerment metrics, i.e. paid employment and education params
)
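# A sketch of typical flag usage (illustrative; assumes the package is imported as
# ``fp``): copy the defaults, toggle the analyses you want, and pass the result to
# Experiment:
#
#     flags = sc.dcp(fp.experiment.default_flags)
#     flags.empowerment = 1 # Also extract empowerment metrics
#     exp = fp.Experiment(flags=flags)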
class Experiment(sc.prettyobj):
'''
Class for running calibration to data. Effectively, it runs a single sim and
compares it to data.
Args:
pars (dict): dictionary of parameters
flags (dict): which analyses to run; see ``fp.experiment.default_flags`` for options
label (str): label of experiment
kwargs (dict): passed into pars
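**Example** (a minimal sketch, assuming the package is imported as ``fp`` and data files are available for the default location)::
exp = fp.Experiment()
exp.run()
exp.plot()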
'''
def __init__(self, pars=None, flags=None, label=None, **kwargs):
self.flags = sc.mergedicts(default_flags, flags, _copy=True) # Set flags for what gets run
self.pars = pars if pars else fpp.pars(**kwargs)
self.location = self.pars['location']
self.model = sc.objdict()
self.data = sc.objdict()
self.method_keys = None
self.initialized = False
self.label = label
return
def load_data(self, key, **kwargs):
''' Load data from various formats '''
files = self.pars['filenames']
path = files['base'] / files[key]
if path.suffix == '.obj':
data = sc.load(path, **kwargs)
elif path.suffix == '.json':
data = sc.loadjson(path, **kwargs)
elif path.suffix == '.csv':
data = pd.read_csv(path, **kwargs)
if str(path).endswith('region.csv'):
region = self.location
if region == 'benishangul_gumuz':
region = region.replace('_', '-').title() # Replace underscore with dash and capitalize each word
elif region == 'snnpr':
region = 'SNNPR'
else:
region = region.replace('_', ' ').title()
data = data.loc[data['region'] == region]
elif path.suffix == '.yaml':
with open(path) as f:
data = yaml.safe_load(f, **kwargs)
else:
errormsg = f'Unrecognized file format for: {path}'
raise ValueError(errormsg)
return data
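# For example (illustrative), ``self.load_data('tfr')`` loads the CSV file registered
# under pars['filenames']['tfr'] and returns it as a dataframe.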
def pop_growth_rate(self, years, population):
''' Calculate the percentage growth rate between consecutive population values '''
growth_rate = np.zeros(len(years) - 1)
for i in range(len(years) - 1): # Stop one short of the end, since each rate uses the next value
growth_rate[i] = ((population[i + 1] - population[i]) / population[i]) * 100
return growth_rate
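# For example (illustrative), populations [100, 110, 121] over three years give
# growth rates [10.0, 10.0] (percent per year).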
def run_model(self, pars=None, **kwargs):
''' Create the sim and run the model '''
if not self.initialized:
self.extract_data()
if pars is None:
pars = self.pars
self.sim = fps.Sim(pars=pars, **kwargs)
self.sim.run()
return
def extract_model(self):
sres = self.sim.results
if self.flags.popsize: self.model_pop_size(sres)
if self.flags.mcpr: self.model_mcpr(sres)
if self.flags.mmr: self.model_mmr(sres)
if self.flags.infant_m: self.model_infant_mortality_rate(sres)
if self.flags.cdr: self.model_crude_death_rate(sres)
if self.flags.cbr: self.model_crude_birth_rate(sres)
if self.flags.tfr: self.model_data_tfr(sres)
if self.flags.asfr: self.model_data_asfr(sres)
return
def model_pop_size(self, sres=None):
self.model['pop_size'] = sres['pop_size']
self.model['pop_years'] = sres['tfr_years']
model_growth_rate = self.pop_growth_rate(self.model['pop_years'], self.model['pop_size'])
self.model['pop_growth_rate'] = model_growth_rate
return
def model_mcpr(self, sres=None):
model = {'years': sres['t'], 'mcpr': sres['mcpr']}
model_frame = pd.DataFrame(model)
# Filter to matching years
data_years = self.data['mcpr_years'].tolist()
filtered_model = model_frame.loc[model_frame.years.isin(data_years)]
model_mcpr = filtered_model['mcpr'].to_numpy()
mcpr_years = filtered_model['years'].to_numpy()
self.model['mcpr'] = model_mcpr*100 # Convert to a percentage, to match the data
self.model['mcpr_years'] = mcpr_years
return
def model_mmr(self, sres=None):
'''
Calculate maternal mortality in model over most recent 3 years
'''
maternal_deaths = np.sum(sres['maternal_deaths'][-mpy * 3:])
births_last_3_years = np.sum(sres['births'][-mpy * 3:])
self.model['maternal_mortality_ratio'] = sc.safedivide(maternal_deaths, births_last_3_years) * 100000
return
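# Illustrative scale: 12 maternal deaths over 3,000 births gives an MMR of
# 12/3000 * 100,000 = 400 maternal deaths per 100,000 live births.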
def model_infant_mortality_rate(self, sres=None):
infant_deaths = np.sum(sres['infant_deaths'][-mpy:])
births_last_year = np.sum(sres['births'][-mpy:])
self.model['infant_mortality_rate'] = sc.safedivide(infant_deaths, births_last_year) * 1000
return
def model_crude_death_rate(self, sres=None):
total_deaths = np.sum(sres['deaths'][-mpy:]) + \
np.sum(sres['infant_deaths'][-mpy:]) + \
np.sum(sres['maternal_deaths'][-mpy:])
self.model['crude_death_rate'] = (total_deaths / sres['pop_size'][-1]) * 1000
return
def model_crude_birth_rate(self, sres=None):
births_last_year = np.sum(sres['births'][-mpy:])
self.model['crude_birth_rate'] = (births_last_year / sres['pop_size'][-1]) * 1000
return
def model_data_tfr(self, sres=None):
# Extract TFR over time from the data; kept here (rather than in data loading) so the DHS data are ignored if TFR is not used for calibration
tfr = self.load_data('tfr') # From DHS
self.data['tfr_years'] = tfr['year'].to_numpy()
self.data['total_fertility_rate'] = tfr['tfr'].to_numpy()
self.model['tfr_years'] = sres['tfr_years']
self.model['total_fertility_rate'] = sres['tfr_rates']
return
def model_data_asfr(self, sres=None, ind=-1):
# Extract ASFR for different age bins
asfr = self.load_data('asfr') # From DHS
age_bins = list(asfr.columns)
age_bins.remove('year')
# Save asfr and asfr_bins to data dictionary
year_data = asfr[asfr['year'] == self.pars['end_year']]
if 'region' in age_bins:
age_bins.remove('region')
self.data['asfr'] = year_data.drop(['year', 'region'], axis=1).values.tolist()[0]
else:
self.data['asfr'] = year_data.drop(['year'], axis=1).values.tolist()[0]
self.data['asfr_bins'] = age_bins
# Model extraction
age_bins = list(fpd.age_bin_map.keys())
self.model['asfr_bins'] = age_bins
self.model['asfr'] = []
for ab in age_bins:
val = sres['asfr'][ab][ind] # Only use one index (default: last) CK: TODO: match year automatically
self.model['asfr'].append(val)
# Check
assert self.data['asfr_bins'] == self.model['asfr_bins'], f'ASFR data age bins do not match sim: {sc.strjoin(age_bins)}'
return
def extract_ageparity(self):
# Set up
age_keys = list(fpd.age_bin_map.keys())[1:]
age_bins = pl.arange(min_age, max_age, bin_size)
parity_bins = pl.arange(0, 7) # Plot up to parity 6
n_age = len(age_bins)
n_parity = len(parity_bins)
# Load data. Note: by default, the dataset used for comparison with the model is
# the last (typically the most recent) dataset in the ageparity file
sky_raw_data = self.load_data('ageparity')
dataset = sky_raw_data.iloc[-1]['dataset']
sky_raw_data = sky_raw_data[sky_raw_data.dataset == dataset]
sky_parity = sky_raw_data['parity'].to_numpy()
sky_props = sky_raw_data['percentage'].to_numpy()
sky_arr = sc.odict()
sky_arr['Data'] = pl.zeros((len(age_keys), len(parity_bins)))
for _, row in sky_raw_data.iterrows(): # The index is unused
if row.age in age_keys and row.parity < n_parity:
age_ind = age_keys.index(row.age)
sky_arr['Data'][age_ind, row.parity] = row.percentage
# Extract from model
ppl = self.sim.people
alive_f = ppl.filter(ppl.alive * ~ppl.sex)
m_age_bins = np.append(age_bins,max_age)
m_parity_bins = np.append(parity_bins, parity_bins[-1]+1)
counts, _, _ = np.histogram2d(alive_f.age, alive_f.parity, bins=(m_age_bins, m_parity_bins))
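# Note: np.histogram2d takes bin *edges*, so the upper bounds are appended above:
# e.g. age_bins [15, 20, ..., 45] plus max_age 50 gives edges [15, 20, ..., 45, 50],
# i.e. age bins [15,20), ..., [45,50].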
sky_arr['Model'] = counts
# Normalize
for key in ['Data', 'Model']:
sky_arr[key] /= sky_arr[key].sum() / 100
self.data['ageparity'] = sky_arr['Data']
self.model['ageparity'] = sky_arr['Model']
self.age_bins = age_bins
self.parity_bins = parity_bins
return
def extract_birth_spacing(self):
# Set up
data_afb = self.load_data('afb')
data_afb = data_afb.sort_values(by='afb')
data_spaces = self.load_data('spacing')
data_spaces = data_spaces.sort_values(by='space_mo')
spacing_bins = sc.odict({'0-12': 0, '12-24': 1, '24-48': 2, '>48': 4}) # Spacing bins in years
data_spacing_counts = sc.odict().make(keys=spacing_bins.keys(), vals=0.0)
# Extract birth spaces from model
ppl = self.sim.people
gt1_birth = ppl.filter(ppl.alive * ~ppl.sex * (ppl.parity > 1)) # Alive women with more than one birth
birth_spaces = np.diff(gt1_birth.birth_ages) # Birth spacings
defined_vals = ~np.isnan(birth_spaces) * (birth_spaces > 0) # Exclude NaNs and zero spaces (twins)
model_spacing = birth_spaces[defined_vals] # Remove NaNs and twins
model_spacing_counts, _ = np.histogram(model_spacing, bins=np.append(spacing_bins.values(), 10)) # Bin
model_spacing_counts = model_spacing_counts / model_spacing_counts[:].sum() # Normalize
model_spacing_counts[:] *= 100 # Percentages
# Extract age at first birth from model
any_births = ppl.filter(ppl.alive * ~ppl.sex * (ppl.parity > 0)) # Alive women with at least one birth
model_age_first = any_births.birth_ages[:,0]
# Extract birth spaces and age at first birth from data
for i, j in data_spaces.iterrows():
space = j['space_mo'] / mpy
ind = sc.findinds(space > spacing_bins[:])[-1]
data_spacing_counts[ind] += j['Freq']
# Normalize data birth space bin counts to percentages
data_spacing_counts[:] /= data_spacing_counts[:].sum()
data_spacing_counts[:] *= 100
# Extract afb and respective weights data
afb_values = data_afb["afb"].values.tolist()
afb_weights = data_afb["wt"].values.tolist()
# Calculate the cumulative weights and total weight of the afb data
afb_cum_weights = np.cumsum(afb_weights)
afb_total_weight = afb_cum_weights[-1]
# Extract birth spacing and respective frequency data
birth_spacing_values = data_spaces["space_mo"].values.tolist()
birth_spacing_weights = data_spaces["Freq"].values.tolist()
# Calculate the cumulative weights and total weight of the birth spacing data
birth_spacing_cum_weights = np.cumsum(birth_spacing_weights)
birth_spacing_total_weight = birth_spacing_cum_weights[-1]
data_age_first_stats = np.array([np.interp((.25 * afb_total_weight), afb_cum_weights, afb_values),
np.interp((.50 * afb_total_weight), afb_cum_weights, afb_values),
np.interp((.75 * afb_total_weight), afb_cum_weights, afb_values)])
data_spacing_stats = np.array([np.interp((.25 * birth_spacing_total_weight), birth_spacing_cum_weights, birth_spacing_values),
np.interp((.50 * birth_spacing_total_weight), birth_spacing_cum_weights, birth_spacing_values),
np.interp((.75 * birth_spacing_total_weight), birth_spacing_cum_weights, birth_spacing_values)])
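# The interpolation above computes weighted quartiles: e.g. (illustrative) values
# [10, 20, 30] with weights [1, 1, 2] have cumulative weights [1, 2, 4], so the
# weighted median interpolates np.interp(0.5*4, [1, 2, 4], [10, 20, 30]) = 20.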
# Save to dictionary
self.data['spacing_bins'] = np.array(data_spacing_counts.values())
self.data['spacing_stats'] = data_spacing_stats
self.data['age_first_stats'] = data_age_first_stats
try:
model_spacing_stats = np.array([np.percentile(model_spacing, 25),
np.percentile(model_spacing, 50),
np.percentile(model_spacing, 75)])
model_age_first_stats = np.array([np.percentile(model_age_first, 25),
np.percentile(model_age_first, 50),
np.percentile(model_age_first, 75)])
except Exception as E: # pragma: nocover
print(f'Could not calculate birth spacing, returning zeros: {E}')
model_spacing_counts = np.zeros(len(spacing_bins))
model_spacing_stats = np.zeros(data_spacing_stats.shape)
model_age_first_stats = np.zeros(data_age_first_stats.shape)
# Save arrays to dictionary
self.model['spacing_bins'] = np.array(model_spacing_counts)
self.model['spacing_stats'] = model_spacing_stats
self.model['age_first_stats'] = model_age_first_stats
return
def extract_methods(self):
data_method_counts = sc.odict()
model_method_counts = sc.odict()
# Extract from data
data_methods = self.load_data('methods')
for index, row in data_methods.iterrows():
data_method_counts[row['method']] = row['perc']
# Update data method mix using non-user percentage from 'use' file
data_use = self.load_data('use')
if 'region' in data_use.columns:
latest_data = data_use[data_use['year'] == data_use['year'].max()]
data_method_counts['None'] = latest_data.loc[latest_data['var1'] == 0, 'perc'].values[0]
use_freq = (latest_data.loc[latest_data['var1'] == 1, 'perc'].values[0]) / 100
else:
data_method_counts['None'] = data_use.loc[0, 'perc']
use_freq = (data_use.loc[1, 'perc'])/100
for key, value in data_method_counts.items():
value /= 100
if key != 'None':
value *= use_freq
data_method_counts.update({key: value})
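# The loop above rescales the method mix (reported among users) to the whole
# population: e.g. (illustrative) if 20% of women use any method and injectables
# are 50% of use among users, injectables become 0.5 * 0.2 = 10% of all women.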
# Extract from model
ppl = self.sim.people
alive_f = ppl.filter(ppl.alive * ~ppl.sex)
model_methods = alive_f.method
model_method_counts,_ = np.histogram(model_methods, bins=np.arange(11))
model_method_counts = model_method_counts/model_method_counts.sum()
model_labels = [m.label for m in self.sim.contraception_module.methods.values()]
# Make labels
data_labels = data_method_counts.keys()
for d in range(len(data_labels)):
if data_method_counts[d] > 0.01:
data_labels[d] = f'{data_labels[d]}: {data_method_counts[d] * 100:0.1f}%'
else:
data_labels[d] = ''
for d in range(len(model_labels)):
if model_method_counts[d] > 0.01:
model_labels[d] = f'{model_labels[d]}: {model_method_counts[d] * 100:0.1f}%'
else:
model_labels[d] = ''
self.data['method_counts'] = np.array(data_method_counts.values())
self.model['method_counts'] = np.array(model_method_counts)
return
def extract_education(self):
# Extract education from data
dhs_data_education = self.load_data('education')
data_edu = dhs_data_education[['age', 'edu']].sort_values(by='age')
data_edu = data_edu.query(f"{min_age} <= age < {max_age}").copy()
age_bins = np.arange(min_age, max_age + 1, bin_size)
data_edu['age_group'] = pd.cut(data_edu['age'], bins=age_bins, right=False)
# Calculate mean for each age bin
education_data_grouped = data_edu.groupby('age_group', observed=False)['edu']
self.data['education'] = education_data_grouped.mean().tolist()
# Extract education from model
model_edu_years = {age_bin: [] for age_bin in np.arange(min_age, max_age, bin_size)}
ppl = self.sim.people
for i in range(len(ppl)):
if ppl.alive[i] and not ppl.sex[i] and min_age <= ppl.age[i] < max_age:
age_bin = age_bins[sc.findinds(age_bins <= ppl.age[i])[-1]]
model_edu_years[age_bin].append(ppl.edu_attainment[i])
# Calculate average years of educational attainment for each age bin
model_edu_mean = []
for age_group in model_edu_years:
if len(model_edu_years[age_group]) != 0:
avg_edu = sum(model_edu_years[age_group]) / len(model_edu_years[age_group])
model_edu_mean.append(avg_edu)
else:
model_edu_mean.append(0)
self.model['education'] = model_edu_mean
return
def compute_fit(self, *args, **kwargs):
''' Compute how good the fit is '''
data = sc.dcp(self.data)
try:
sim = sc.dcp(self.model, die=False) # Sometimes fails with a dict_keys copy error (!)
except:
sim = {k:self.model[k] for k in data.keys()}
for k in data.keys():
data[k] = sc.promotetoarray(data[k])
data[k] = data[k].flatten()
sim[k] = sc.promotetoarray(sim[k])
sim[k] = sim[k].flatten()
self.fit = Fit(data, sim, *args, **kwargs)
return
def post_process_results(self, keep_people=False, compute_fit=True, **kwargs):
''' Compare the model and the data '''
self.extract_model()
if self.flags.ageparity: self.extract_ageparity()
if self.flags.birth_space: self.extract_birth_spacing()
if self.flags.methods: self.extract_methods()
if self.flags.empowerment:
self.extract_employment()
self.extract_education()
# Compute comparison
self.df = self.compare()
# Compute fit
if compute_fit:
self.compute_fit(**kwargs)
return
def run(self, pars=None, keep_people=False, compute_fit=True, **kwargs):
''' Run the model and post-process the results '''
self.run_model(pars=pars)
self.post_process_results(keep_people=keep_people, compute_fit=compute_fit, **kwargs)
return self
def compare(self):
''' Create and print a comparison between model and data '''
# Check that keys match
data_keys = self.data.keys()
model_keys = self.model.keys()
assert set(data_keys) == set(model_keys), 'Data and model keys do not match'
# Compare the two
comparison = []
for key in data_keys:
dv = self.data[key] # dv = "Data value"
mv = self.model[key] # mv = "Model value"
cmp = sc.objdict(key=key,
d_type=type(dv),
m_type=type(mv),
d_shape=np.shape(dv),
m_shape=np.shape(mv),
d_val='array',
m_val='array')
if sc.isnumber(dv):
cmp.d_val = dv
if sc.isnumber(mv):
cmp.m_val = mv
comparison.append(cmp)
self.comparison_df = pd.DataFrame.from_dict(comparison)
return self.comparison_df
def summarize(self, as_df=False):
'''
Convert results to a one-number-per-key summary format. Returns summary,
also saves to self.summary.
Args:
as_df (bool): if True, return a dataframe instead of a dict.
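**Example** (sketch, assuming ``exp`` is an Experiment that has been run)::
summary = exp.summarize()
df = exp.summarize(as_df=True)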
'''
summary = sc.objdict()
summary.model = sc.objdict()
summary.data = sc.objdict()
data = self.data
model = self.model
keys = model.keys()
# Compare the two
for key in keys:
if not (key.endswith('_years') or key.endswith('_bins')):
dv = data[key] # dv = "Data value"
mv = model[key] # mv = "Model value"
if sc.isnumber(mv) and sc.isnumber(dv):
summary.data[key] = dv
summary.model[key] = mv
else:
summary.data[key+'_mean'] = np.mean(dv)
summary.model[key+'_mean'] = np.mean(mv)
self.summary = summary
self.summary_df = pd.DataFrame(summary)
if as_df:
return self.summary_df
else:
return self.summary
def to_json(self, filename=None, tostring=False, indent=2, verbose=False, **kwargs):
'''
Export results as JSON.
Args:
filename (str): if None, return string; else, write to file
tostring (bool): if not writing to file, whether to write to string (alternative is sanitized dictionary)
indent (int): if writing to file, how many indents to use per nested level
verbose (bool): detail to print
kwargs (dict): passed to savejson()
Returns:
A unicode string containing a JSON representation of the results,
or writes the JSON file to disk
**Examples**::
json = exp.to_json()
exp.to_json('results.json')
'''
d = self.summarize()
if filename is None:
output = sc.jsonify(d, tostring=tostring, indent=indent, verbose=verbose, **kwargs)
else:
output = sc.savejson(filename=filename, obj=d, indent=indent, **kwargs)
return output
def plot(self, do_show=None, do_save=None, filename='fp_experiment.png', axis_args=None, do_maximize=True):
''' Plot the model against the data '''
data = self.data
sim = self.model
# Set up keys structure and remove non-plotted keys
keys = ['rates'] + list(data.keys())
rate_keys = ['maternal_mortality_ratio',
'infant_mortality_rate',
'crude_death_rate',
'crude_birth_rate']
non_calibrated_keys = ['pop_years', 'mcpr_years', 'tfr_years', 'asfr_bins']
for key in rate_keys + non_calibrated_keys:
if key in keys:
keys.remove(key)
nkeys = len(keys)
expected = 11
if nkeys != expected:
errormsg = f'Number of keys changed -- expected {expected}, got {nkeys} -- did you use run_model() instead of run()?'
raise ValueError(errormsg)
with fpo.with_style():
fig, axs = pl.subplots(nrows=4, ncols=3)
pl.subplots_adjust(**sc.mergedicts(dict(bottom=0.05, top=0.97, left=0.05, right=0.97, wspace=0.3, hspace=0.3), axis_args))
#%% Do the plotting!
# Rates
ax = axs[0,0]
height = 0.4
n_rates = len(rate_keys)
y = np.arange(n_rates)
data_rates = np.array([data[k] for k in rate_keys])
sim_rates = np.array([sim[k] for k in rate_keys])
ax.barh(y=y+height/2, width=data_rates, height=height, align='center', label='Data')
ax.barh(y=y-height/2, width=sim_rates, height=height, align='center', label='Sim')
ax.set_title('Rates')
ax.set_xlabel('Rate')
ax.set_yticks(range(n_rates))
ax.set_yticklabels(rate_keys)
ax.legend()
# Population size
ax = axs[1,0]
ax.plot(data.pop_years, data.pop_size, 'o', label='Data')
ax.plot(sim.pop_years, sim.pop_size, '-', label='Sim')
ax.set_title('Population size')
ax.set_xlabel('Year')
ax.set_ylabel('Population size')
ax.legend()
# Population growth rate
ax = axs[2,0]
ax.plot(data.pop_years[:-1], data.pop_growth_rate, 'o', label='Data')
ax.plot(sim.pop_years[:-1], sim.pop_growth_rate, '-', label='Sim')
ax.set_title('Population growth rate')
ax.set_xlabel('Year')
ax.set_ylabel('Population growth rate')
ax.legend()
# MCPR
ax = axs[3,0]
ax.plot(data.mcpr_years, data.mcpr, 'o', label='Data')
ax.plot(sim.mcpr_years, sim.mcpr, '-', label='Sim')
ax.set_title('MCPR')
ax.set_xlabel('Year')
ax.set_ylabel('Modern contraceptive prevalence rate')
ax.legend()
# Data age-parity
ax = axs[0,1]
ax.pcolormesh(self.age_bins, self.parity_bins, data.ageparity.transpose(), shading='nearest', cmap='turbo')
ax.set_aspect(1./ax.get_data_ratio()) # Make square
ax.set_title('Age-parity plot: data')
ax.set_xlabel('Age')
ax.set_ylabel('Parity')
# Sim age-parity
ax = axs[1,1]
ax.pcolormesh(self.age_bins, self.parity_bins, sim.ageparity.transpose(), shading='nearest', cmap='turbo')
ax.set_aspect(1./ax.get_data_ratio())
ax.set_title('Age-parity plot: sim')
ax.set_xlabel('Age')
ax.set_ylabel('Parity')
# Spacing bins
ax = axs[2, 1]
height = 0.4
spacing_bins = sc.odict({'0-12': 0, '12-24': 1, '24-48': 2, '>48': 4}) # Spacing bins in years
n_bins = len(spacing_bins.keys())
y = np.arange(len(data.spacing_bins))
ax.barh(y=y+height/2, width=data.spacing_bins, height=height, align='center', label='Data')
ax.barh(y=y-height/2, width=sim.spacing_bins, height=height, align='center', label='Sim')
ax.set_title('Birth spacing bins')
ax.set_xlabel('Percent of births in each bin')
ax.set_yticks(range(n_bins))
ax.set_yticklabels(spacing_bins.keys())
ax.set_ylabel('Birth space in months')
ax.legend()
# Age first stats
quartile_keys = ['25th %',
'Median',
'75th %']
n_quartiles = len(quartile_keys)
ax = axs[3,1]
height = 0.4
y = np.arange(len(data.age_first_stats))
ax.barh(y=y+height/2, width=data.age_first_stats, height=height, align='center', label='Data')
ax.barh(y=y-height/2, width=sim.age_first_stats, height=height, align='center', label='Sim')
ax.set_title('Age at first birth')
ax.set_xlabel('Age')
ax.set_yticks(range(n_quartiles))
ax.set_yticklabels(quartile_keys)
ax.legend()
# Method counts
ax = axs[2,2]
height = 0.4
y = np.arange(len(data.method_counts))
y1 = y + height/2
y2 = y - height/2
ax.barh(y=y1, width=data.method_counts, height=height, align='center', label='Data')
ax.barh(y=y2, width=sim.method_counts, height=height, align='center', label='Sim')
ax.set_yticks(y, self.method_keys)
ax.set_title('Method counts')
ax.set_ylabel('Contraceptive method')
ax.set_xlabel('Rate of use')
ax.legend()
# ASFR
ax = axs[3,2]
y = np.arange(len(data.asfr))
y1 = y + height/2
y2 = y - height/2
ax.barh(y=y1, width=data.asfr, height=height, align='center', label='Data')
ax.barh(y=y2, width=sim.asfr, height=height, align='center', label='Sim')
ax.set_yticks(y, sim.asfr_bins)
ax.set_title('Age-specific fertility rate')
ax.set_ylabel('Age bin')
ax.set_xlabel('Fertility rate')
ax.legend()
# Tidy up
if do_maximize:
sc.maximize(fig=fig)
return fps.tidy_up(fig=fig, do_show=do_show, do_save=do_save, filename=filename)
class Fit(sc.prettyobj):
'''
A class for calculating the fit between the model and the data. Note the
following terminology is used here:
- fit: nonspecific term for how well the model matches the data
- difference: the absolute numerical differences between the model and the data (one time series per result)
- goodness-of-fit: the result of passing the difference through a statistical function, such as mean squared error
- loss: the goodness-of-fit for each result multiplied by user-specified weights (one time series per result)
- mismatches: the sum of all the losses (a single scalar value per time series)
- mismatch: the sum of the mismatches -- this is the value to be minimized during calibration
Args:
data (dict): dictionary of data results (e.g. ``Experiment.data``)
sim (dict): dictionary of corresponding model results (e.g. ``Experiment.model``)
weights (dict): the relative weight to place on each result (default: 1 for every result)
keys (list): the keys to use in the calculation
custom (dict): a custom dictionary of additional data to fit; format is e.g. {'my_output':{'data':[1,2,3], 'sim':[1,2,4], 'weights':2.0}}
compute (bool): whether to compute the mismatch immediately
verbose (bool): detail to print
kwargs (dict): passed to compute_gof() -- see this function for more detail on goodness-of-fit calculation options
**Example**::
exp = fp.Experiment()
exp.run()
exp.fit.plot()
'''
def __init__(self, data, sim, weights=None, keys=None, custom=None, compute=True, verbose=False, **kwargs):
# Handle inputs
self.custom = sc.mergedicts(custom)
self.custom_keys = list(self.custom.keys()) # Store custom keys, used when plotting
self.verbose = verbose
self.weights = sc.mergedicts(weights)
self.gof_kwargs = kwargs
# Copy data
self.data = data
self.sim_results = sim
# Remove keys that aren't for fitting
for key in list(self.data.keys()): # Copy the keys, since entries are removed during iteration
if key.endswith('_years') or key.endswith('_bins'):
self.data.pop(key)
self.sim_results.pop(key)
self.keys = list(self.data.keys())
# These are populated during initialization
self.inds = sc.objdict() # To store matching indices between the data and the simulation
self.inds.sim = sc.objdict() # For storing matching indices in the sim
self.inds.data = sc.objdict() # For storing matching indices in the data
self.pair = sc.objdict() # For storing perfectly paired points between the data and the sim
self.diffs = sc.objdict() # Differences between pairs
self.gofs = sc.objdict() # Goodness-of-fit for differences
self.losses = sc.objdict() # Weighted goodness-of-fit
self.mismatches = sc.objdict() # Final mismatch values
self.mismatch = None # The final value
if compute:
self.compute()
return
def compute(self):
''' Perform all required computations '''
self.reconcile_inputs() # Find matching values
self.compute_diffs() # Perform calculations
self.compute_gofs()
self.compute_losses()
self.compute_mismatch()
return self.mismatch
def compute_diffs(self, absolute=False):
''' Find the differences between the sim and the data '''
for key in self.pair.keys():
self.diffs[key] = self.pair[key].sim - self.pair[key].data
if absolute:
self.diffs[key] = np.abs(self.diffs[key])
return
def compute_gofs(self, **kwargs):
''' Compute the goodness-of-fit '''
kwargs = sc.mergedicts(self.gof_kwargs, kwargs)
for key in self.pair.keys():
actual = sc.dcp(self.pair[key].data)
predicted = sc.dcp(self.pair[key].sim)
self.gofs[key] = compute_gof(actual, predicted, **kwargs)
return
def compute_losses(self):
''' Compute the weighted goodness-of-fit '''
for key in self.gofs.keys():
if key in self.weights:
weight = self.weights[key]
if sc.isiterable(weight): # It's an array
len_wt = len(weight)
len_sim = self.sim_npts
len_match = len(self.gofs[key])
if len_wt == len_match: # If the weight already is the right length, do nothing
pass
elif len_wt == len_sim: # Most typical case: it's the length of the simulation, must trim
weight = weight[self.inds.sim[key]] # Trim to matching indices
else:
errormsg = f'Could not map weight array of length {len_wt} onto simulation of length {len_sim} or data-model matches of length {len_match}'
raise ValueError(errormsg)
else:
weight = 1.0
self.losses[key] = self.gofs[key]*weight
return
def compute_mismatch(self, use_median=False):
''' Compute the final mismatch '''
for key in self.losses.keys():
if use_median:
self.mismatches[key] = np.median(self.losses[key])
else:
self.mismatches[key] = np.sum(self.losses[key])
self.mismatch = self.mismatches[:].sum()
return self.mismatch
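# Illustrative example: losses of {'a': [1, 2], 'b': [3]} give mismatches
# {'a': 3, 'b': 3} (with use_median=False) and a total mismatch of 6.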
def plot(self, keys=None, width=0.8, font_size=18, fig_args=None, axis_args=None, plot_args=None, do_show=True):
'''
Plot the fit of the model to the data. For each result, plot the data
and the model; the difference; and the loss (weighted difference). Also
plots the loss as a function of time.
Args:
keys (list): which keys to plot (default, all)
width (float): bar width
font_size (float): size of font
fig_args (dict): passed to pl.figure()
axis_args (dict): passed to pl.subplots_adjust()
plot_args (dict): passed to pl.plot()
do_show (bool): whether to show the plot
'''
fig_args = sc.mergedicts(dict(figsize=(36,22)), fig_args)
axis_args = sc.mergedicts(dict(left=0.05, right=0.95, bottom=0.05, top=0.95, wspace=0.3, hspace=0.3), axis_args)
plot_args = sc.mergedicts(dict(lw=4, alpha=0.5, marker='o'), plot_args)
pl.rcParams['font.size'] = font_size
if keys is None:
keys = self.keys + self.custom_keys
n_keys = len(keys)
loss_ax = None
colors = sc.gridcolors(n_keys)
n_rows = 3
fig = pl.figure(**fig_args)
pl.subplots_adjust(**axis_args)
for k,key in enumerate(keys):
if key in self.keys: # It's a standard result: plot against sim timesteps
days = self.inds.sim[key] # The matching timestep indices
daylabel = 'Timestep'
else: # It's custom, so we don't know the x-axis; just use indices
days = np.arange(len(self.losses[key]))
daylabel = 'Index'
pl.subplot(n_rows, n_keys, k+0*n_keys+1)
pl.plot(days, self.pair[key].data, c='k', label='Data', **plot_args)
pl.plot(days, self.pair[key].sim, c=colors[k], label='Simulation', **plot_args)
pl.title(key)
if k == 0:
pl.ylabel('Time series (counts)')
pl.legend()
pl.subplot(n_rows, n_keys, k+1*n_keys+1)
pl.bar(days, self.diffs[key], width=width, color=colors[k], label='Difference')
pl.axhline(0, c='k')
if k == 0:
pl.ylabel('Differences (counts)')
pl.legend()
loss_ax = pl.subplot(n_rows, n_keys, k+2*n_keys+1, sharey=loss_ax)
pl.bar(days, self.losses[key], width=width, color=colors[k], label='Losses')
pl.xlabel(daylabel)
pl.title(f'Total loss: {self.losses[key].sum():0.3f}')
if k == 0:
pl.ylabel('Losses')
pl.legend()
if do_show:
pl.show()
return fig
def compute_gof(actual, predicted, normalize=True, use_frac=False, use_squared=False, as_scalar='none', eps=1e-9, skestimator=None, **kwargs):
'''
Calculate the goodness of fit. By default use normalized absolute error, but
highly customizable. For example, mean squared error is equivalent to
setting normalize=False, use_squared=True, as_scalar='mean'.
Args:
actual (arr): array of actual (data) points
predicted (arr): corresponding array of predicted (model) points
normalize (bool): whether to divide the values by the largest value in either series
use_frac (bool): convert to fractional mismatches rather than absolute
use_squared (bool): square the mismatches
as_scalar (str): return as a scalar instead of a time series: choices are sum, mean, median
eps (float): to avoid divide-by-zero
skestimator (str): if provided, use this scikit-learn estimator instead
kwargs (dict): passed to the scikit-learn estimator
Returns:
gofs (arr): array of goodness-of-fit values, or a single value if as_scalar is True
**Examples**::
x1 = np.cumsum(np.random.random(100))
x2 = np.cumsum(np.random.random(100))
e1 = compute_gof(x1, x2) # Default, normalized absolute error
e2 = compute_gof(x1, x2, normalize=False, use_frac=True) # Fractional error
e3 = compute_gof(x1, x2, normalize=False, use_squared=True, as_scalar='mean') # Mean squared error
e4 = compute_gof(x1, x2, skestimator='mean_squared_error') # Scikit-learn's MSE method
e5 = compute_gof(x1, x2, as_scalar='median') # Normalized median absolute error -- highly robust
'''
# Handle inputs
actual = np.array(sc.dcp(actual), dtype=float)
predicted = np.array(sc.dcp(predicted), dtype=float)
# Custom estimator is supplied: use that
if skestimator is not None: # pragma: nocover
try:
import sklearn.metrics as sm
sklearn_gof = getattr(sm, skestimator) # Shortcut to e.g. sklearn.metrics.max_error
except ImportError as E:
raise ImportError(f'You must have scikit-learn >=0.22.2 installed: {str(E)}')
except AttributeError:
raise AttributeError(f'Estimator {skestimator} is not available; see https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter for options')
gof = sklearn_gof(actual, predicted, **kwargs)
return gof
# Default case: calculate it manually
else:
# Key step -- calculate the mismatch!
gofs = abs(np.array(actual) - np.array(predicted))
if normalize and not use_frac:
actual_max = abs(actual).max()
if actual_max>0:
gofs /= actual_max
if use_frac:
if (actual<0).any() or (predicted<0).any():
print('Warning: Calculating fractional errors for non-positive quantities is ill-advised!')
else:
maxvals = np.maximum(actual, predicted) + eps
gofs /= maxvals
if use_squared:
gofs = gofs**2
if as_scalar == 'sum':
gofs = np.sum(gofs)
elif as_scalar == 'mean':
gofs = np.mean(gofs)
elif as_scalar == 'median':
gofs = np.median(gofs)
return gofs
def diff_summaries(sim1, sim2, skip_key_diffs=False, output=False, die=False):
'''
Compute the difference of the summaries of two FPsim calibration objects, and print any
values which differ.
Args:
sim1 (sim/dict): the calib.summary dictionary, representing a single sim
sim2 (sim/dict): ditto
skip_key_diffs (bool): whether to skip keys that don't match between sims
output (bool): whether to return the output as a string (otherwise print)
die (bool): whether to raise an exception if the sims don't match
**Example**::
c1 = fp.Calibration()
c2 = fp.Calibration()
c1.run()
c2.run()
fp.diff_summaries(c1.summarize(), c2.summarize())
'''
for sim in [sim1, sim2]:
if not isinstance(sim, dict): # pragma: no cover
errormsg = f'Cannot compare object of type {type(sim)}, must be a FPsim calib.summary dict'
raise TypeError(errormsg)
# Ignore data for now
sim1 = sim1['model']
sim2 = sim2['model']
# Compare keys
keymatchmsg = ''
sim1_keys = set(sim1.keys())
sim2_keys = set(sim2.keys())
if sim1_keys != sim2_keys and not skip_key_diffs: # pragma: no cover
keymatchmsg = "Keys don't match!\n"
missing = list(sim1_keys - sim2_keys)
extra = list(sim2_keys - sim1_keys)
if missing:
keymatchmsg += f' Missing sim1 keys: {missing}\n'
if extra:
keymatchmsg += f' Extra sim2 keys: {extra}\n'
# Compare values
valmatchmsg = ''
mismatches = {}
for key in sim2.keys(): # To ensure order
if key in sim1_keys: # If a key is missing, don't count it as a mismatch
sim1_val = sim1[key] if key in sim1 else 'not present'
sim2_val = sim2[key] if key in sim2 else 'not present'
both_nan = sc.isnumber(sim1_val, isnan=True) and sc.isnumber(sim2_val, isnan=True)
if sim1_val != sim2_val and not both_nan:
mismatches[key] = {'sim1': sim1_val, 'sim2': sim2_val}
if len(mismatches): # pragma: nocover
valmatchmsg = '\nThe following values differ between the two simulations:\n'
df = pd.DataFrame.from_dict(mismatches).transpose()
diff = []
ratio = []
change = []
small_change = 1e-3 # Define a small change, e.g. a rounding error
for mdict in mismatches.values():
old = mdict['sim1']
new = mdict['sim2']
numeric = sc.isnumber(old) and sc.isnumber(new)
if numeric and old>0:
this_diff = new - old
this_ratio = new/old
abs_ratio = max(this_ratio, sc.safedivide(1.0, this_ratio, np.inf))
# Set the character to use
if abs_ratio < 1 + small_change: # Effectively unchanged (within rounding error)
change_char = '≈'
elif new > old:
change_char = '↑'
elif new < old:
change_char = '↓'
else:
errormsg = f'Could not determine relationship between sim1={old} and sim2={new}'
raise ValueError(errormsg)
# Set how many repeats it should have
repeats = 1
if abs_ratio >= 1.1:
repeats = 2
if abs_ratio >= 2:
repeats = 3
if abs_ratio >= 10:
repeats = 4
this_change = change_char*repeats
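# Illustrative example: old=1.0, new=2.5 gives diff=1.5, ratio=2.5, change='↑↑↑'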
else: # pragma: no cover
this_diff = np.nan
this_ratio = np.nan
this_change = 'N/A'
diff.append(this_diff)
ratio.append(this_ratio)
change.append(this_change)
df['diff'] = diff
df['ratio'] = ratio
for col in ['sim1', 'sim2', 'diff', 'ratio']:
df[col] = df[col].round(decimals=3)
df['change'] = change
valmatchmsg += str(df)
# Raise an error if mismatches were found
mismatchmsg = keymatchmsg + valmatchmsg
if mismatchmsg: # pragma: no cover
if die:
raise ValueError(mismatchmsg)
elif output:
return mismatchmsg
else:
print(mismatchmsg)
else:
if not output:
print('Sims match')
return