Source code for fpsim.experiment

'''
Define classes and functions for the Experiment class (running sims and comparing them to data)
'''


import yaml
import numpy as np
import pylab as pl
import pandas as pd
import sciris as sc
from .settings import options as fpo
from . import defaults as fpd
from . import parameters as fpp
from . import sim as fps


__all__ = ['Experiment', 'Fit', 'compute_gof', 'diff_summaries']

# ...more settings
min_age = 15
max_age = 50
bin_size = 5
first_birth_age = 25  # age to start assessing first birth age in model
mpy = 12  # Months per year

# Flags for what to run
default_flags = sc.objdict(
    popsize       = 1, # Population size and growth over time on whole years, adjusted for n number of agents; 'pop_size'
    ageparity   = 1, # Population distribution of agents in each age/parity bin (age-parity plot); 'ageparity'
    first_birth   = 1, # Age at first birth mean with standard deviation; 'age_first_birth'
    birth_space   = 1, # Birth spacing both in bins and mean with standard deviation; 'spacing'
    mcpr          = 1, # Modern contraceptive prevalence; 'mcpr'
    methods       = 1, # Overall percentage of method use and method use among users; 'methods'
    mmr           = 1, # Maternal mortality ratio at end of sim in model vs data; 'maternal_mortality_ratio'
    infant_m      = 1, # Infant mortality rate at end of sim in model vs data; 'infant_mortality_rate'
    cdr           = 1, # Crude death rate at end of sim in model vs data; 'crude_death_rate'
    cbr           = 1, # Crude birth rate (per 1000 inhabitants); 'crude_birth_rate'
    tfr           = 1, # Total fertility rate
    asfr          = 1, # Age-specific fertility rate
)


[docs] class Experiment(sc.prettyobj): ''' Class for running calibration to data. Effectively, it runs a single sim and compares it to data. Args: pars (dict): dictionary of parameters flags (dict): which analyses to run; see ``fp.experiment.default_flags`` for options label (str): label of experiment kwargs (dict): passed into pars ''' def __init__(self, pars=None, flags=None, label=None, **kwargs): self.flags = sc.mergedicts(default_flags, flags, _copy=True) # Set flags for what gets run self.pars = pars if pars else fpp.pars(**kwargs) self.model = sc.objdict() self.data = sc.objdict() self.method_keys = None self.initialized = False self.label = label return
[docs] def load_data(self, key, **kwargs): ''' Load data from various formats ''' files = self.pars['filenames'] path = files['base'] / files[key] if path.suffix == '.obj': data = sc.load(path, **kwargs) elif path.suffix == '.json': data = sc.loadjson(path, **kwargs) elif path.suffix == '.csv': data = pd.read_csv(path, **kwargs) elif path.suffix == '.yaml': with open(path) as f: data = yaml.safe_load(f, **kwargs) else: errormsg = f'Unrecognized file format for: {path}' raise ValueError(errormsg) return data
[docs] def extract_data(self): ''' Load data ''' json = self.load_data('basic_dhs') self.data.update(json) #self.data['pregnancy_parity'] = self.load_data('pregnancy_parity') # Extract population size over time if self.pars: n = self.pars['n_agents'] else: n = 1000 # Use default if not available print(f'Warning: parameters not defined, using default of n={n}') pop_size = self.load_data('popsize') self.data['pop_years'] = pop_size.year.to_numpy() self.data['pop_size'] = pop_size.population.to_numpy() / (pop_size.population[0] / n) # Corrected for # of agents, needs manual adjustment for # agents # Extract population growth rate data_growth_rate = self.pop_growth_rate(self.data['pop_years'], self.data['pop_size']) self.data['pop_growth_rate'] = data_growth_rate # Extract mcpr over time mcpr = self.load_data('mcpr') self.data['mcpr_years'] = mcpr.iloc[:,0].to_numpy() #self.data['cpr'] = mcpr.iloc[:,1].to_numpy() self.data['mcpr'] = mcpr.iloc[:,2].to_numpy() self.initialized = True return
def pop_growth_rate(self, years, population): growth_rate = np.zeros(len(years) - 1) for i in range(len(years)): if population[i] == population[-1]: break growth_rate[i] = ((population[i + 1] - population[i]) / population[i]) * 100 return growth_rate
[docs] def run_model(self, pars=None, **kwargs): ''' Create the sim and run the model ''' if not self.initialized: self.extract_data() if pars is None: pars = self.pars self.sim = fps.Sim(pars=pars, **kwargs) self.sim.run() self.post_process_sim() return
def post_process_sim(self): self.people = self.sim.people # Extract people objects from sim self.model_results = self.sim.results # Stores dictionary of results self.method_keys = list(self.sim['methods']['map'].keys()) return def extract_model(self): if self.flags.popsize: self.model_pop_size() if self.flags.mcpr: self.model_mcpr() if self.flags.mmr: self.model_mmr() if self.flags.infant_m: self.model_infant_mortality_rate() if self.flags.cdr: self.model_crude_death_rate() if self.flags.cbr: self.model_crude_birth_rate() if self.flags.tfr: self.model_data_tfr() if self.flags.asfr: self.model_data_asfr() return def model_pop_size(self): self.model['pop_size'] = self.model_results['pop_size'] self.model['pop_years'] = self.model_results['tfr_years'] model_growth_rate = self.pop_growth_rate(self.model['pop_years'], self.model['pop_size']) self.model['pop_growth_rate'] = model_growth_rate return def model_mcpr(self): model = {'years': self.model_results['t'], 'mcpr': self.model_results['mcpr']} model_frame = pd.DataFrame(model) # Filter to matching years data_years = self.data['mcpr_years'].tolist() filtered_model = model_frame.loc[model_frame.years.isin(data_years)] model_mcpr = filtered_model['mcpr'].to_numpy() mcpr_years = filtered_model['years'].to_numpy() self.model['mcpr'] = model_mcpr*100 # Since data is in 100 self.model['mcpr_years'] = mcpr_years return
[docs] def model_mmr(self): ''' Calculate maternal mortality in model over most recent 3 years ''' maternal_deaths = np.sum(self.model_results['maternal_deaths'][-mpy * 3:]) births_last_3_years = np.sum(self.model_results['births'][-mpy * 3:]) self.model['maternal_mortality_ratio'] = (maternal_deaths / births_last_3_years) * 100000 return
def model_infant_mortality_rate(self): infant_deaths = np.sum(self.model_results['infant_deaths'][-mpy:]) births_last_year = np.sum(self.model_results['births'][-mpy:]) self.model['infant_mortality_rate'] = (infant_deaths / births_last_year) * 1000 return def model_crude_death_rate(self): total_deaths = np.sum(self.model_results['deaths'][-mpy:]) + \ np.sum(self.model_results['infant_deaths'][-mpy:]) + \ np.sum(self.model_results['maternal_deaths'][-mpy:]) self.model['crude_death_rate'] = (total_deaths / self.model_results['pop_size'][-1]) * 1000 return def model_crude_birth_rate(self): births_last_year = np.sum(self.model_results['births'][-mpy:]) self.model['crude_birth_rate'] = (births_last_year / self.model_results['pop_size'][-1]) * 1000 return def model_data_tfr(self): # Extract tfr over time in data - keep here to ignore dhs data if not using tfr for calibration tfr = self.load_data('tfr') # From DHS self.data['tfr_years'] = tfr.iloc[:, 0].to_numpy() self.data['total_fertility_rate'] = tfr.iloc[:, 1].to_numpy() self.model['tfr_years'] = self.model_results['tfr_years'] self.model['total_fertility_rate'] = self.model_results['tfr_rates'] return def model_data_asfr(self, ind=-1): # Extract ASFR for different age bins asfr = self.load_data('asfr') # From DHS age_bins = list(asfr.columns) age_bins.remove('year') self.data['asfr_bins'] = age_bins year_data = asfr[asfr['year'] == self.pars['end_year']] self.data['asfr'] = year_data.drop(['year'], axis=1).values.tolist()[0] # Model extraction age_bins = list(fpd.age_bin_map.keys()) self.model['asfr_bins'] = age_bins self.model['asfr'] = [] for ab in age_bins: val = self.model_results['asfr'][ab][ind] # Only use one index (default: last) CK: TODO: match year automatically self.model['asfr'].append(val) # Check assert self.data['asfr_bins'] == self.model['asfr_bins'], f'ASFR data age bins do not match sim: {sc.strjoin(age_bins)}' return def extract_ageparity(self): # Set up age_keys = list(fpd.age_bin_map.keys())[1:] age_bins = pl.arange(min_age, max_age, bin_size) parity_bins = pl.arange(0, 7) # Plot up to parity 6 n_age = len(age_bins) n_parity = len(parity_bins) # Load data TO NOTE: By default, the dataset that is used for comparison with the model is the last dataset ( # typically the most recent) in the ageparity file sky_raw_data = self.load_data('ageparity') dataset = sky_raw_data.iloc[-1]['dataset'] sky_raw_data = sky_raw_data[sky_raw_data.dataset == dataset] # sky_parity = sky_raw_data[2].to_numpy() # Not used currently sky_parity = sky_raw_data['parity'].to_numpy() sky_props = sky_raw_data['percentage'].to_numpy() sky_arr = sc.odict() sky_arr['Data'] = pl.zeros((len(age_keys), len(parity_bins))) for age, row in sky_raw_data.iterrows(): if row.age in age_keys and row.parity < n_parity: age_ind = age_keys.index(row.age) sky_arr['Data'][age_ind, row.parity] = row.percentage # Extract from model sky_arr['Model'] = pl.zeros((len(age_bins), len(parity_bins))) ppl = self.people for i in range(len(ppl)): if ppl.alive[i] and not ppl.sex[i] and ppl.age[i] >= min_age and ppl.age[i] < max_age: age_bin = sc.findinds(age_bins <= ppl.age[i])[-1] parity_bin = sc.findinds(parity_bins <= ppl.parity[i])[-1] sky_arr['Model'][age_bin, parity_bin] += 1 # Normalize for key in ['Data', 'Model']: sky_arr[key] /= sky_arr[key].sum() / 100 self.data['ageparity'] = sky_arr['Data'] self.model['ageparity'] = sky_arr['Model'] self.age_bins = age_bins self.parity_bins = parity_bins return def extract_birth_spacing(self): # Set up data_afb = self.load_data('afb') data_afb = data_afb.sort_values(by='afb') data_spaces = self.load_data('spacing') data_spaces = data_spaces.sort_values(by='space_mo') spacing_bins = sc.odict({'0-12': 0, '12-24': 1, '24-48': 2, '>48': 4}) # Spacing bins in years model_age_first = [] model_spacing = [] model_spacing_counts = sc.odict().make(keys=spacing_bins.keys(), vals=0.0) data_spacing_counts = sc.odict().make(keys=spacing_bins.keys(), vals=0.0) ppl = self.people # Extract age at first birth and birth spaces from model for i in range(len(ppl)): if ppl.alive[i] and not ppl.sex[i] and min_age <= ppl.age[i] < max_age: if len(ppl.dobs[i]): model_age_first.append(ppl.dobs[i][0]) if len(ppl.dobs[i]) > 1: for d in range(len(ppl.dobs[i]) - 1): space = ppl.dobs[i][d + 1] - ppl.dobs[i][d] ind = sc.findinds(space > spacing_bins[:])[-1] model_spacing_counts[ind] += 1 model_spacing.append(space) # Normalize model birth space bin counts to percentages model_spacing_counts[:] /= model_spacing_counts[:].sum() model_spacing_counts[:] *= 100 # Extract birth spaces and age at first birth from data for i, j in data_spaces.iterrows(): space = j['space_mo'] / mpy ind = sc.findinds(space > spacing_bins[:])[-1] data_spacing_counts[ind] += j['Freq'] # Normalize dat birth space bin counts to percentages data_spacing_counts[:] /= data_spacing_counts[:].sum() data_spacing_counts[:] *= 100 # Extract afb and respective weights data afb_values = data_afb["afb"].values.tolist() afb_weights = data_afb["wt"].values.tolist() # Calculate the cumulative weights and total weight of the afb data afb_cum_weights = np.cumsum(afb_weights) afb_total_weight = afb_cum_weights[-1] # Extract birth spacing and respective frequency data birth_spacing_values = data_spaces["space_mo"].values.tolist() birth_spacing_weights = data_spaces["Freq"].values.tolist() # Calculate the cumulative weights and total weight of the birth spacing data birth_spacing_cum_weights = np.cumsum(birth_spacing_weights) birth_spacing_total_weight = birth_spacing_cum_weights[-1] data_spacing_stats = np.array([np.interp((.25 * afb_total_weight), afb_cum_weights, afb_values), np.interp((.50 * afb_total_weight), afb_cum_weights, afb_values), np.interp((.75 * afb_total_weight), afb_cum_weights, afb_values)]) data_age_first_stats = np.array([np.interp((.25 * birth_spacing_total_weight), birth_spacing_cum_weights, birth_spacing_values), np.interp((.50 * birth_spacing_total_weight), birth_spacing_cum_weights, birth_spacing_values), np.interp((.75 * birth_spacing_total_weight), birth_spacing_cum_weights, birth_spacing_values)]) # Save to dictionary self.data['spacing_bins'] = np.array(data_spacing_counts.values()) self.data['spacing_stats'] = data_spacing_stats self.data['age_first_stats'] = data_age_first_stats try: model_spacing_stats = np.array([np.percentile(model_spacing, 25), np.percentile(model_spacing, 50), np.percentile(model_spacing, 75)]) model_age_first_stats = np.array([np.percentile(model_age_first, 25), np.percentile(model_age_first, 50), np.percentile(model_age_first, 75)]) except Exception as E: # pragma: nocover print(f'Could not calculate birth spacing, returning zeros: {E}') model_spacing_counts = {k: 0 for k in spacing_bins.keys()} model_spacing_stats = np.zeros(data_spacing_stats.shape) model_age_first_stats = np.zeros(data_age_first_stats.shape) # Save arrays to dictionary self.model['spacing_bins'] = np.array(model_spacing_counts.values()) self.model['spacing_stats'] = model_spacing_stats self.model['age_first_stats'] = model_age_first_stats return def extract_methods(self): data_method_counts = sc.odict().make(self.method_keys, vals=0.0) model_method_counts = sc.dcp(data_method_counts) # Extract from data data_methods = self.load_data('methods') for index, row in data_methods.iterrows(): data_method_counts[row['method']] = row['perc'] # Update data method mix using non-user percentage from 'use' file data_use = self.load_data('use') data_method_counts['None'] = data_use.loc[0, 'perc'] use_freq = (data_use.loc[1, 'perc'])/100 for key, value in data_method_counts.items(): value /= 100 if key != 'None': value *= use_freq data_method_counts.update({key: value}) # Extract from model ppl = self.people for i in range(len(ppl)): if ppl.alive[i] and not ppl.sex[i] and ppl.age[i] >= min_age and ppl.age[i] < max_age: model_method_counts[ppl.method[i]] += 1 model_method_counts[:] /= model_method_counts[:].sum() # Make labels data_labels = data_method_counts.keys() for d in range(len(data_labels)): if data_method_counts[d] > 0.01: data_labels[d] = f'{data_labels[d]}: {data_method_counts[d] * 100:0.1f}%' else: data_labels[d] = '' model_labels = model_method_counts.keys() for d in range(len(model_labels)): if model_method_counts[d] > 0.01: model_labels[d] = f'{model_labels[d]}: {model_method_counts[d] * 100:0.1f}%' else: model_labels[d] = '' self.data['method_counts'] = np.array(data_method_counts.values()) self.model['method_counts'] = np.array(model_method_counts.values()) return
[docs] def compute_fit(self, *args, **kwargs): ''' Compute how good the fit is ''' data = sc.dcp(self.data) try: sim = sc.dcp(self.model, die=False) # Sometimes fails with a dict_keys copy error (!) except: sim = {k:self.model[k] for k in data.keys()} for k in data.keys(): data[k] = sc.promotetoarray(data[k]) data[k] = data[k].flatten() sim[k] = sc.promotetoarray(sim[k]) sim[k] = sim[k].flatten() self.fit = Fit(data, sim, *args, **kwargs) pass
[docs] def post_process_results(self, keep_people=False, compute_fit=True, **kwargs): ''' Compare the model and the data ''' self.extract_model() if self.flags.ageparity: self.extract_ageparity() if self.flags.birth_space: self.extract_birth_spacing() if self.flags.methods: self.extract_methods() # Remove people, they're large! if not keep_people: del self.people # Compute comparison self.df = self.compare() # Compute fit if compute_fit: self.compute_fit(**kwargs) return
[docs] def run(self, pars=None, keep_people=False, compute_fit=True, **kwargs): ''' Run the model and post-process the results ''' self.run_model(pars=pars) self.post_process_results(keep_people=keep_people, compute_fit=compute_fit, **kwargs) return self
[docs] def compare(self): ''' Create and print a comparison between model and data ''' # Check that keys match data_keys = self.data.keys() model_keys = self.model.keys() assert set(data_keys) == set(model_keys), 'Data and model keys do not match' # Compare the two comparison = [] for key in data_keys: dv = self.data[key] # dv = "Data value" mv = self.model[key] # mv = "Model value" cmp = sc.objdict(key=key, d_type=type(dv), m_type=type(mv), d_shape=np.shape(dv), m_shape=np.shape(mv), d_val='array', m_val='array') if sc.isnumber(dv): cmp.d_val = dv if sc.isnumber(mv): cmp.m_val = mv comparison.append(cmp) self.comparison_df = pd.DataFrame.from_dict(comparison) return self.comparison_df
[docs] def summarize(self, as_df=False): ''' Convert results to a one-number-per-key summary format. Returns summary, also saves to self.summary. Args: as_df (bool): if True, return a dataframe instead of a dict. ''' summary = sc.objdict() summary.model = sc.objdict() summary.data = sc.objdict() data = self.data model = self.model keys = model.keys() # Compare the two for key in keys: if not (key.endswith('_years') or key.endswith('_bins')): dv = data[key] # dv = "Data value" mv = model[key] # mv = "Model value" if sc.isnumber(mv) and sc.isnumber(dv): summary.data[key] = dv summary.model[key] = mv else: summary.data[key+'_mean'] = np.mean(dv) summary.model[key+'_mean'] = np.mean(mv) self.summary = summary self.summary_df = pd.DataFrame(summary) if as_df: return self.summary.df else: return self.summary
[docs] def to_json(self, filename=None, tostring=False, indent=2, verbose=False, **kwargs): ''' Export results as JSON. Args: filename (str): if None, return string; else, write to file tostring (bool): if not writing to file, whether to write to string (alternative is sanitized dictionary) indent (int): if writing to file, how many indents to use per nested level verbose (bool): detail to print kwargs (dict): passed to savejson() Returns: A unicode string containing a JSON representation of the results, or writes the JSON file to disk **Examples**:: json = exp.to_json() exp.to_json('results.json') ''' d = self.summarize() if filename is None: output = sc.jsonify(d, tostring=tostring, indent=indent, verbose=verbose, **kwargs) else: output = sc.savejson(filename=filename, obj=d, indent=indent, **kwargs) return output
[docs] def plot(self, do_show=None, do_save=None, filename='fp_experiment.png', axis_args=None, do_maximize=True): ''' Plot the model against the data ''' data = self.data sim = self.model # Set up keys structure and remove non-plotted keys keys = ['rates'] + list(data.keys()) rate_keys = ['maternal_mortality_ratio', 'infant_mortality_rate', 'crude_death_rate', 'crude_birth_rate'] non_calibrated_keys = ['pop_years', 'mcpr_years', 'tfr_years', 'asfr_bins'] for key in rate_keys + non_calibrated_keys: if key in keys: keys.remove(key) nkeys = len(keys) expected = 11 if nkeys != expected: errormsg = f'Number of keys changed -- expected {expected}, actually {nkeys} -- did you use run_model() instead of run()?' raise ValueError(errormsg) with fpo.with_style(): fig, axs = pl.subplots(nrows=4, ncols=3) pl.subplots_adjust(**sc.mergedicts(dict(bottom=0.05, top=0.97, left=0.05, right=0.97, wspace=0.3, hspace=0.3), axis_args)) #%% Do the plotting! # Rates ax = axs[0,0] height = 0.4 n_rates = len(rate_keys) y = np.arange(n_rates) data_rates = np.array([data[k] for k in rate_keys]) sim_rates = np.array([sim[k] for k in rate_keys]) ax.barh(y=y+height/2, width=data_rates, height=height, align='center', label='Data') ax.barh(y=y-height/2, width=sim_rates, height=height, align='center', label='Sim') ax.set_title('Rates') ax.set_xlabel('Rate') ax.set_yticks(range(n_rates)) ax.set_yticklabels(rate_keys) ax.legend() # Population size ax = axs[1,0] ax.plot(data.pop_years, data.pop_size, 'o', label='Data') ax.plot(sim.pop_years, sim.pop_size, '-', label='Sim') ax.set_title('Population size') ax.set_xlabel('Year') ax.set_ylabel('Population size') ax.legend() # Population growth rate ax = axs[2,0] ax.plot(data.pop_years[:-1], data.pop_growth_rate, 'o', label='Data') ax.plot(sim.pop_years[:-1], sim.pop_growth_rate, '-', label='Sim') ax.set_title('Population growth rate') ax.set_xlabel('Year') ax.set_ylabel('Population growth rate') ax.legend() # MCPR ax = axs[3,0] ax.plot(data.mcpr_years, data.mcpr, 'o', label='Data') ax.plot(sim.mcpr_years, sim.mcpr, '-', label='Sim') ax.set_title('MCPR') ax.set_xlabel('Year') ax.set_ylabel('Modern contraceptive prevalence rate') ax.legend() # Data age-parity ax = axs[0,1] ax.pcolormesh(self.age_bins, self.parity_bins, data.ageparity.transpose(), shading='nearest', cmap='turbo') ax.set_aspect(1./ax.get_data_ratio()) # Make square ax.set_title('Age-parity plot: data') ax.set_xlabel('Age') ax.set_ylabel('Parity') # Sim age-parity ax = axs[1,1] ax.pcolormesh(self.age_bins, self.parity_bins, sim.ageparity.transpose(), shading='nearest', cmap='turbo') ax.set_aspect(1./ax.get_data_ratio()) ax.set_title('Age-parity plot: sim') ax.set_xlabel('Age') ax.set_ylabel('Parity') # Spacing bins ax = axs[2, 1] height = 0.4 spacing_bins = sc.odict({'0-12': 0, '12-24': 1, '24-48': 2, '>48': 4}) # Spacing bins in years n_bins = len(spacing_bins.keys()) y = np.arange(len(data.spacing_bins)) ax.barh(y=y+height/2, width=data.spacing_bins, height=height, align='center', label='Data') ax.barh(y=y-height/2, width=sim.spacing_bins, height=height, align='center', label='Sim') ax.set_title('Birth spacing bins') ax.set_xlabel('Percent of births in each bin') ax.set_yticks(range(n_bins)) ax.set_yticklabels(spacing_bins.keys()) ax.set_ylabel('Birth space in months') ax.legend() # Age first stats quartile_keys = ['25th %', 'Median', '75th %'] n_quartiles = len(quartile_keys) ax = axs[3,1] height = 0.4 y = np.arange(len(data.age_first_stats)) ax.barh(y=y+height/2, width=data.age_first_stats, height=height, align='center', label='Data') ax.barh(y=y-height/2, width=sim.age_first_stats, height=height, align='center', label='Sim') ax.set_title('Age at first birth') ax.set_xlabel('Age') ax.set_yticks(range(n_quartiles)) ax.set_yticklabels(quartile_keys) ax.legend() # Method counts ax = axs[2,2] height = 0.4 y = np.arange(len(data.method_counts)) y1 = y + height/2 y2 = y - height/2 ax.barh(y=y1, width=data.method_counts, height=height, align='center', label='Data') ax.barh(y=y2, width=sim.method_counts, height=height, align='center', label='Sim') ax.set_yticks(y, self.method_keys) ax.set_title('Method counts') ax.set_ylabel('Contraceptive method') ax.set_xlabel('Rate of use') ax.legend() # ASFR ax = axs[3,2] y = np.arange(len(data.asfr)) y1 = y + height/2 y2 = y - height/2 ax.barh(y=y1, width=data.asfr, height=height, align='center', label='Data') ax.barh(y=y2, width=sim.asfr, height=height, align='center', label='Sim') ax.set_yticks(y, sim.asfr_bins) ax.set_title('Age-specific fertility rate') ax.set_ylabel('Age bin') ax.set_xlabel('Fertility rate') ax.legend() # Tidy up if do_maximize: sc.maximize(fig=fig) return fps.tidy_up(fig=fig, do_show=do_show, do_save=do_save, filename=filename)
[docs] class Fit(sc.prettyobj): ''' A class for calculating the fit between the model and the data. Note the following terminology is used here: - fit: nonspecific term for how well the model matches the data - difference: the absolute numerical differences between the model and the data (one time series per result) - goodness-of-fit: the result of passing the difference through a statistical function, such as mean squared error - loss: the goodness-of-fit for each result multiplied by user-specified weights (one time series per result) - mismatches: the sum of all the losses (a single scalar value per time series) - mismatch: the sum of the mismatches -- this is the value to be minimized during calibration Args: sim (Sim): the sim object weights (dict): the relative weight to place on each result (by default: 10 for deaths, 5 for diagnoses, 1 for everything else) keys (list): the keys to use in the calculation custom (dict): a custom dictionary of additional data to fit; format is e.g. {'my_output':{'data':[1,2,3], 'sim':[1,2,4], 'weights':2.0}} compute (bool): whether to compute the mismatch immediately verbose (bool): detail to print kwargs (dict): passed to cv.compute_gof() -- see this function for more detail on goodness-of-fit calculation options **Example**:: sim = cv.Sim() sim.run() fit = sim.compute_fit() fit.plot() ''' def __init__(self, data, sim, weights=None, keys=None, custom=None, compute=True, verbose=False, **kwargs): # Handle inputs self.custom = sc.mergedicts(custom) self.verbose = verbose self.weights = sc.mergedicts(weights) self.gof_kwargs = kwargs # Copy data self.data = data self.sim_results = sim # Remove keys that aren't for fitting for key in self.data.keys(): if key.endswith('_years') or key.endswith('_bins'): self.data.pop(key) self.sim_results.pop(key) self.keys = data.keys() # These are populated during initialization self.inds = sc.objdict() # To store matching indices between the data and the simulation self.inds.sim = sc.objdict() # For storing matching indices in the sim self.inds.data = sc.objdict() # For storing matching indices in the data self.pair = sc.objdict() # For storing perfectly paired points between the data and the sim self.diffs = sc.objdict() # Differences between pairs self.gofs = sc.objdict() # Goodness-of-fit for differences self.losses = sc.objdict() # Weighted goodness-of-fit self.mismatches = sc.objdict() # Final mismatch values self.mismatch = None # The final value if compute: self.compute() return
[docs] def compute(self): ''' Perform all required computations ''' self.reconcile_inputs() # Find matching values self.compute_diffs() # Perform calculations self.compute_gofs() self.compute_losses() self.compute_mismatch() return self.mismatch
[docs] def reconcile_inputs(self, verbose=False): ''' Find matching keys and indices between the model and the data ''' data_cols = set(self.data.keys()) if self.keys is None: # pragma: nocover sim_keys = self.sim_results.keys() intersection = list(set(sim_keys).intersection(data_cols)) # Find keys in both the sim and data self.keys = [key for key in sim_keys if key in intersection and key.startswith('cum_')] # Only keep cumulative keys if not len(self.keys): errormsg = f'No matches found between simulation result keys ({sim_keys}) and data columns ({data_cols})' raise sc.KeyNotFoundError(errormsg) mismatches = [key for key in self.keys if key not in data_cols] if len(mismatches): # pragma: nocover mismatchstr = ', '.join(mismatches) errormsg = f'The following requested key(s) were not found in the data: {mismatchstr}' raise sc.KeyNotFoundError(errormsg) for key in self.keys: # For keys present in both the results and in the data self.inds.sim[key] = [] self.inds.data[key] = [] count = -1 for d, datum in enumerate(self.data[key]): count += 1 if np.isfinite(datum): # TODO: match dates for time series data self.inds.sim[key].append(count) self.inds.data[key].append(count) self.inds.sim[key] = np.array(self.inds.sim[key]) self.inds.data[key] = np.array(self.inds.data[key]) # Convert into paired points for key in self.keys: self.pair[key] = sc.objdict() sim_inds = self.inds.sim[key] data_inds = self.inds.data[key] n_inds = len(sim_inds) self.pair[key].sim = np.zeros(n_inds) self.pair[key].data = np.zeros(n_inds) for i in range(n_inds): try: self.pair[key].sim[i] = self.sim_results[key][sim_inds[i]] self.pair[key].data[i] = self.data[key][data_inds[i]] except Exception: if verbose: print('WARNING: exception at', key, i, len(sim_inds), len(self.pair[key].sim), len(self.sim_results[key])) # Process custom inputs self.custom_keys = list(self.custom.keys()) for key in self.custom.keys(): # pragma: nocover # Initialize and do error checking custom = self.custom[key] c_keys = list(custom.keys()) if 'sim' not in c_keys or 'data' not in c_keys: errormsg = f'Custom input must have "sim" and "data" keys, not {c_keys}' raise sc.KeyNotFoundError(errormsg) c_data = custom['data'] c_sim = custom['sim'] try: assert len(c_data) == len(c_sim) except: errormsg = f'Custom data and sim must be arrays, and be of the same length: data = {c_data}, sim = {c_sim} could not be processed' raise ValueError(errormsg) if key in self.pair: errormsg = f'You cannot use a custom key "{key}" that matches one of the existing keys: {self.pair.keys()}' raise ValueError(errormsg) # If all tests pass, simply copy the data self.pair[key] = sc.objdict() self.pair[key].sim = c_sim self.pair[key].data = c_data # Process weight, if available wt = custom.get('weight', 1.0) # Attempt to retrieve key 'weight', or use the default if not provided wt = custom.get('weights', wt) # ...but also try "weights" self.weights[key] = wt # Set the weight return
[docs] def compute_diffs(self, absolute=False): ''' Find the differences between the sim and the data ''' for key in self.pair.keys(): self.diffs[key] = self.pair[key].sim - self.pair[key].data if absolute: self.diffs[key] = np.abs(self.diffs[key]) return
[docs] def compute_gofs(self, **kwargs): ''' Compute the goodness-of-fit ''' kwargs = sc.mergedicts(self.gof_kwargs, kwargs) for key in self.pair.keys(): actual = sc.dcp(self.pair[key].data) predicted = sc.dcp(self.pair[key].sim) self.gofs[key] = compute_gof(actual, predicted, **kwargs) return
[docs] def compute_losses(self): ''' Compute the weighted goodness-of-fit ''' for key in self.gofs.keys(): if key in self.weights: weight = self.weights[key] if sc.isiterable(weight): # It's an array len_wt = len(weight) len_sim = self.sim_npts len_match = len(self.gofs[key]) if len_wt == len_match: # If the weight already is the right length, do nothing pass elif len_wt == len_sim: # Most typical case: it's the length of the simulation, must trim weight = weight[self.inds.sim[key]] # Trim to matching indices else: errormsg = f'Could not map weight array of length {len_wt} onto simulation of length {len_sim} or data-model matches of length {len_match}' raise ValueError(errormsg) else: weight = 1.0 self.losses[key] = self.gofs[key]*weight return
[docs] def compute_mismatch(self, use_median=False): ''' Compute the final mismatch ''' for key in self.losses.keys(): if use_median: self.mismatches[key] = np.median(self.losses[key]) else: self.mismatches[key] = np.sum(self.losses[key]) self.mismatch = self.mismatches[:].sum() return self.mismatch
[docs] def plot(self, keys=None, width=0.8, font_size=18, fig_args=None, axis_args=None, plot_args=None, do_show=True): ''' Plot the fit of the model to the data. For each result, plot the data and the model; the difference; and the loss (weighted difference). Also plots the loss as a function of time. Args: keys (list): which keys to plot (default, all) width (float): bar width font_size (float): size of font fig_args (dict): passed to pl.figure() axis_args (dict): passed to pl.subplots_adjust() plot_args (dict): passed to pl.plot() do_show (bool): whether to show the plot ''' fig_args = sc.mergedicts(dict(figsize=(36,22)), fig_args) axis_args = sc.mergedicts(dict(left=0.05, right=0.95, bottom=0.05, top=0.95, wspace=0.3, hspace=0.3), axis_args) plot_args = sc.mergedicts(dict(lw=4, alpha=0.5, marker='o'), plot_args) pl.rcParams['font.size'] = font_size if keys is None: keys = self.keys + self.custom_keys n_keys = len(keys) loss_ax = None colors = sc.gridcolors(n_keys) n_rows = 3 fig = pl.figure(**fig_args) pl.subplots_adjust(**axis_args) for k,key in enumerate(keys): if key in self.keys: # It's a time series, plot with days and dates days = self.inds.sim[key] # The "days" axis (or not, for custom keys) daylabel = 'Timestep' else: #It's custom, we don't know what it is days = np.arange(len(self.losses[key])) # Just use indices daylabel = 'Index' pl.subplot(n_rows, n_keys, k+0*n_keys+1) pl.plot(days, self.pair[key].data, c='k', label='Data', **plot_args) pl.plot(days, self.pair[key].sim, c=colors[k], label='Simulation', **plot_args) pl.title(key) if k == 0: pl.ylabel('Time series (counts)') pl.legend() pl.subplot(n_rows, n_keys, k+1*n_keys+1) pl.bar(days, self.diffs[key], width=width, color=colors[k], label='Difference') pl.axhline(0, c='k') if k == 0: pl.ylabel('Differences (counts)') pl.legend() loss_ax = pl.subplot(n_rows, n_keys, k+2*n_keys+1, sharey=loss_ax) pl.bar(days, self.losses[key], width=width, color=colors[k], label='Losses') pl.xlabel(daylabel) pl.title(f'Total loss: {self.losses[key].sum():0.3f}') if k == 0: pl.ylabel('Losses') pl.legend() if do_show: pl.show() return fig
[docs] def compute_gof(actual, predicted, normalize=True, use_frac=False, use_squared=False, as_scalar='none', eps=1e-9, skestimator=None, **kwargs): ''' Calculate the goodness of fit. By default use normalized absolute error, but highly customizable. For example, mean squared error is equivalent to setting normalize=False, use_squared=True, as_scalar='mean'. Args: actual (arr): array of actual (data) points predicted (arr): corresponding array of predicted (model) points normalize (bool): whether to divide the values by the largest value in either series use_frac (bool): convert to fractional mismatches rather than absolute use_squared (bool): square the mismatches as_scalar (str): return as a scalar instead of a time series: choices are sum, mean, median eps (float): to avoid divide-by-zero skestimator (str): if provided, use this scikit-learn estimator instead kwargs (dict): passed to the scikit-learn estimator Returns: gofs (arr): array of goodness-of-fit values, or a single value if as_scalar is True **Examples**:: x1 = np.cumsum(np.random.random(100)) x2 = np.cumsum(np.random.random(100)) e1 = compute_gof(x1, x2) # Default, normalized absolute error e2 = compute_gof(x1, x2, normalize=False, use_frac=False) # Fractional error e3 = compute_gof(x1, x2, normalize=False, use_squared=True, as_scalar='mean') # Mean squared error e4 = compute_gof(x1, x2, skestimator='mean_squared_error') # Scikit-learn's MSE method e5 = compute_gof(x1, x2, as_scalar='median') # Normalized median absolute error -- highly robust ''' # Handle inputs actual = np.array(sc.dcp(actual), dtype=float) predicted = np.array(sc.dcp(predicted), dtype=float) # Custom estimator is supplied: use that if skestimator is not None: # pragma: nocover try: import sklearn.metrics as sm sklearn_gof = getattr(sm, skestimator) # Shortcut to e.g. sklearn.metrics.max_error except ImportError as E: raise ImportError(f'You must have scikit-learn >=0.22.2 installed: {str(E)}') except AttributeError: raise AttributeError(f'Estimator {skestimator} is not available; see https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter for options') gof = sklearn_gof(actual, predicted, **kwargs) return gof # Default case: calculate it manually else: # Key step -- calculate the mismatch! gofs = abs(np.array(actual) - np.array(predicted)) if normalize and not use_frac: actual_max = abs(actual).max() if actual_max>0: gofs /= actual_max if use_frac: if (actual<0).any() or (predicted<0).any(): print('Warning: Calculating fractional errors for non-positive quantities is ill-advised!') else: maxvals = np.maximum(actual, predicted) + eps gofs /= maxvals if use_squared: gofs = gofs**2 if as_scalar == 'sum': gofs = np.sum(gofs) elif as_scalar == 'mean': gofs = np.mean(gofs) elif as_scalar == 'median': gofs = np.median(gofs) return gofs
[docs] def diff_summaries(sim1, sim2, skip_key_diffs=False, output=False, die=False): ''' Compute the difference of the summaries of two FPsim calibration objects, and print any values which differ. Args: sim1 (sim/dict): the calib.summary dictionary, representing a single sim sim2 (sim/dict): ditto skip_key_diffs (bool): whether to skip keys that don't match between sims output (bool): whether to return the output as a string (otherwise print) die (bool): whether to raise an exception if the sims don't match require_run (bool): require that the simulations have been run **Example**:: c1 = fp.Calibration() c2 = fp.Calibration() c1.run() c2.run() fp.diff_summaries(c1.summarize(), c2.summarize()) ''' for sim in [sim1, sim2]: if not isinstance(sim, dict): # pragma: no cover errormsg = f'Cannot compare object of type {type(sim)}, must be a FPsim calib.summary dict' raise TypeError(errormsg) # Ignore data for now sim1 = sim1['model'] sim2 = sim2['model'] # Compare keys keymatchmsg = '' sim1_keys = set(sim1.keys()) sim2_keys = set(sim2.keys()) #if sim1_keys != _keys and not skip_key_diffs: # pragma: no cover #keymatchmsg = "Keys don't match!\n" # missing = list(sim1_keys - sim2_keys) #extra = list(sim2_keys - sim1_keys) #if missing: #keymatchmsg += f' Missing sim1 keys: {missing}\n' #if extra: #keymatchmsg += f' Extra sim2 keys: {extra}\n' # Compare values valmatchmsg = '' mismatches = {} for key in sim2.keys(): # To ensure order if key in sim1_keys: # If a key is missing, don't count it as a mismatch sim1_val = sim1[key] if key in sim1 else 'not present' sim2_val = sim2[key] if key in sim2 else 'not present' both_nan = sc.isnumber(sim1_val, isnan=True) and sc.isnumber(sim2_val, isnan=True) if sim1_val != sim2_val and not both_nan: mismatches[key] = {'sim1': sim1_val, 'sim2': sim2_val} if len(mismatches): # pragma: nocover valmatchmsg = '\nThe following values differ between the two simulations:\n' df = pd.DataFrame.from_dict(mismatches).transpose() diff = [] ratio = [] change = [] small_change = 1e-3 # Define a small change, e.g. a rounding error for mdict in mismatches.values(): old = mdict['sim1'] new = mdict['sim2'] numeric = sc.isnumber(sim1_val) and sc.isnumber(sim2_val) if numeric and old>0: this_diff = new - old this_ratio = new/old abs_ratio = max(this_ratio, sc.safedivide(1.0, this_ratio, np.inf)) # Set the character to use if abs_ratio<small_change: change_char = '≈' elif new > old: change_char = '↑' elif new < old: change_char = '↓' else: errormsg = f'Could not determine relationship between sim1={old} and sim2={new}' raise ValueError(errormsg) # Set how many repeats it should have repeats = 1 if abs_ratio >= 1.1: repeats = 2 if abs_ratio >= 2: repeats = 3 if abs_ratio >= 10: repeats = 4 this_change = change_char*repeats else: # pragma: no cover this_diff = np.nan this_ratio = np.nan this_change = 'N/A' diff.append(this_diff) ratio.append(this_ratio) change.append(this_change) df['diff'] = diff df['ratio'] = ratio for col in ['sim1', 'sim2', 'diff', 'ratio']: df[col] = df[col].round(decimals=3) df['change'] = change valmatchmsg += str(df) # Raise an error if mismatches were found mismatchmsg = keymatchmsg + valmatchmsg if mismatchmsg: # pragma: no cover if die: raise ValueError(mismatchmsg) elif output: return mismatchmsg else: print(mismatchmsg) else: if not output: print('Sims match') return