Source code for hpvsim.population

'''
Defines functions for making the population.
'''

#%% Imports
import numpy as np
import sciris as sc
from . import utils as hpu
from . import misc as hpm
from . import data as hpdata
from . import defaults as hpd
from . import people as hpppl


# Specify all externally visible functions this file defines
__all__ = ['make_people', 'make_contacts']



[docs]
def make_people(sim, popdict=None, reset=False, verbose=None, use_age_data=True,
                sex_ratio=0.5, dt_round_age=True, microstructure=None,
                age_data=None, pop_trend=None, pop_age_trend=None, **kwargs):
    '''
    Make the people for the simulation.

    Usually called via :py:func:`hpvsim.sim.Sim.initialize`.

    Args:
        sim      (Sim)  : the simulation object; population parameters are taken from the sim object
        popdict  (any)  : if supplied, use this population dictionary instead of generating a new one; can be a dict or People object
        reset    (bool) : whether to force population creation even if self.popdict/self.people exists
        verbose  (bool) : level of detail to print
        use_age_data (bool):
        sex_ratio (bool):
        dt_round_age (bool): whether to round people's ages to the nearest timestep (default true)

    Returns:
        people (People): people
    '''

    # Set inputs and defaults
    n_agents = int(sim['n_agents']) # Shorten
    total_pop = None # Optionally created but always returned
    pop_trend = None # Populated later if location is specified
    pop_age_trend = None

    if verbose is None:
        verbose = sim['verbose']
    dt = sim['dt'] # Timestep

    # If a people object or popdict is supplied, use it
    if sim.people and not reset:
        sim.people.initialize(sim_pars=sim.pars)
        return sim.people, total_pop # If it's already there, just return
    elif sim.popdict and popdict is None:
        popdict = sim.popdict # Use stored one
        sim.popdict = None # Once loaded, remove

    if popdict is None:

        n_agents = int(sim['n_agents']) # Number of people
        total_pop = None

        # Handle data on population size by age & over time.
        # If a country location is supplied, this will be loaded in automatically from UN projections.
        # If a custom location is supplied, these data should be provided in datafiles read in by the pars.
        # Other demographic data like mortality and fertility are also available by
        # country, but these are loaded directly into the sim since they are not
        # stored as part of the people.
        location = sim['location']
        if sim['verbose']:
            print(f'Loading location-specific data for "{location}"')
        if use_age_data:
            try:
                age_data = hpdata.get_age_distribution(location, year=sim['start'], age_datafile=sim['age_datafile'])
                pop_trend = hpdata.get_total_pop(location, pop_datafile=sim['pop_datafile'])
                pop_age_trend = hpdata.get_age_distribution_over_time(location, popage_datafile=sim['popage_datafile'])
                total_pop = sum(age_data[:, 2])  # Return the total population
            except ValueError as E:
                warnmsg = f'Could not load age data for requested location "{location}" ({str(E)})'
                hpm.warn(warnmsg, die=True)

        # Set ages, rounding to nearest timestep if requested
        age_data_min   = age_data[:,0]
        age_data_max   = age_data[:,1]
        age_data_range = age_data_max - age_data_min
        age_data_prob   = age_data[:,2]
        age_data_prob   /= age_data_prob.sum() # Ensure it sums to 1
        age_bins        = hpu.n_multinomial(age_data_prob, n_agents) # Choose age bins

        if dt_round_age:
            ages = age_data_min[age_bins] + np.random.randint(age_data_range[age_bins]/dt)*dt # Uniformly distribute within this age bin
        else:
            ages = age_data_min[age_bins] + age_data_range[age_bins]*np.random.random(n_agents) # Uniformly distribute within this age bin

        uids, sexes, debuts, rel_sev, partners, cluster = set_static(n_agents, pars=sim.pars, sex_ratio=sim.pars['sex_ratio'])

        # Store output
        popdict = {}
        popdict['uid'] = uids
        popdict['age'] = ages
        popdict['sex'] = sexes
        popdict['debut'] = debuts
        popdict['rel_sev'] = rel_sev
        popdict['partners'] = partners
        popdict['cluster'] = cluster

        is_active = ages > debuts
        is_female = sexes == 0

        # Create the contacts
        lkeys = sim['acts'].keys() # TODO: consider a more robust way to do this
        if microstructure in ['random', 'default']:
            contacts = dict()
            current_partners = np.zeros((len(lkeys),n_agents))
            lno=0
            for lkey in lkeys:
                contacts[lkey], current_partners,_,_ = make_contacts(
                    lno=lno, tind=0, partners=partners[lno,:], current_partners=current_partners, ages=ages,
                    debuts=debuts, is_female=is_female, is_active=is_active, mixing=sim['mixing'][lkey],
                    layer_probs=sim['layer_probs'][lkey], f_cross_layer=sim['f_cross_layer'], m_cross_layer=sim['m_cross_layer'],
                    durations=sim['dur_pship'][lkey], acts=sim['acts'][lkey], age_act_pars=sim['age_act_pars'][lkey],
                    cluster=cluster, add_mixing=sim['add_mixing'], **kwargs
                )
                lno += 1

        else:
            errormsg = f'Microstructure type "{microstructure}" not found; choices are random or TBC'
            raise NotImplementedError(errormsg)

        popdict['contacts'] = contacts
        popdict['current_partners'] = current_partners

    else:
        ages = popdict['age']

    # Do minimal validation and create the people
    validate_popdict(popdict, sim.pars, verbose=verbose)
    people = hpppl.People(sim.pars, pop_trend=pop_trend, pop_age_trend=pop_age_trend, **popdict) # List for storing the people

    sc.printv(f'Created {n_agents} agents, average age {ages.mean():0.2f} years', 2, verbose)

    return people, total_pop



def partner_count(n_agents=None, partner_pars=None):
    '''
    Assign each person a preferred number of concurrent partners for each layer

    Args:
        n_agents    (int)   : number of agents
        layer_keys  (list)  : list of layers
        means       (dict)  : dictionary keyed by layer_keys with mean number of partners per layer
        sample      (bool)  : whether or not to sample the number of partners

    Returns:
        p_count (dict): the number of partners per person per layer
    '''

    # Initialize output
    partners = []

    # Set the number of partners
    for lkey,ppars in partner_pars.items():
        p_count = hpu.sample(**ppars, size=n_agents)
        partners.append(p_count)


    return np.array(partners)


def set_static(new_n, existing_n=0, pars=None, sex_ratio=0.5):
    '''
    Set static population characteristics that do not change over time.
    Can be used when adding new births, in which case the existing popsize can be given.
    '''
    uid                 = np.arange(existing_n, existing_n+new_n, dtype=hpd.default_int)
    sex                 = np.random.binomial(1, sex_ratio, new_n)
    debut               = np.full(new_n, np.nan, dtype=hpd.default_float)
    debut[sex==1]       = hpu.sample(**pars['debut']['m'], size=sum(sex))
    debut[sex==0]       = hpu.sample(**pars['debut']['f'], size=new_n-sum(sex))
    n_layers            = len(pars['m_partners'].keys())
    partners            = np.full((n_layers, new_n), 0, dtype=hpd.default_int)
    partners[:, sex==1] = partner_count(n_agents=sum(sex), partner_pars=pars['m_partners'])
    partners[:, sex==0] = partner_count(n_agents=new_n-sum(sex), partner_pars=pars['f_partners'])
    cluster             = hpu.n_multinomial(pars['cluster_rel_sizes'], new_n).astype(int)
    rel_sev             = hpu.sample(**pars['sev_dist'], size=new_n) # Draw individual relative susceptibility factors

    return uid, sex, debut, rel_sev, partners, cluster


def validate_popdict(popdict, pars, verbose=True):
    '''
    Check that the popdict is the correct type, has the correct keys, and has
    the correct length
    '''

    # Check it's the right type
    try:
        popdict.keys() # Although not used directly, this is used in the error message below, and is a good proxy for a dict-like object
    except Exception as E:
        errormsg = f'The popdict should be a dictionary or hpv.People object, but instead is {type(popdict)}'
        raise TypeError(errormsg) from E

    # Check keys and lengths
    required_keys = ['uid', 'age', 'sex', 'debut']
    popdict_keys = popdict.keys()
    n_agents = pars['n_agents']
    for key in required_keys:

        if key not in popdict_keys:
            errormsg = f'Could not find required key "{key}" in popdict; available keys are: {sc.strjoin(popdict.keys())}'
            sc.KeyNotFoundError(errormsg)

        actual_size = len(popdict[key])
        if actual_size != n_agents:
            errormsg = f'Could not use supplied popdict since key {key} has length {actual_size}, but all keys must have length {n_agents}'
            raise ValueError(errormsg)

        isnan = np.isnan(popdict[key]).sum()
        if isnan:
            errormsg = f'Population not fully created: {isnan:,} NaNs found in {key}.'
            raise ValueError(errormsg)

    return


def _tidy_edgelist(f, m, mapping=None):
    ''' Helper function to convert lists to arrays and optionally map arrays '''
    if mapping is not None:
        mapping = np.array(mapping, dtype=hpd.default_int)
        m = mapping[m]
        f = mapping[f]
    output = dict(f=f, m=m)
    return output


def age_scale_acts(acts=None, age_act_pars=None, age_f=None, age_m=None, debut_f=None, debut_m=None):
    ''' Scale the number of acts for each relationship according to the age of the partners '''

    # For each couple, get the average age they are now and the average age of debut
    avg_age     = np.array([age_f, age_m]).mean(axis=0)
    avg_debut   = np.array([debut_f, debut_m]).mean(axis=0)

    # Shorten parameter names
    dr = age_act_pars['debut_ratio']
    peak = age_act_pars['peak']
    rr = age_act_pars['retirement_ratio']
    retire = age_act_pars['retirement']

    # Get indices of people at different stages
    below_peak_inds = avg_age <=  age_act_pars['peak']
    above_peak_inds = (avg_age >  age_act_pars['peak']) & (avg_age <  age_act_pars['retirement'])
    retired_inds    = avg_age >  age_act_pars['retirement']

    # Set values by linearly scaling the number of acts for each partnership according to
    # the age of the couple at the commencement of the relationship
    below_peak_vals = acts[below_peak_inds]* (dr + (1-dr)/(peak - avg_debut[below_peak_inds]) * (avg_age[below_peak_inds] - avg_debut[below_peak_inds]))
    above_peak_vals = acts[above_peak_inds]* (rr + (1-rr)/(peak - retire)                     * (avg_age[above_peak_inds] - retire))
    retired_vals = 0

    # Set values and return
    scaled_acts = np.full(len(acts), np.nan, dtype=hpd.default_float)
    scaled_acts[below_peak_inds] = below_peak_vals
    scaled_acts[above_peak_inds] = above_peak_vals
    scaled_acts[retired_inds] = retired_vals

    return scaled_acts


def create_edgelist(tind, lno, partners, current_partners, mixing, age, is_active, is_female,
                        layer_probs, f_cross_layer, m_cross_layer, cluster, add_mixing):
    '''
    Create partnerships for a single layer. Eligible females and males are first identified if they are sexually active,
    underpartnered, and is available to partner for the specified type of relationship given their concurrency preference
    and whether they already have partners in other relationship layers. Concurrency preference is randomly assigned.
    Eligible individuals are randomly then chosen based on age- and sex-specific participation rates. The pairing
    algorithm iterates through each cluster and age bin of females, calculating relative preference of all participating
    males by multiplying the cluster- and age-specific mixing weights. Male partners are then randomly drawn based on
    these relative mixing weights. If there are not enough male partners to satiate all females in a cluster and age bin,
    females are randomly dropped with no new partner in this layer and time step.

    Args:
        lno                 (int): index of network layer
        partners            (int arr): array containing each agent's desired number of partners in this layer
        current_partners    (int arr): array containing each agent's actual current number of partners in this layer
        mixing              (float arr): age mixing matrix
        age                 (float arr): age
        is_active           (bool arr): whether or not people are sexually active
        is_female           (bool arr): whether each person is female
        layer_probs         (float arr): participation rates in this layer by age and sex
        cross_layer         (float): proportion of agents that have cross-layer relationships
        cluster             (int arr): array containing each agent's cluster id
        add_mixing          (float arr): additional mixing matrix
    '''

    # Useful variables
    n_agents        = len(age)
    n_layers        = current_partners.shape[0]
    f_active        =  is_female & is_active
    m_active        = ~is_female & is_active
    underpartnered  = current_partners[lno, :] < partners  # Indices of underpartnered people

    # Figure out how many new relationships to create by calculating the number of agents
    # who are underpartnered in this layer and either unpartnered in other layers or available
    # for cross-layer participation

    other_layers            = np.delete(np.arange(n_layers), lno)  # Indices of all other layers but this one
    other_partners          = current_partners[other_layers, :].any(axis=0)  # Whether or not people already partnered in other layers
    f_with_other_partners   = hpu.true(other_partners & is_female) # Indices of sexually active females with partners in other layers
    m_with_other_partners   = hpu.true(other_partners & ~is_female) # Indices of sexually active males with partners in other layers
    f_cross_inds            = hpu.binomial_filter(f_cross_layer, f_with_other_partners) # Indices who have cross-layer relationships
    m_cross_inds            = hpu.binomial_filter(m_cross_layer, m_with_other_partners) # Indices who have cross-layer relationships
    cross_layer_bools       = np.full(n_agents, False, dtype=bool) # Construct a boolean array indicating whether people have cross-layer relationships
    cross_layer_bools[f_cross_inds]  = True
    cross_layer_bools[m_cross_inds]  = True
    f_eligible              = f_active & underpartnered & (~other_partners | cross_layer_bools)
    m_eligible              = m_active & underpartnered & (~other_partners | cross_layer_bools)

    # Bin the females by age
    bins = layer_probs[0, :]  # Extract age bins
    cluster_range = np.unique(cluster)
    # Initialize
    f = np.array([], dtype=int)
    m = np.array([], dtype=int)
    # Filter males by participation rates
    m_eligible_inds = hpu.true(m_eligible)  # Inds of all eligible males across clusters
    m_participants = hpu.participation_filter(m_eligible_inds, age, layer_probs[2, :], bins=layer_probs[0, :])
    age_bins_m = np.digitize(age[m_participants], bins=bins) - 1  # Age bins of participating males

    # shuffle clusters
    np.random.shuffle(cluster_range)
    m_probs = np.ones(n_agents)  # Begin by assigning everyone equal probability of forming a new relationship

    for cl in cluster_range: # Loop through clusters for females
        # Try randomly select females for pairing
        f_eligible_inds = hpu.true(f_eligible * (cluster==cl))  # Inds of all eligible females in this cluster
        f_cl = hpu.participation_filter(f_eligible_inds, age, layer_probs[1, :], bins=layer_probs[0, :])
        if len(f_cl):
            # Draw male partners based on mixing matrices
            age_bins_f = np.digitize(age[f_cl], bins=bins) - 1  # Age bins of participating females in this cluster
            bin_range_f, males_needed = np.unique(age_bins_f, return_counts=True)  # For each female age bin, how many females need partners?
            # shuffle age bins
            bin_order = np.arange(len(bin_range_f))
            np.random.shuffle(bin_order)
            for ab, nm in zip(bin_range_f[bin_order], males_needed[bin_order]):  # Loop through the age bins of females and the number of males needed for each
                male_dist = mixing[:, ab + 1]  # Get female preferences for male partners in this age bin
                # Weight males according to the preferences of females of this age
                this_weighting = m_probs[m_participants] * male_dist[age_bins_m] * add_mixing[cl, cluster[m_participants]]
                if this_weighting.sum() > 0:
                    males_nonzero = hpu.true(this_weighting)  # Remove males with 0 weights
                    this_weighting_nonzero = this_weighting[males_nonzero]
                    f_inds = f_cl[age_bins_f == ab]  # inds of participating females in this age bin
                    if nm > len(this_weighting_nonzero):
                        #print(f'Warning, {nm} males desired but only {len(this_weighting_nonzero)} found.')
                        f_selected = f_inds[hpu.choose(nm, len(this_weighting_nonzero))] # randomly select females
                        nm = f_selected.size # number of new partnerships in this age bin
                    else:
                        f_selected = f_inds
                    m_selected = m_participants[males_nonzero[hpu.choose_w(this_weighting_nonzero, nm)]]  # Select males based on mixing weights
                    m_probs[m_selected] = 0 # remove males that get partnered
                    m = np.concatenate((m, m_selected)) # save selected males
                    f = np.concatenate((f, f_selected))
    # Count how many contacts there actually are
    new_pship_inds, new_pship_counts = np.unique(np.concatenate([f, m]), return_counts=True)
    if len(new_pship_inds) > 0:
        current_partners[lno, new_pship_inds] += new_pship_counts

    return f, m, current_partners, new_pship_inds, new_pship_counts




[docs]
def make_contacts(lno=None, tind=None, partners=None, current_partners=None,
                  ages=None, debuts=None, is_female=None, is_active=None,
                  mixing=None, layer_probs=None, f_cross_layer=None, m_cross_layer=None,
                  durations=None, acts=None, age_act_pars=None,
                  cluster=None, add_mixing=None):
    '''
    Make contacts for a single layer as an edgelist. This will select sexually
    active male partners for sexually active females using age structure if given.
    '''

    # Create edgelist
    f,m,current_partners,new_pship_inds,new_pship_counts = create_edgelist(
        tind, lno, partners, current_partners, mixing, ages, is_active, is_female,
        layer_probs, f_cross_layer, m_cross_layer, cluster, add_mixing)

    # Convert edgelist into Contacts dict, with info about each partnership's duration,
    # coital frequency, etc
    output = {}

    if len(f) and len(m):
        # Scale number of acts by age of couple
        acts = hpu.sample(**acts, size=len(f))
        kwargs = dict(acts=acts,
                      age_act_pars=age_act_pars,
                      age_f=ages[f],
                      age_m=ages[m],
                      debut_f=debuts[f],
                      debut_m=debuts[m]
                      )

        scaled_acts = age_scale_acts(**kwargs)
        keep_inds = scaled_acts>0 # Discard partnerships with zero acts (e.g. because they are "post-retirement")
        m = m[keep_inds]
        f = f[keep_inds]
        scaled_acts = scaled_acts[keep_inds]

        # Tidy up and add durations and start dates
        output = _tidy_edgelist(f, m)
        n_partnerships = len(output['m'])
        output['age_f'] = ages[f]
        output['age_m'] = ages[m]
        output['dur'] = hpu.sample(**durations, size=n_partnerships)
        output['acts'] = scaled_acts
        output['start'] = np.array([tind] * n_partnerships, dtype=hpd.default_float)
        output['end'] = output['start'] + output['dur']
        output['cluster_f'] = cluster[f]
        output['cluster_m'] = cluster[m]

    return output, current_partners, new_pship_inds, new_pship_counts