Source code for synthpops.data_distributions

"""
Read in data distributions.
"""

import os
import json
import numpy as np
import pandas as pd
import sciris as sc
from collections import Counter
from . import base as spb
from . import config as cfg
from . import defaults
from . import logger
from . import data


def get_relative_path(datadir):
    """
    Build the data directory path, extended by the configured relative path.

    Args:
        datadir (str): path to a specified data directory

    Returns:
        str: ``datadir`` joined with the segments of
        ``defaults.settings.relative_path`` when that setting holds more than
        one element; otherwise ``datadir`` unchanged.
    """
    segments = defaults.settings.relative_path
    if len(segments) <= 1:
        return datadir
    return os.path.join(datadir, *segments)
def get_nbrackets():
    """
    Return the default number of age brackets.

    Returns:
        The value of ``defaults.settings.nbrackets``.
    """
    default_nbrackets = defaults.settings.nbrackets
    return default_nbrackets
def calculate_which_nbrackets_to_use(location_data, nbrackets=None):
    """
    Decide how many age brackets to use.

    Args:
        location_data : object exposing ``population_age_distributions``, an
                        iterable of items that each carry a ``num_bins`` attribute
        nbrackets (int): the number of age brackets to use; returned as-is
                         when it is not None

    Returns:
        int: ``nbrackets`` if given; otherwise the largest non-None
        ``num_bins`` found in the location data; otherwise
        ``defaults.settings.nbrackets`` when the location data offers none.
    """
    if nbrackets is not None:
        return nbrackets

    available_bins = [
        distribution.num_bins
        for distribution in location_data.population_age_distributions
        if distribution.num_bins is not None
    ]
    if available_bins:
        return max(available_bins)
    return defaults.settings.nbrackets
def sanitize_location(location):
    """
    Turn a location name into a string that is safe to use in a filename.

    Args:
        location (str): name of the location, or None

    Returns:
        str: An empty string when ``location`` is None; otherwise the name
        with every space and every "-" replaced by "_".
    """
    if location is None:
        return ""

    # Spaces are not allowed in filenames, and "-" is the character used to
    # separate location segments in filenames, so neither may appear inside
    # a single segment.
    for forbidden in (" ", "-"):
        location = location.replace(forbidden, "_")
    return location
def calculate_location_filename(location, state_location, country_location):
    """
    Compose the extension-less filename stem for a location.

    Args:
        location (string)         : name of the location
        state_location (string)   : name of the state the location is in
        country_location (string) : name of the country the location is in

    Returns:
        str: "country-state-location" when ``location`` is not the empty
        string, "country-state" when only ``state_location`` is non-empty,
        and ``country_location`` alone otherwise.
    """
    if location != "":
        segments = [country_location, state_location, location]
    elif state_location != "":
        segments = [country_location, state_location]
    else:
        return country_location
    return "-".join(segments)
def calculate_location_filepath(location, state_location, country_location):
    """
    Compute the JSON file path for a location.

    Each name is passed through ``sanitize_location``, the results are
    combined by ``calculate_location_filename``, and ".json" is appended.

    Args:
        location (string)         : name of the location
        state_location (string)   : name of the state the location is in
        country_location (string) : name of the country the location is in

    Returns:
        str: The filename (with ".json" extension) where the location data reside.
    """
    logger.debug(f"Calculating filepath for (location, state_location, country_location) = "
                 f"({location}, {state_location}, {country_location})")

    sanitized = [sanitize_location(name) for name in (location, state_location, country_location)]
    filepath = f"{calculate_location_filename(*sanitized)}.json"

    logger.debug(f"Filepath = {filepath}")
    return filepath
def load_location(specific_location, state_location, country_location, revert_to_default=None):
    """
    Load the location data object for the given location.

    Args:
        specific_location (string) : name of the location
        state_location (string)    : name of the state the location is in
        country_location (string)  : name of the country the location is in
        revert_to_default (bool)   : if True and the requested location cannot
                                     be loaded, load the default location from
                                     ``defaults.settings`` instead. None is
                                     treated as False.

    Returns:
        The object returned by ``data.load_location_from_filepath`` for the
        requested location, or for the default location after a fallback.

    Raises:
        NotImplementedError: if the data cannot be loaded and no fallback is
        allowed. The underlying error is attached as ``__cause__``.
    """
    if revert_to_default is None:
        revert_to_default = False

    location_filepath = calculate_location_filepath(specific_location, state_location, country_location)

    try:
        location_object = data.load_location_from_filepath(location_filepath)
    except Exception as load_error:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit are never mistaken for "data unavailable".
        if revert_to_default:
            # Only announce the fallback when one actually happens.
            logger.warn(f"Failed to load location [{specific_location}], "
                        f"state_location [{state_location}], "
                        f"country_location [{country_location}], reverting to default.")
            # The fallback call must not itself fall back, or a missing
            # default file would recurse forever.
            return load_location(defaults.settings.location,
                                 defaults.settings.state_location,
                                 defaults.settings.country_location,
                                 revert_to_default=False)

        msg = f"Data unavailable for " \
              f"(location, state_location, country_location) = " \
              f"({specific_location}, {state_location}, {country_location}). " \
              f"Please check input strings, or set use_default to True to use the default values from " \
              f"(location, state_location, country_location) = " \
              f"({defaults.settings.location}, {defaults.settings.state_location}, {defaults.settings.country_location}). "
        raise NotImplementedError(msg) from load_error

    logger.debug(f"Loaded (location, state_location, country_location) = "
                 f"({specific_location}, {state_location}, {country_location}) "
                 f"from [{location_filepath}]")
    return location_object
def read_age_bracket_distr(datadir=None, location=None, state_location=None, country_location=None, nbrackets=None, file_path=None, use_default=False):
    """
    Read the age distribution by age bracket for a location.

    With ``use_default`` set, location-specific data are tried first; if the
    location file or its age distribution is unavailable, data for
    ``defaults.settings.location``, ``defaults.settings.state_location`` and
    ``defaults.settings.country_location`` are used instead. Default data may
    not be appropriate for the population under study, so provide
    location-specific data whenever possible.

    Args:
        datadir (string)          : not used by this function
        location (string)         : name of the location
        state_location (string)   : name of the state the location is in
        country_location (string) : name of the country the location is in
        nbrackets (int)           : number of age brackets; when None it is
                                    chosen by ``calculate_which_nbrackets_to_use``
        file_path (string)        : not used by this function
        use_default (bool)        : whether to fall back to the default location

    Returns:
        dict: Bracket index (from ``np.arange``) mapped to the third element
        of the corresponding entry of the location's age distribution.
    """
    # Falls back to the default location when the file itself is missing.
    location_data = load_location(location, state_location, country_location, revert_to_default=use_default)

    nbrackets = calculate_which_nbrackets_to_use(location_data, nbrackets)
    age_brackets = location_data.get_population_age_distribution(nbrackets)

    # Falls back to the default location when the file lacks this parameter.
    if use_default:
        if age_brackets is None or len(age_brackets) == 0:
            return read_age_bracket_distr(location=defaults.settings.location,
                                          state_location=defaults.settings.state_location,
                                          country_location=defaults.settings.country_location,
                                          use_default=False)

    bracket_indices = np.arange(len(age_brackets))
    return {index: bracket[2] for index, bracket in zip(bracket_indices, age_brackets)}
# TODO: need to adapt this to new data.py
def get_smoothed_single_year_age_distr(datadir=None, location=None, state_location=None, country_location=None, nbrackets=None, file_path=None, use_default=False, window_length=7):
    """
    Build a smoothed, normalized age distribution by single year of age.

    The bracketed distribution is first spread evenly over the ages of each
    bracket, then smoothed with a centered moving average. Ages closer than
    ``window_length // 2`` to either end of the age range keep their
    unsmoothed value. The result is normalized with ``spb.norm_dic``.

    With ``use_default`` set, location-specific data are tried first and data
    for the default location in ``defaults.settings`` are used when they are
    unavailable. Default data may not be appropriate for the population under
    study, so provide location-specific data whenever possible.

    Args:
        datadir (string)          : file path to the data directory
        location (string)         : name of the location
        state_location (string)   : name of the state the location is in
        country_location (string) : name of the country the location is in
        nbrackets (int)           : number of age brackets to read
        file_path (string)        : file path to user specified age bracket distribution data
        use_default (bool)        : whether to fall back to the default location
        window_length (int)       : length of window, in units of years, over
                                    which to average the age distribution;
                                    must satisfy 0 <= window_length < 10

    Returns:
        dict: Single year of age mapped to its smoothed, normalized share.

    Raises:
        ValueError: if ``window_length`` is not an integer in [0, 10), or if
        the smoothed distribution contains a negative value.
    """
    # Validate the cheap argument before doing any data loading.
    errormsg = f"The window_length should be a non-negative integer value less than 10. The supplied value is: {window_length}. Please try another value between 0 and 10."
    if not isinstance(window_length, (int, np.integer)) or window_length < 0 or window_length >= 10:
        raise ValueError(errormsg)

    age_bracket_distr = read_age_bracket_distr(datadir, location, state_location, country_location, nbrackets, file_path, use_default)
    # Forward use_default so the brackets come from the same (possibly
    # default) location as the distribution read above.
    age_brackets = get_census_age_brackets(datadir,
                                           country_location=country_location,
                                           state_location=state_location,
                                           location=location,
                                           nbrackets=nbrackets,
                                           use_default=use_default)
    age_by_brackets = spb.get_age_by_brackets(age_brackets)

    # Spread each bracket's share uniformly over the ages it contains.
    raw_age_distr = {
        age: age_bracket_distr[bracket] / len(age_brackets[bracket])
        for age, bracket in age_by_brackets.items()
    }
    smoothed_age_distr = raw_age_distr.copy()

    window_half = window_length // 2
    for a in range(window_half, max(smoothed_age_distr.keys()) - window_half + 1):
        smoothed_age_distr[a] = np.mean([raw_age_distr[ai] for ai in range(a - window_half, a + window_half + 1)])

    # A negative share cannot be normalized into a valid distribution.
    min_smoothed_val = min(smoothed_age_distr.values())
    if min_smoothed_val < 0:
        errormsg2 = f"The minimum value of the smoothed age distribution is: {min_smoothed_val}. All values of the distribution should be greater than or equal to 0. Check either the original age distribution or the window_length."
        raise ValueError(errormsg2)

    smoothed_age_distr = spb.norm_dic(smoothed_age_distr)
    return smoothed_age_distr
def get_household_size_distr(datadir=None, location=None, state_location=None, country_location=None, file_path=None, use_default=False):
    """
    Get the distribution of household sizes for a location.

    With ``use_default`` set, location-specific data are tried first; if the
    location file or its household size distribution is unavailable, data for
    the default location in ``defaults.settings`` are used instead. Default
    data may not be appropriate for the population under study, so provide
    location-specific data whenever possible.

    Args:
        datadir (string)          : not used by this function
        location (string)         : name of the location
        state_location (string)   : name of the state the location is in
        country_location (string) : name of the country the location is in
        file_path (string)        : not used by this function
        use_default (bool)        : whether to fall back to the default location

    Returns:
        dict: Household size (cast to int) mapped to the second element of
        each entry of the location's household size distribution.
    """
    # Falls back to the default location when the file itself is missing.
    location_data = load_location(location, state_location, country_location, revert_to_default=use_default)

    # Falls back to the default location when the file lacks this parameter.
    if use_default:
        size_distribution = location_data.household_size_distribution
        if size_distribution is None or len(size_distribution) == 0:
            return get_household_size_distr(location=defaults.settings.location,
                                            state_location=defaults.settings.state_location,
                                            country_location=defaults.settings.country_location,
                                            use_default=False)

    return {int(entry[0]): entry[1] for entry in location_data.household_size_distribution}
def get_head_age_brackets(datadir=None, location=None, state_location=None, country_location=None, file_path=None, use_default=False):
    """
    Get the head-of-household age brackets for a location.

    With ``use_default`` set, location-specific data are tried first; if the
    location file or its head age brackets are unavailable, data for the
    default location in ``defaults.settings`` are used instead. Default data
    may not be appropriate for the population under study, so provide
    location-specific data whenever possible.

    Args:
        datadir (string)          : not used by this function
        location (string)         : name of the location
        state_location (string)   : name of the state
        country_location (string) : name of the country the state_location is in
        file_path (string)        : not used by this function
        use_default (bool)        : whether to fall back to the default location

    Returns:
        dict: Bracket index mapped to an array of the ages in that bracket,
        from the bracket's first element to its second element inclusive.
    """
    # Falls back to the default location when the file itself is missing.
    location_data = load_location(location, state_location, country_location, revert_to_default=use_default)

    # Falls back to the default location when the file lacks this parameter.
    if use_default:
        head_brackets = location_data.household_head_age_brackets
        if head_brackets is None or len(head_brackets) == 0:
            return get_head_age_brackets(location=defaults.settings.location,
                                         state_location=defaults.settings.state_location,
                                         country_location=defaults.settings.country_location,
                                         use_default=False)

    return {
        index: np.arange(int(bounds[0]), int(bounds[1]) + 1)
        for index, bounds in enumerate(location_data.household_head_age_brackets)
    }
def get_head_age_by_size_distr(datadir=None, location=None, state_location=None, country_location=None, file_path=None, use_default=False):
    """
    Get the head-of-household age bracket counts by household size.

    With ``use_default`` set, location-specific data are tried first; if the
    location file or this distribution is unavailable, data for the default
    location in ``defaults.settings`` are used instead. Default data may not
    be appropriate for the population under study, so provide
    location-specific data whenever possible.

    Args:
        datadir (string)          : not used by this function
        location (string)         : name of the location
        state_location (string)   : name of the state
        country_location (string) : name of the country the state_location is in
        file_path (string)        : not used by this function
        use_default (bool)        : whether to fall back to the default location

    Returns:
        ndarray: One row per entry of the location's
        ``household_head_age_distribution_by_family_size``, with the first
        element of each entry dropped (presumably the household size label;
        verify against the data schema).
    """
    # Falls back to the default location when the file itself is missing.
    location_data = load_location(location, state_location, country_location, revert_to_default=use_default)

    # Falls back to the default location when the file lacks this parameter.
    if use_default:
        by_family_size = location_data.household_head_age_distribution_by_family_size
        if by_family_size is None or len(by_family_size) == 0:
            return get_head_age_by_size_distr(location=defaults.settings.location,
                                              state_location=defaults.settings.state_location,
                                              country_location=defaults.settings.country_location,
                                              use_default=False)

    rows = [row[1:] for row in location_data.household_head_age_distribution_by_family_size]
    return np.array(rows)
def get_census_age_brackets(datadir=None, location=None, state_location=None, country_location=None, file_path=None, use_default=False, nbrackets=None):
    """
    Get the census age brackets for a location.

    With ``use_default`` set, location-specific data are tried first; if the
    location file or its age distribution is unavailable, data for the
    default location in ``defaults.settings`` are used instead. Default data
    may not be appropriate for the population under study, so provide
    location-specific data whenever possible.

    Args:
        datadir (string)          : not used by this function
        location (string)         : name of the location
        state_location (string)   : name of the state
        country_location (string) : name of the country the state_location is in
        file_path (string)        : not used by this function
        use_default (bool)        : whether to fall back to the default location
        nbrackets (int)           : number of age brackets; when None it is
                                    chosen by ``calculate_which_nbrackets_to_use``

    Returns:
        dict: Bracket index mapped to an array of the ages in that bracket,
        from the entry's first element to its second element inclusive.
    """
    # Falls back to the default location when the file itself is missing.
    location_data = load_location(location, state_location, country_location, revert_to_default=use_default)

    nbrackets = calculate_which_nbrackets_to_use(location_data, nbrackets)
    age_distribution = location_data.get_population_age_distribution(nbrackets)

    # Falls back to the default location when the file lacks this parameter.
    if use_default:
        if age_distribution is None or len(age_distribution) == 0:
            return get_census_age_brackets(location=defaults.settings.location,
                                           state_location=defaults.settings.state_location,
                                           country_location=defaults.settings.country_location,
                                           use_default=False)

    return {
        index: np.arange(int(entry[0]), int(entry[1]) + 1)
        for index, entry in enumerate(age_distribution)
    }
# TODO: still open question on how to handle these.
def get_contact_matrix(datadir, setting_code, sheet_name=None, file_path=None, delimiter=' ', header=None):
    """
    Get the setting-specific age contact matrix.

    When ``file_path`` is given, the matrix is read from that delimited text
    file using ``delimiter`` and ``header``. Otherwise the matrix named
    ``sheet_name`` is looked up in the bundled "MUestimates" data for the
    setting: first in the pre-processed ".obj" file, then in the "_1.xlsx"
    workbook, and finally in the "_2.xlsx" workbook.

    Args:
        datadir (string)      : file path to the data directory
        setting_code (string) : name of the physical contact setting: H for households, S for schools, W for workplaces, C for community or other
        sheet_name (string)   : name of the sheet in the excel file with contact patterns
        file_path (string)    : file path to user specified age contact matrix
        delimiter (string)    : delimiter for the contact matrix file
        header (int)          : row number for the header of the file

    Returns:
        ndarray: Matrix of contact patterns where each row i is the average
        contact patterns for an individual in age bracket i and the columns
        represent the age brackets of their contacts. The matrix element i,j
        is then the contact rate, number, or frequency for the average
        individual in age bracket i with all of their contacts in age bracket
        j in that physical contact setting.

    Raises:
        NotImplementedError: if ``file_path`` is given but cannot be read, or
        if ``file_path`` is None and ``setting_code`` is not one of H, S, W, C.
    """
    # User-supplied file: read it directly.
    if file_path is not None:
        try:
            df = pd.read_csv(file_path, delimiter=delimiter, header=header)
            return np.array(df)
        except Exception as read_error:
            # Keep the exception type callers expect, but preserve the cause.
            raise NotImplementedError("Contact matrix did not open. Check inputs.") from read_error

    setting_names = {'H': 'home', 'S': 'school', 'W': 'work', 'C': 'other_locations'}
    if setting_code not in setting_names:
        raise NotImplementedError("Invalid setting code. Try again.")

    base_dir = get_relative_path(datadir)
    file_path = os.path.join(base_dir, 'MUestimates_' + setting_names[setting_code] + '_1.xlsx')

    try:
        # Shortcut: use pre-processed data.
        # NOTE(review): sc.loadobj presumably unpickles the file, so the data
        # directory must be trusted -- confirm against the sciris docs.
        obj_path = file_path.replace('_1.xlsx', '.obj').replace('_2.xlsx', '.obj')
        preprocessed = sc.loadobj(obj_path)
        return preprocessed[sheet_name]
    except Exception as E:
        errormsg = f'Warning: could not load pickled data ({str(E)}), defaulting to Excel...'
        print(errormsg)

    try:
        df = pd.read_excel(file_path, sheet_name=sheet_name, header=0)
    except Exception:
        # The sheet may live in the second workbook, which is read without a header row.
        file_path = file_path.replace('_1.xlsx', '_2.xlsx')
        df = pd.read_excel(file_path, sheet_name=sheet_name, header=None)
    return np.array(df)
# TODO: still open question on how to handle these.
def get_contact_matrices(datadir=None, sheet_name=None, file_path_dic=None, delimiter=' ', header=None, use_default=False):  # need review for additional countries
    """
    Create a dict of setting specific age contact matrices.

    The matrices for settings "H", "S", "W" and "C" are loaded with
    ``get_contact_matrix``. If any of them fails and ``use_default`` is True,
    all four are reloaded from the sheet named by
    ``defaults.settings.sheet_name``. Default data may not be appropriate for
    the population under study, so provide location-specific data whenever
    possible.

    Args:
        datadir (string)     : file path to the data directory
        sheet_name (string)  : name of the sheet in the excel file with contact patterns
        file_path_dic (dict) : dictionary of file paths of user specified age contact matrices, where keys are "H", "S", "W", and "C"
        delimiter (string)   : delimiter for the contact matrix file
        header (int)         : row number for the header of the file
        use_default (bool)   : if True, fall back to the default sheet when loading fails

    Returns:
        dict: A dictionary of the contact matrices, keyed by setting code.

    Raises:
        NotImplementedError: if loading fails and ``use_default`` is False.
        The underlying error is attached as ``__cause__``.
    """
    setting_codes = ['H', 'S', 'W', 'C']
    matrices = {}
    if file_path_dic is None:
        file_path_dic = dict.fromkeys(setting_codes, None)

    try:
        for setting_code in setting_codes:
            matrices[setting_code] = get_contact_matrix(datadir, setting_code, sheet_name, file_path_dic[setting_code], delimiter, header)
    except Exception as load_error:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit are not turned into a data fallback.
        if not use_default:
            raise NotImplementedError(f"Data unavailable for the location specified. Please check input strings or set use_default to True to use default values from the {defaults.settings.sheet_name}.") from load_error
        # Reload every setting so all matrices come from the same default sheet.
        for setting_code in setting_codes:
            matrices[setting_code] = get_contact_matrix(datadir, setting_code, sheet_name=defaults.settings.sheet_name)

    return matrices
def get_school_enrollment_rates(datadir=None, location=None, state_location=None, country_location=None, file_path=None, use_default=False):
    """
    Get school enrollment rates by age for a location.

    With ``use_default`` set, location-specific data are tried first; if the
    location file or its enrollment rates are unavailable, data for the
    default location in ``defaults.settings`` are used instead. Default data
    may not be appropriate for the population under study, so provide
    location-specific data whenever possible.

    Args:
        datadir (string)          : not used by this function
        location (string)         : name of the location
        state_location (string)   : name of the state the location is in
        country_location (string) : name of the country the location is in
        file_path (string)        : not used by this function
        use_default (bool)        : whether to fall back to the default location

    Returns:
        dict: Age (cast to int) mapped to the second element of each entry of
        the location's ``enrollment_rates_by_age``.
    """
    # Falls back to the default location when the file itself is missing.
    location_data = load_location(location, state_location, country_location, revert_to_default=use_default)

    # Falls back to the default location when the file lacks this parameter.
    if use_default:
        enrollment = location_data.enrollment_rates_by_age
        if enrollment is None or len(enrollment) == 0:
            return get_school_enrollment_rates(location=defaults.settings.location,
                                               state_location=defaults.settings.state_location,
                                               country_location=defaults.settings.country_location,
                                               use_default=False)

    return {int(entry[0]): entry[1] for entry in location_data.enrollment_rates_by_age}
def get_school_size_brackets(datadir=None, location=None, state_location=None, country_location=None, file_path=None, use_default=False):
    """
    Get the school size brackets for a location.

    With ``use_default`` set, location-specific data are tried first; if the
    location file or its school size brackets are unavailable, data for the
    default location in ``defaults.settings`` are used instead. Default data
    may not be appropriate for the population under study, so provide
    location-specific data whenever possible.

    Args:
        datadir (string)          : not used by this function
        location (string)         : name of the location
        state_location (string)   : name of the state the location is in
        country_location (string) : name of the country the location is in
        file_path (string)        : not used by this function
        use_default (bool)        : whether to fall back to the default location

    Returns:
        dict: Bracket index mapped to an array of the school sizes in that
        bracket, from the bracket's first element to its second inclusive.
    """
    # Falls back to the default location when the file itself is missing.
    location_data = load_location(location, state_location, country_location, revert_to_default=use_default)

    # Falls back to the default location when the file lacks this parameter.
    if use_default:
        size_brackets = location_data.school_size_brackets
        if size_brackets is None or len(size_brackets) == 0:
            return get_school_size_brackets(location=defaults.settings.location,
                                            state_location=defaults.settings.state_location,
                                            country_location=defaults.settings.country_location,
                                            use_default=False)

    return {
        index: np.arange(int(bounds[0]), int(bounds[1]) + 1)
        for index, bounds in enumerate(location_data.school_size_brackets)
    }
def get_school_size_distr_by_brackets(datadir=None, location=None, state_location=None, country_location=None, file_path=None, use_default=False):
    """
    Get the distribution of school sizes by size bracket for a location.

    With ``use_default`` set, location-specific data are tried first; if the
    location file or its school size distribution is unavailable, data for
    the default location in ``defaults.settings`` are used instead. Default
    data may not be appropriate for the population under study, so provide
    location-specific data whenever possible.

    Args:
        datadir (string)          : not used by this function
        location (string)         : name of the location
        state_location (string)   : name of the state the location is in
        country_location (string) : name of the country the location is in
        file_path (string)        : not used by this function
        use_default (bool)        : whether to fall back to the default location

    Returns:
        dict: Bracket index mapped to the corresponding entry of the
        location's ``school_size_distribution``, passed through ``spb.norm_dic``.
    """
    # Falls back to the default location when the file itself is missing.
    location_data = load_location(location, state_location, country_location, revert_to_default=use_default)

    # Falls back to the default location when the file lacks this parameter.
    if use_default:
        size_distribution = location_data.school_size_distribution
        if size_distribution is None or len(size_distribution) == 0:
            return get_school_size_distr_by_brackets(location=defaults.settings.location,
                                                     state_location=defaults.settings.state_location,
                                                     country_location=defaults.settings.country_location,
                                                     use_default=False)

    indexed_distr = dict(enumerate(location_data.school_size_distribution))
    return spb.norm_dic(indexed_distr)
# ### Default school type data ### #
def get_default_school_type_age_ranges():
    """
    Define and return default school types and the age range for each.

    Return:
        dict: School type code ('pk', 'es', 'ms', 'hs', 'uv') mapped to an
        array of the ages attending that type of school.
    """
    # (first age, one past the last age) for each school type.
    age_bounds = {
        'pk': (3, 6),
        'es': (6, 11),
        'ms': (11, 14),
        'hs': (14, 18),
        'uv': (18, 101),
    }
    return {school_type: np.arange(start, stop) for school_type, (start, stop) in age_bounds.items()}
def get_default_school_types_distr_by_age():
    """
    Define and return default probabilities of school type for each age.

    Return:
        dict: Each age from 0 to 100 mapped to a dict of school type to
        probability: 1. for a school type whose default age range contains
        that age, 0. otherwise.
    """
    school_type_age_ranges = get_default_school_type_age_ranges()
    school_types = list(school_type_age_ranges.keys())

    # Start every age at probability 0 for every school type.
    distr_by_age = {age: dict.fromkeys(school_types, 0.) for age in range(101)}

    for school_type, ages in school_type_age_ranges.items():
        for age in ages:
            distr_by_age[age][school_type] = 1.

    return distr_by_age
def get_default_school_types_by_age_single():
    """
    Define and return the default school type by age, choosing for each age
    the school type with the highest probability.

    Return:
        dict: Age mapped to a school type. Ages whose highest probability is
        0 are left out.
    """
    distr_by_age = get_default_school_types_distr_by_age()
    single_type_by_age = {}

    for age in range(101):
        # Invert to probability -> school type; on equal probabilities the
        # school type seen last wins.
        type_for_prob = {}
        for school_type, prob in distr_by_age[age].items():
            type_for_prob[prob] = school_type

        top_prob = max(type_for_prob)
        if top_prob != 0:
            single_type_by_age[age] = type_for_prob[top_prob]

    return single_type_by_age
def get_default_school_size_distr_brackets():
    """
    Define and return default school size distribution brackets.

    Calls ``get_school_size_brackets`` for the default location configured in
    ``defaults.settings``, with ``use_default=True``.

    Return:
        dict: A dictionary of school size brackets.
    """
    settings = defaults.settings
    return get_school_size_brackets(
        settings.datadir,
        country_location=settings.country_location,
        state_location=settings.state_location,
        location=settings.location,
        use_default=True,
    )
def get_default_school_size_distr_by_type():
    """
    Define and return the default school size distribution for each school type.

    Every school type ('pk', 'es', 'ms', 'hs', 'uv') receives the result of
    its own ``get_school_size_distr_by_brackets`` call for the default
    location configured in ``defaults.settings``.

    Return:
        dict: School type mapped to its school size distribution binned by
        size brackets.
    """
    return {
        school_type: get_school_size_distr_by_brackets(
            defaults.settings.datadir,
            country_location=defaults.settings.country_location,
            state_location=defaults.settings.state_location,
            location=defaults.settings.location,
            use_default=True,
        )
        for school_type in ('pk', 'es', 'ms', 'hs', 'uv')
    }
def get_school_type_age_ranges(datadir=None, location=None, state_location=None, country_location=None, file_path=None, use_default=False):
    """
    Get the school types and the age range of each for a location.

    With ``use_default`` set, location-specific data are tried first; if the
    location file or its school types are unavailable, data for the default
    location in ``defaults.settings`` are used instead.

    Args:
        datadir (string)          : not used by this function
        location (string)         : name of the location
        state_location (string)   : name of the state the location is in
        country_location (string) : name of the country the location is in
        file_path (string)        : not used by this function
        use_default (bool)        : whether to fall back to the default location

    Returns:
        dict: School type mapped to an array of ages, from the first to the
        second element of that type's ``age_range`` inclusive.
    """
    # Falls back to the default location when the file itself is missing.
    location_data = load_location(location, state_location, country_location, revert_to_default=use_default)

    # Falls back to the default location when the file lacks this parameter.
    if use_default:
        types_by_age = location_data.school_types_by_age
        if types_by_age is None or len(types_by_age) == 0:
            return get_school_type_age_ranges(location=defaults.settings.location,
                                              state_location=defaults.settings.state_location,
                                              country_location=defaults.settings.country_location,
                                              use_default=False)

    age_ranges_by_type = {}
    for entry in location_data.school_types_by_age:
        first_age, last_age = entry.age_range[0], entry.age_range[1]
        age_ranges_by_type[entry.school_type] = np.arange(first_age, last_age + 1)
    return age_ranges_by_type
def get_school_size_distr_by_type(datadir=None, location=None, state_location=None, country_location=None, file_path=None, use_default=False):
    """
    Get the school size distribution by school type for a location.

    With ``use_default`` set, location-specific data are tried first; if the
    location file or this distribution is unavailable, data for the default
    location in ``defaults.settings`` are used instead. Default data may not
    be appropriate for the population under study, so provide
    location-specific data whenever possible.

    Args:
        datadir (string)          : not used by this function
        location (string)         : name of the location
        state_location (string)   : name of the state the location is in
        country_location (string) : name of the country the location is in
        file_path (string)        : not used by this function
        use_default (bool)        : whether to fall back to the default location

    Returns:
        dict: School type mapped to a dict of bracket index to the
        corresponding entry of that type's ``size_distribution``.
    """
    # Falls back to the default location when the file itself is missing.
    location_data = load_location(location, state_location, country_location, revert_to_default=use_default)

    # Falls back to the default location when the file lacks this parameter.
    if use_default:
        distr_by_type = location_data.school_size_distribution_by_type
        if distr_by_type is None or len(distr_by_type) == 0:
            return get_school_size_distr_by_type(location=defaults.settings.location,
                                                 state_location=defaults.settings.state_location,
                                                 country_location=defaults.settings.country_location,
                                                 use_default=False)

    return {
        entry.school_type: dict(enumerate(entry.size_distribution))
        for entry in location_data.school_size_distribution_by_type
    }
def get_employment_rates(datadir=None, location=None, state_location=None,
                         country_location=None, file_path=None, use_default=False):
    """
    Get employment rates by age.

    With use_default set, location specific data are tried first; if the loaded
    location has no employment data, the data for the default location
    (settings.location, settings.state_location, settings.country_location)
    are returned instead. Default data may not suit the population under
    study, so provide location specific data whenever possible.

    Args:
        datadir (string)          : file path to the data directory (not referenced in this function's body)
        location (string)         : name of the location
        state_location (string)   : name of the state the location is in
        country_location (string) : name of the country the location is in
        file_path (string)        : file path to user specified employment by age data (not referenced in this function's body)
        use_default (bool)        : if True, fall back to the default location when no data are found

    Returns:
        dict: A dictionary built with ``dict()`` from the location's
        ``employment_rates_by_age`` entries.
    """
    loc_data = load_location(location, state_location, country_location, revert_to_default=use_default)
    rates = loc_data.employment_rates_by_age

    # Nothing stored for this parameter: retry once against the default
    # location, with the fallback disabled so the recursion cannot repeat.
    if use_default and (rates is None or len(rates) == 0):
        fallback = dict(location=defaults.settings.location,
                        state_location=defaults.settings.state_location,
                        country_location=defaults.settings.country_location,
                        use_default=False)
        return get_employment_rates(**fallback)

    return dict(rates)
def get_workplace_size_brackets(datadir=None, location=None, state_location=None,
                                country_location=None, file_path=None, use_default=False):
    """
    Get workplace size brackets.

    With use_default set, location specific data are tried first; if the loaded
    location has no workplace size data, the data for the default location
    (settings.location, settings.state_location, settings.country_location)
    are returned instead. Default data may not suit the population under
    study, so provide location specific data whenever possible.

    Args:
        datadir (string)          : file path to the data directory (not referenced in this function's body)
        location (string)         : name of the location
        state_location (string)   : name of the state the location is in
        country_location (string) : name of the country the location is in
        file_path (string)        : file path to user specified workplace size brackets data (not referenced in this function's body)
        use_default (bool)        : if True, fall back to the default location when no data are found

    Returns:
        dict: A dictionary mapping each bracket's position to
        ``np.arange(size_min, size_max + 1)``, where the bounds are the first
        two items of the bracket entry converted to int.
    """
    loc_data = load_location(location, state_location, country_location, revert_to_default=use_default)
    size_entries = loc_data.workplace_size_counts_by_num_personnel

    # Nothing stored for this parameter: retry once against the default
    # location, with the fallback disabled so the recursion cannot repeat.
    if use_default and (size_entries is None or len(size_entries) == 0):
        fallback = dict(location=defaults.settings.location,
                        state_location=defaults.settings.state_location,
                        country_location=defaults.settings.country_location,
                        use_default=False)
        return get_workplace_size_brackets(**fallback)

    return {
        index: np.arange(int(entry[0]), int(entry[1]) + 1)
        for index, entry in enumerate(size_entries)
    }
def get_workplace_size_distr_by_brackets(datadir=None, location=None, state_location=None,
                                         country_location=None, file_path=None, use_default=False):
    """
    Get the distribution of workplace size by brackets.

    With use_default set, location specific data are tried first; if the loaded
    location has no workplace size data, the data for the default location
    (settings.location, settings.state_location, settings.country_location)
    are returned instead. Default data may not suit the population under
    study, so provide location specific data whenever possible.

    Args:
        datadir (string)          : file path to the data directory (not referenced in this function's body)
        location (string)         : name of the location
        state_location (string)   : name of the state the location is in
        country_location (string) : name of the country the location is in
        file_path (string)        : file path to user specified workplace size distribution data (not referenced in this function's body)
        use_default (bool)        : if True, fall back to the default location when no data are found

    Returns:
        dict: A dictionary mapping each bracket's position to the third item
        (index 2) of that bracket's entry.
    """
    loc_data = load_location(location, state_location, country_location, revert_to_default=use_default)
    size_entries = loc_data.workplace_size_counts_by_num_personnel

    # Nothing stored for this parameter: retry once against the default
    # location, with the fallback disabled so the recursion cannot repeat.
    if use_default and (size_entries is None or len(size_entries) == 0):
        fallback = dict(location=defaults.settings.location,
                        state_location=defaults.settings.state_location,
                        country_location=defaults.settings.country_location,
                        use_default=False)
        return get_workplace_size_distr_by_brackets(**fallback)

    # Items 0 and 1 of an entry are the bracket bounds (see
    # get_workplace_size_brackets); item 2 is the value kept here.
    return {index: entry[2] for index, entry in enumerate(size_entries)}
def get_state_postal_code(state_location, country_location):
    """
    Look up the postal code of a state.

    The lookup table is read from ``postal_codes.csv`` inside the
    ``country_location`` folder of ``defaults.settings.datadir``; its ``state``
    and ``postal_code`` columns are used.

    Args:
        state_location (string)   : name of the state
        country_location (string) : name of the country the state is in

    Returns:
        str: The postal code for state_location.

    Raises:
        KeyError: if state_location is not listed in the ``state`` column.
    """
    csv_path = os.path.join(defaults.settings.datadir, country_location, 'postal_codes.csv')
    table = pd.read_csv(csv_path, delimiter=',')

    # If a state is listed more than once, the last row wins.
    code_by_state = {state: code for state, code in zip(table.state, table.postal_code)}
    return code_by_state[state_location]
def get_long_term_care_facility_residents_distr(datadir=None, location=None, state_location=None,
                                                country_location=None, file_path=None, use_default=False):
    """
    Get the size distribution of residents per facility for Long Term Care Facilities.

    Args:
        datadir (string)          : file path to the data directory (not referenced in this function's body)
        location (string)         : name of the location
        state_location (string)   : name of the state the location is in
        country_location (string) : name of the country the location is in
        file_path (string)        : file path to user specified LTCF resident size distribution data (not referenced in this function's body)
        use_default (bool)        : if True and the loaded location has no such data, fall back to the location
                                    given by settings.location, settings.state_location, settings.country_location

    Returns:
        dict: A dictionary mapping each bin's position to the third item
        (index 2) of that bin's entry in ``ltcf_num_residents_distribution``.
    """
    loc_data = load_location(location, state_location, country_location, revert_to_default=use_default)
    resident_bins = loc_data.ltcf_num_residents_distribution

    # Nothing stored for this parameter: retry once against the default
    # location, with the fallback disabled so the recursion cannot repeat.
    if use_default and (resident_bins is None or len(resident_bins) == 0):
        fallback = dict(location=defaults.settings.location,
                        state_location=defaults.settings.state_location,
                        country_location=defaults.settings.country_location,
                        use_default=False)
        return get_long_term_care_facility_residents_distr(**fallback)

    # Items 0 and 1 of an entry are the bin bounds (see the companion
    # *_brackets function); item 2 is the value kept here.
    return {index: entry[2] for index, entry in enumerate(resident_bins)}
def get_long_term_care_facility_residents_distr_brackets(datadir=None, location=None, state_location=None,
                                                         country_location=None, file_path=None, use_default=False):
    """
    Get the size bins for the distribution of residents per facility for Long Term Care Facilities.

    Args:
        datadir (string)          : file path to the data directory (not referenced in this function's body)
        location (string)         : name of the location
        state_location (string)   : name of the state the location is in
        country_location (string) : name of the country the location is in
        file_path (string)        : file path to user specified LTCF resident size brackets data (not referenced in this function's body)
        use_default (bool)        : if True and the loaded location has no such data, fall back to the location
                                    given by settings.location, settings.state_location, settings.country_location

    Returns:
        dict: A dictionary mapping each bin's position to
        ``np.arange(min_num_residents, max_num_residents + 1)``, where the
        bounds are the first two items of the bin entry converted to int.
    """
    loc_data = load_location(location, state_location, country_location, revert_to_default=use_default)
    resident_bins = loc_data.ltcf_num_residents_distribution

    # Nothing stored for this parameter: retry once against the default
    # location, with the fallback disabled so the recursion cannot repeat.
    if use_default and (resident_bins is None or len(resident_bins) == 0):
        fallback = dict(location=defaults.settings.location,
                        state_location=defaults.settings.state_location,
                        country_location=defaults.settings.country_location,
                        use_default=False)
        return get_long_term_care_facility_residents_distr_brackets(**fallback)

    return {
        index: np.arange(int(entry[0]), int(entry[1]) + 1)
        for index, entry in enumerate(resident_bins)
    }
def get_long_term_care_facility_resident_to_staff_ratios_distr(datadir=None, location=None, state_location=None,
                                                               country_location=None, file_path=None, use_default=False):
    """
    Get the distribution of resident to staff ratios per facility for Long Term Care Facilities.

    Args:
        datadir (string)          : file path to the data directory (not referenced in this function's body)
        location (string)         : name of the location
        state_location (string)   : name of the state the location is in
        country_location (string) : name of the country the location is in
        file_path (string)        : file path to user specified resident to staff ratio distribution data (not referenced in this function's body)
        use_default (bool)        : if True and the loaded location has no such data, fall back to the location
                                    given by settings.location, settings.state_location, settings.country_location

    Returns:
        dict: A dictionary mapping each bin's position to the third item
        (index 2) of that bin's entry in
        ``ltcf_resident_to_staff_ratio_distribution``.
    """
    loc_data = load_location(location, state_location, country_location, revert_to_default=use_default)
    ratio_bins = loc_data.ltcf_resident_to_staff_ratio_distribution

    # Nothing stored for this parameter: retry once against the default
    # location, with the fallback disabled so the recursion cannot repeat.
    if use_default and (ratio_bins is None or len(ratio_bins) == 0):
        fallback = dict(location=defaults.settings.location,
                        state_location=defaults.settings.state_location,
                        country_location=defaults.settings.country_location,
                        use_default=False)
        return get_long_term_care_facility_resident_to_staff_ratios_distr(**fallback)

    # Items 0 and 1 of an entry are the bin bounds (see the companion
    # *_brackets function); item 2 is the value kept here.
    return {index: entry[2] for index, entry in enumerate(ratio_bins)}
def get_long_term_care_facility_resident_to_staff_ratios_brackets(datadir=None, location=None, state_location=None,
                                                                  country_location=None, file_path=None, use_default=False):
    """
    Get the size bins for the distribution of resident to staff ratios per facility for Long Term Care Facilities.

    Args:
        datadir (string)          : file path to the data directory (not referenced in this function's body)
        location (string)         : name of the location
        state_location (string)   : name of the state the location is in
        country_location (string) : name of the country the location is in
        file_path (string)        : file path to user specified resident to staff ratio brackets data (not referenced in this function's body)
        use_default (bool)        : if True and the loaded location has no such data, fall back to the location
                                    given by settings.location, settings.state_location, settings.country_location

    Returns:
        dict: A dictionary mapping each bin's position to
        ``np.arange(size_min, size_max + 1)``, where the bounds are the first
        two items of the bin entry, used as stored (no int conversion).
    """
    loc_data = load_location(location, state_location, country_location, revert_to_default=use_default)
    ratio_bins = loc_data.ltcf_resident_to_staff_ratio_distribution

    # Nothing stored for this parameter: retry once against the default
    # location, with the fallback disabled so the recursion cannot repeat.
    if use_default and (ratio_bins is None or len(ratio_bins) == 0):
        fallback = dict(location=defaults.settings.location,
                        state_location=defaults.settings.state_location,
                        country_location=defaults.settings.country_location,
                        use_default=False)
        return get_long_term_care_facility_resident_to_staff_ratios_brackets(**fallback)

    # Unlike the resident-count brackets, the bounds are not cast to int here.
    return {
        index: np.arange(entry[0], entry[1] + 1)
        for index, entry in enumerate(ratio_bins)
    }
def get_long_term_care_facility_use_rates(datadir=None, location=None, state_location=None,
                                          country_location=None, file_path=None, use_default=False):
    """
    Get Long Term Care Facility use rates by age.

    Args:
        datadir (str)          : file path to the data directory (not referenced in this function's body)
        location (str)         : name of the location
        state_location (str)   : name of the state the location is in
        country_location (str) : name of the country the location is in
        file_path (str)        : file path to user specified data (not referenced in this function's body)
        use_default (bool)     : if True and the loaded location has no such data, fall back to the location
                                 given by settings.location, settings.state_location, settings.country_location

    Returns:
        dict: A dictionary mapping the first item of each
        ``ltcf_use_rate_distribution`` entry, converted to int, to the second
        item of that entry.

    Note:
        Currently only available for the United States.
    """
    loc_data = load_location(location, state_location, country_location, revert_to_default=use_default)
    use_rates = loc_data.ltcf_use_rate_distribution

    # Nothing stored for this parameter: retry once against the default
    # location, with the fallback disabled so the recursion cannot repeat.
    if use_default and (use_rates is None or len(use_rates) == 0):
        fallback = dict(location=defaults.settings.location,
                        state_location=defaults.settings.state_location,
                        country_location=defaults.settings.country_location,
                        use_default=False)
        return get_long_term_care_facility_use_rates(**fallback)

    return {int(entry[0]): entry[1] for entry in use_rates}