Source code for emod_api.demographics.DemographicsGenerator

import csv
import json
from datetime import datetime
from enum import Enum
from typing import Optional, Union, List

import pandas as pd

from emod_api.dtk_tools.demographics.DemographicsGeneratorConcern import (
    DemographicsGeneratorConcern,
    DemographicsGeneratorConcernChain,
)
from emod_api.demographics.Node import Node, nodeid_from_lat_lon
from emod_api.demographics.Demographics import Demographics
from emod_api.dtk_tools.support.General import init_logging

logger = init_logging("DemographicsGenerator")

# defaults section stored as module-variable, a dict (for now)
# structure will match file json structure initially, then be
# changed to a class with some abstraction from file format
defaults = {}

# Node class is used to store population data for spatial nodes
node_list = []

# Demographics is used to write the file to disk

CUSTOM_RESOLUTION = "custom"
DEFAULT_RESOLUTION = 30
VALID_RESOLUTIONS = {30: 30, 250: 250, CUSTOM_RESOLUTION: 30}

class InvalidResolution(BaseException):
    """
    Custom Exception
    """
    pass


class DemographicsType(Enum):
    """
    Enum
    """
    STATIC = "static"

    def __str__(self):
        return str(self.value)
def arcsec_to_deg(arcsec: float) -> float:
    """
    Convert arc seconds to degrees.

    Args:
        arcsec: arc seconds as float

    Returns:
        Arc seconds converted to degrees
    """
    return arcsec / 3600.0
def validate_res_in_arcsec(res_in_arcsec):
    """
    Validate that the resolution is valid.

    Args:
        res_in_arcsec: Resolution in arcseconds. Supported values can be found in VALID_RESOLUTIONS

    Returns:
        None.

    Raises:
        InvalidResolution: If the resolution is not one of the supported values.
    """
    try:
        VALID_RESOLUTIONS[res_in_arcsec]
    except KeyError:
        raise InvalidResolution(
            f"{res_in_arcsec} is not a valid arcsecond resolution."
            f" Must be one of: {list(VALID_RESOLUTIONS.keys())}"
        )
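
# Illustrative sketch (not part of the original module): how the two helpers above behave,
# given the VALID_RESOLUTIONS mapping defined at the top of this file.
#
#     arcsec_to_deg(30)            # -> 0.008333333333333333 (30 / 3600)
#     validate_res_in_arcsec(250)  # -> returns None; 250 is a supported key
#     validate_res_in_arcsec(45)   # -> raises InvalidResolution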
class DemographicsGenerator:
    """
    Generates a demographics file based on a population input file.
    The population input file is a csv with structure

    node_label*, lat, lon, pop*

    *-ed columns are optional
    """

    # mapping of requested arcsecond resolution -> demographic metadata arcsecond resolution.
    # All hash values must be integers.

    def __init__(
        self,
        nodes,
        concerns: Optional[
            Union[DemographicsGeneratorConcern, List[DemographicsGeneratorConcern]]
        ] = None,
        res_in_arcsec=CUSTOM_RESOLUTION,
        node_id_from_lat_long=False,
    ):
        """
        Initialize the DemographicsGenerator.

        Args:
            nodes: list of nodes
            concerns (Optional[Union[DemographicsGeneratorConcern, List[DemographicsGeneratorConcern]]]):
                Concern(s) to execute while the demographics are generated, before the file is
                saved. A list of concerns is chained and run in order.
            res_in_arcsec: Simulation grid resolution
            node_id_from_lat_long: If True, node ids are derived from each node's lat/lon
        """
        print("Creating DemographicsGenerator instance.")
        # self.nodes = nodes
        # currently only static is implemented in generate_nodes(self)
        self.demographics_type = DemographicsType.STATIC  # could be 'static', 'growing' or a different type
        self.node_id_from_lat_long = node_id_from_lat_long
        self.set_resolution(res_in_arcsec)
        if concerns and isinstance(concerns, list):
            concerns = DemographicsGeneratorConcernChain(*concerns)
        self.concerns = concerns

        # demographics data dictionary (a working DTK demographics file when dumped as json)
        self.demographics = None
    def set_resolution(self, res_in_arcsec):
        """
        The canonical way to set arcsecond/degree resolutions on a DemographicsGenerator object.
        Verifies everything is set properly.

        Args:
            res_in_arcsec: The requested resolution, e.g. 30, 250, 'custom'

        Returns:
            No return value.
        """
        validate_res_in_arcsec(res_in_arcsec)
        self.resolution = res_in_arcsec
        self.res_in_arcsec = VALID_RESOLUTIONS[res_in_arcsec]
        self.res_in_degrees = arcsec_to_deg(self.res_in_arcsec)
        if logger:
            logger.debug(
                "Setting resolution to %s arcseconds (%s deg.) from selection: %s",
                self.res_in_arcsec,
                self.res_in_degrees,
                res_in_arcsec,
            )
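
    # Illustrative sketch (not part of the original module): the fields set by
    # set_resolution for each supported selection, per VALID_RESOLUTIONS above.
    #
    #     set_resolution(30)        # resolution=30,       res_in_arcsec=30,  res_in_degrees=30/3600
    #     set_resolution(250)       # resolution=250,      res_in_arcsec=250, res_in_degrees=250/3600
    #     set_resolution("custom")  # resolution="custom", res_in_arcsec=30,  res_in_degrees=30/3600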
    def generate_nodes(self, defaults):
        """
        Generate the demographics file nodes.

        The process for generating nodes starts with looping through the loaded demographics
        nodes. For each node, we:

        1. First determine the node's id. If the node has a forced id set, we use that. If we
           are using a custom resolution, we use the index (i.e. 1, 2, 3...). Otherwise, we
           build the node id from the lat and lon of the node.
        2. We then start to populate the node_attributes and individual_attributes for the
           current node. The node_attributes will have data loaded from the initial nodes fed
           into DemographicsGenerator. The individual_attributes start off as an empty dict.
        3. We next determine the birth rate for the node. If the node attributes contain a
           Country element, we first look up the birth rate from the World Pop data. We then
           build a MortalityDistribution configuration with country-specific configuration
           elements and add that to the individual attributes. If there is no Country element
           in the node attributes, we set the birth rate to the default_birth_rate. This value
           was set in the initialization of the DemographicsGenerator to the birth rate of the
           specified country from the World Pop data.
        4. We then calculate the per_node_birth_rate using get_per_node_birth_rate and set the
           birth rate on the node attributes.
        5. We then calculate the equilibrium_age_distribution and use that to create the
           AgeDistribution in individual_attributes.
        6. We then add each new demographics node to a list to be returned at the end of the
           function.
        """
        print("Generating demographics nodes.")
        nodes = []  # a list of dicts ('NodeID', 'NodeAttributes', 'IndividualAttributes') to return

        def generate_node_id(i, node):
            if node.forced_id:
                node_id = node.forced_id
            elif self.node_id_from_lat_long:
                node_id = nodeid_from_lat_lon(
                    float(node.lat), float(node.lon), self.res_in_degrees
                )
            else:
                node_id = i + 1
            return node_id

        for i, node in enumerate(node_list):
            # if res_in_degrees is custom, assume node_ids are generated for a household-like
            # setup and not based on lat/lon
            node_id = generate_node_id(i, node)
            node_attributes = node.to_dict()
            individual_attributes = {}

            # Run this node through the concern chain, if any concerns were provided
            if self.concerns:
                self.concerns.update_node(
                    defaults, node, node_attributes, individual_attributes
                )

            print(f"Adding node {node_id}.")
            nodes.append(
                {
                    "NodeID": node_id,
                    "NodeAttributes": node_attributes,
                    "IndividualAttributes": individual_attributes,
                }
            )

        return nodes
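
    # Illustrative sketch (not part of the original module): the shape of one entry in the
    # list returned by generate_nodes(), assuming node.to_dict() surfaces the latitude,
    # longitude, and population loaded from the input file and no concerns are configured.
    # The id and attribute values shown are made up for the example.
    #
    #     {
    #         "NodeID": 1,
    #         "NodeAttributes": {"Latitude": -8.5, "Longitude": 36.2, "InitialPopulation": 1000},
    #         "IndividualAttributes": {},
    #     }
    #
    # Concerns, when provided, mutate node_attributes/individual_attributes in place
    # (e.g. adding a BirthRate or an AgeDistribution) before each entry is appended.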
    @staticmethod
    def __to_grid_file(
        grid_file_name,
        demographics,
        include_attributes: Optional[List[str]] = None,
        node_attributes: Optional[List[str]] = None,
    ):
        """
        Convert a demographics object (the full object represented as a nested dictionary)
        to a grid file.

        Args:
            grid_file_name: Name of grid file to save
            demographics: Demographics object
            include_attributes: Top-level node attributes to include in the export
            node_attributes: Optional list of attributes from the NodeAttributes path to include

        Returns:
            None. The grid is written to grid_file_name as csv.
        """
        node_attrs = ["Latitude", "Longitude", "InitialPopulation"]
        if include_attributes is None:
            include_attributes = []

        rows = []
        for node in demographics["Nodes"]:
            row = {
                k: v
                for k, v in node.items()
                if k in include_attributes or k in node_attrs
            }
            if node_attributes and "NodeAttributes" in row:
                other = {
                    k: v
                    for k, v in row["NodeAttributes"].items()
                    if k in node_attributes
                }
                row.update(other)
            rows.append(row)
        pd.DataFrame(rows).to_csv(grid_file_name)
    def generate_metadata(self):
        """
        Generate the demographics file metadata.
        """
        if self.resolution == CUSTOM_RESOLUTION:
            reference_id = "Custom user"
        else:
            reference_id = "Gridded world grump%darcsec" % self.res_in_arcsec

        metadata = {
            "Author": "idm",
            "Tool": "dtk-tools",
            "IdReference": reference_id,
            "DateCreated": str(datetime.now()),
            "NodeCount": len(node_list),
            "Resolution": int(self.res_in_arcsec),
        }
        return metadata
    def generate_demographics(self):
        """
        Return all demographics file components in a single dictionary; a valid DTK
        demographics file when dumped as json.
        """
        print("Generating demographics dictionary from nodes and defaults.")
        if self.concerns:
            self.concerns.update_defaults(defaults)
        nodes = self.generate_nodes(defaults)
        self.demographics = {
            "Nodes": nodes,
            "Defaults": defaults,
            "Metadata": self.generate_metadata(),
        }
        return self.demographics
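
# Illustrative usage sketch (not part of the original module): assumes the module-level
# node_list has already been populated (e.g. by from_dataframe/from_file below) and that
# a 30 arcsecond grid is wanted. Names and values are for the example only.
#
#     gen = DemographicsGenerator(node_list, res_in_arcsec=30, node_id_from_lat_long=True)
#     demographics = gen.generate_demographics()
#     # demographics == {"Nodes": [...], "Defaults": {...}, "Metadata": {...}}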
# MOVE TO demographics/DemographicsInputDataParsers.py
def from_dataframe(
    df,
    demographics_filename: Optional[str] = None,
    concerns: Optional[
        Union[DemographicsGeneratorConcern, List[DemographicsGeneratorConcern]]
    ] = None,
    res_in_arcsec=CUSTOM_RESOLUTION,
    node_id_from_lat_long=True,
    default_population: int = 1000,
    load_other_columns_as_attributes=False,
    include_columns: Optional[List[str]] = None,
    exclude_columns: Optional[List[str]] = None,
    nodeid_column_name: Optional[str] = None,
    latitude_column_name: str = "lat",
    longitude_column_name: str = "lon",
    population_column_name: str = "pop",
):
    """
    Generate a demographics file from a dataframe.

    Args:
        df: pandas DataFrame containing demographics information. Must contain the columns
            specified by latitude_column_name and longitude_column_name. The
            population_column_name column is optional; if not found, we fall back to
            default_population
        demographics_filename: demographics file to save the demographics to. This is optional
        concerns (Optional[Union[DemographicsGeneratorConcern, List[DemographicsGeneratorConcern]]]):
            Concern(s) to apply while generating the demographics
        res_in_arcsec: Resolution in arcseconds
        node_id_from_lat_long: Determine if we should calculate the node id from the lat/lon.
            By default this is true unless you also set res_in_arcsec to CUSTOM_RESOLUTION.
            When not using lat/lon for ids, the first fallback is to check the node for a
            forced id. If that is not found, we assign it an index as id
        load_other_columns_as_attributes: Load additional columns from the csv file as node
            attributes
        include_columns: A list of columns that should be added as node attributes from the
            csv file. To be used in conjunction with load_other_columns_as_attributes.
        exclude_columns: A list of columns that should be ignored as attributes when
            load_other_columns_as_attributes is enabled. This cannot be combined with
            include_columns
        default_population: Default population. Only used if population_column_name does not
            exist
        nodeid_column_name: Column name to load nodeid values from
        latitude_column_name: Column name to load latitude values from
        longitude_column_name: Column name to load longitude values from
        population_column_name: Column name to load population values from

    Returns:
        demographics file as a dictionary
    """
    print("from_dataframe: Reading data.")
    warn_no_pop = False
    validate_res_in_arcsec(res_in_arcsec)
    res_in_deg = arcsec_to_deg(VALID_RESOLUTIONS[res_in_arcsec])

    if latitude_column_name not in df.columns.values:
        raise ValueError(
            f"Column {latitude_column_name} is required in input population file."
        )
    if longitude_column_name not in df.columns.values:
        raise ValueError(
            f"Column {longitude_column_name} is required in input population file."
        )

    if not warn_no_pop and population_column_name not in df.columns.values:
        warn_no_pop = True
        logger.warning(
            f"Could not locate population column {population_column_name}. Using the default "
            f"population value of {default_population}"
        )
        df[population_column_name] = default_population
    else:
        df[population_column_name] = df[population_column_name].astype(int)

    if not node_id_from_lat_long and not nodeid_column_name:
        logger.warning("NodeID column not specified. Reverting to csv index + 1")
        df["node_label"] = df.index + 1

    if node_id_from_lat_long and "node_label" not in df.columns.values:
        df["node_label"] = df.apply(
            lambda x: nodeid_from_lat_lon(
                x[latitude_column_name], x[longitude_column_name], res_in_deg
            ),
            axis=1,
        )

    if include_columns:
        include_columns = [x for x in include_columns if x in df.columns.values]

    for r, row in df.iterrows():
        extra_attrs = {}
        if load_other_columns_as_attributes:
            if include_columns:
                extra_attrs = {x: row[x] for x in include_columns}
            elif exclude_columns:
                exclude_columns += [
                    latitude_column_name,
                    longitude_column_name,
                    population_column_name,
                    nodeid_column_name,
                ]
                extra_attrs = {
                    x: row[x]
                    for x in df.columns.values
                    if x not in exclude_columns
                }

        node_label = nodeid_column_name if nodeid_column_name else "node_label"
        # Append the newly created node to the list
        node_list.append(
            Node(
                row[latitude_column_name],
                row[longitude_column_name],
                row[population_column_name],
                forced_id=row[node_label],
                extra_attributes=extra_attrs,
                # name=int(node_ID_from_lat_long(lat, lon, res)),
            )
        )

    # node_list now exists -- what about defaults?

    # this is kind of ugly; we're inside a module-level function, and creating an
    # instance of the class just so we can call generate_demographics on it.
    # pretty sure we can do this all with static everything, which kind of
    # eliminates the need for the class; just do it as module variables.
    demo = DemographicsGenerator(
        node_list,
        concerns=concerns,
        res_in_arcsec=res_in_arcsec,
        node_id_from_lat_long=node_id_from_lat_long,
    )
    demographics = demo.generate_demographics()

    if demographics_filename:  # why would this be left unset? use case?
        # Option 1 to write: the plain Demographics wrapper around the node list
        demographics_from_nodes = Demographics(nodes=node_list)
        demographics_from_nodes.generate_file(demographics_filename + "_DF")

        # Option 2 to write: dump the generated dictionary directly
        print(f"Writing {demographics_filename}.")
        with open(demographics_filename, "w+") as demo_f:
            json.dump(demographics, demo_f, indent=4, sort_keys=True)
    else:
        print("demographics_filename was not defined. Not written.")

    return demographics
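
# Illustrative usage sketch (not part of the original module): building a demographics
# dict from an in-memory DataFrame. The column names are the defaults from the signature
# above; the values and the output file name are assumptions for the example.
#
#     example_df = pd.DataFrame(
#         {"lat": [-8.5, -8.6], "lon": [36.2, 36.3], "pop": [1200, 800]}
#     )
#     demographics = from_dataframe(example_df, demographics_filename="demographics.json")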
# MOVE TO demographics/DemographicsInputDataParsers.py
def from_file(
    population_input_file: str,
    demographics_filename: Optional[str] = None,
    concerns: Optional[
        Union[DemographicsGeneratorConcern, List[DemographicsGeneratorConcern]]
    ] = None,
    res_in_arcsec=CUSTOM_RESOLUTION,
    node_id_from_lat_long=True,
    default_population: int = 1000,
    load_other_columns_as_attributes=False,
    include_columns: Optional[List[str]] = None,
    exclude_columns: Optional[List[str]] = None,
    nodeid_column_name: Optional[str] = None,
    latitude_column_name: str = "lat",
    longitude_column_name: str = "lon",
    population_column_name: str = "pop",
):
    """
    Generate a demographics file from a CSV population file.

    Args:
        population_input_file: CSV population file. Must contain the columns specified by
            latitude_column_name and longitude_column_name. The population_column_name column
            is optional; if not found, we fall back to default_population
        demographics_filename: demographics file to save the demographics to. This is optional
        concerns (Optional[Union[DemographicsGeneratorConcern, List[DemographicsGeneratorConcern]]]):
            Concern(s) to apply while generating the demographics
        res_in_arcsec: Resolution in arcseconds
        node_id_from_lat_long: Determine if we should calculate the node id from the lat/lon.
            By default this is true unless you also set res_in_arcsec to CUSTOM_RESOLUTION.
            When not using lat/lon for ids, the first fallback is to check the node for a
            forced id. If that is not found, we assign it an index as id
        load_other_columns_as_attributes: Load additional columns from the csv file as node
            attributes
        include_columns: A list of columns that should be added as node attributes from the
            csv file. To be used in conjunction with load_other_columns_as_attributes.
        exclude_columns: A list of columns that should be ignored as attributes when
            load_other_columns_as_attributes is enabled. This cannot be combined with
            include_columns
        default_population: Default population. Only used if population_column_name does not
            exist
        nodeid_column_name: Column name to load nodeid values from
        latitude_column_name: Column name to load latitude values from
        longitude_column_name: Column name to load longitude values from
        population_column_name: Column name to load population values from

    Returns:
        demographics file as a dictionary
    """
    print("from_file: Reading data.")
    df = pd.read_csv(population_input_file)
    return from_dataframe(
        df,
        demographics_filename=demographics_filename,
        concerns=concerns,
        res_in_arcsec=res_in_arcsec,
        node_id_from_lat_long=node_id_from_lat_long,
        default_population=default_population,
        load_other_columns_as_attributes=load_other_columns_as_attributes,
        include_columns=include_columns,
        exclude_columns=exclude_columns,
        nodeid_column_name=nodeid_column_name,
        latitude_column_name=latitude_column_name,
        longitude_column_name=longitude_column_name,
        population_column_name=population_column_name,
    )
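
# Illustrative usage sketch (not part of the original module): generating a demographics
# file from a csv on disk. The file names are assumptions for the example; the csv must
# contain at least the 'lat' and 'lon' columns (and optionally 'pop').
#
#     demographics = from_file("population.csv", demographics_filename="demographics.json")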
""" from_gridfile from_dataframe __init__ set_resolution generate_demographics generate_nodes generate_metadata """