import csv
import json
from datetime import datetime
from enum import Enum
from typing import Optional, Union, List
import pandas as pd
import pdb
from emod_api.dtk_tools.demographics.DemographicsGeneratorConcern import (
DemographicsGeneratorConcern,
DemographicsGeneratorConcernChain,
)
from emod_api.demographics.Node import Node, nodeid_from_lat_lon
from emod_api.demographics.Demographics import Demographics
from emod_api.dtk_tools.support.General import init_logging
# Module-level logger shared by all functions in this module.
logger = init_logging("DemographicsGenerator")
# defaults section stored as module-variable, a dict (for now)
# structure will match file json structure initially, then be
# changed to a class with some abstraction from file format
defaults = {}
# Node class is used to store population data for spatial nodes.
# NOTE(review): this module-level list is appended to by from_dataframe() and
# read by DemographicsGenerator.generate_nodes(); calling from_dataframe()
# more than once in the same process accumulates nodes across calls.
node_list = []
# Demographics is used to write the file to disk
# Sentinel resolution meaning node ids are NOT derived from lat/lon geometry.
CUSTOM_RESOLUTION = "custom"
DEFAULT_RESOLUTION = 30
# Mapping of requested resolution -> arcsecond resolution used for metadata.
# All values must be integers ('custom' falls back to 30 arcseconds).
VALID_RESOLUTIONS = {30: 30, 250: 250, CUSTOM_RESOLUTION: 30}
class InvalidResolution(Exception):
    """
    Raised when a requested grid resolution is not one of VALID_RESOLUTIONS.

    Derives from Exception (previously BaseException): application errors
    should not share a base with SystemExit/KeyboardInterrupt, and callers
    using ``except Exception`` should be able to catch this.
    """
    pass
class DemographicsType(Enum):
    """
    Enumeration of demographics generation modes.

    Only STATIC is currently implemented (see
    DemographicsGenerator.generate_nodes).
    """

    STATIC = "static"

    def __str__(self):
        # Render as the bare value ("static") so the enum can be embedded
        # directly in generated file content / log messages.
        return str(self.value)
def arcsec_to_deg(arcsec: float) -> float:
    """
    Convert arcseconds to degrees.

    Args:
        arcsec: angle in arcseconds

    Returns:
        The angle converted to degrees (1 degree == 3600 arcseconds).
    """
    return arcsec / 3600.0
def validate_res_in_arcsec(res_in_arcsec):
    """
    Validate that the resolution is valid.

    Args:
        res_in_arcsec: Resolution in arcseconds. Supported values are the keys
            of VALID_RESOLUTIONS (30, 250, or CUSTOM_RESOLUTION).

    Returns:
        None.

    Raises:
        InvalidResolution: if the resolution is not a key of VALID_RESOLUTIONS.
    """
    if res_in_arcsec not in VALID_RESOLUTIONS:
        # BUG FIX: the message previously referenced an undefined name `cls`
        # (this is a module-level function, not a classmethod), so invalid
        # input raised NameError instead of InvalidResolution.
        raise InvalidResolution(
            f"{res_in_arcsec} is not a valid arcsecond resolution."
            f" Must be one of: {VALID_RESOLUTIONS.keys()}"
        )
class DemographicsGenerator:
    """
    Generates demographics file based on population input file.

    The population input file is csv with structure::

        node_label*, lat, lon, pop*

    *-ed columns are optional.

    NOTE(review): node data is read from the module-level ``node_list``
    (populated by from_dataframe()/from_file()), not from the ``nodes``
    constructor argument — confirm before relying on the latter.
    """

    # mapping of requested arcsecond resolution -> demographic metadata arcsecond resolution.
    # All Hash values must be integers.
    def __init__(
        self,
        nodes,
        concerns: Optional[
            Union[DemographicsGeneratorConcern, List[DemographicsGeneratorConcern]]
        ] = None,
        res_in_arcsec=CUSTOM_RESOLUTION,
        node_id_from_lat_long=False,
    ):
        """
        Initialize the Demographics generator.

        Args:
            nodes: list of nodes. NOTE(review): currently unused — kept for
                interface compatibility; generate_nodes() iterates the
                module-level node_list instead.
            concerns: a single DemographicsGeneratorConcern, or a list of them
                (a list is chained into one DemographicsGeneratorConcernChain),
                applied to the defaults and to each node during generation.
            res_in_arcsec: Simulation grid resolution (30, 250 or 'custom').
            node_id_from_lat_long: when True, node ids are derived from lat/lon
                via nodeid_from_lat_lon().
        """
        print( "Creating DemographicsGenerator instance." )
        #self.nodes = nodes
        # currently only static is implemented in generate_nodes(self)
        self.demographics_type = (
            DemographicsType.STATIC
        )  # could be 'static', 'growing' or a different type;
        self.node_id_from_lat_long = node_id_from_lat_long
        self.set_resolution(res_in_arcsec)
        if concerns and isinstance(concerns, list):
            # Chain multiple concerns into one, applied in order.
            concerns = DemographicsGeneratorConcernChain(*concerns)
        self.concerns = concerns
        # demographics data dictionary (working DTK demographics file when dumped as json)
        self.demographics = None

    def set_resolution(self, res_in_arcsec):
        """
        The canonical way to set arcsecond/degree resolutions on a
        DemographicsGenerator object. Verifies everything is set properly.

        Args:
            res_in_arcsec: The requested resolution. e.g. 30, 250, 'custom'

        Returns:
            No return value.

        Raises:
            InvalidResolution: if res_in_arcsec is not a supported value.
        """
        validate_res_in_arcsec(res_in_arcsec)
        # Keep both the raw selection and the resolved integer arcsecond value:
        # 'custom' resolves to 30 arcseconds via VALID_RESOLUTIONS.
        self.resolution = res_in_arcsec
        self.res_in_arcsec = VALID_RESOLUTIONS[res_in_arcsec]
        self.res_in_degrees = arcsec_to_deg(self.res_in_arcsec)
        if logger:
            logger.debug(
                "Setting resolution to %s arcseconds (%s deg.) from selection: %s"
                % (self.res_in_arcsec, self.res_in_degrees, res_in_arcsec)
            )

    def generate_nodes(self, defaults):
        """
        Generate demographics file nodes.

        For each node in the module-level node_list we:

        1. Determine the node's id: a forced id if the node has one, else an id
           derived from lat/lon (when node_id_from_lat_long is set), else the
           1-based index.
        2. Populate node_attributes from the node's data and start
           individual_attributes as an empty dict.
        3. If a concern (chain) is configured, let it update both attribute
           dicts (e.g. birth rate / age distribution handling).
        4. Append a {'NodeID', 'NodeAttributes', 'IndividualAttributes'} dict
           to the returned list.

        Args:
            defaults: the demographics "Defaults" dict, passed through to the
                concern(s).

        Returns:
            list of node dictionaries.
        """
        print( "Generating demographics nodes." )
        nodes = []  # list of dicts ('NodeID', 'NodeAttributes', 'IndividualAttributes')

        def generate_node_id( i, node ):
            # Precedence: explicit forced id -> lat/lon-derived id -> 1-based index.
            # If res_in_degrees is custom, assume node_ids are generated for a
            # household-like setup and not based on lat/lon.
            if node.forced_id:
                return node.forced_id
            if self.node_id_from_lat_long:
                return nodeid_from_lat_lon(
                    float(node.lat), float(node.lon), self.res_in_degrees
                )
            return i + 1

        # NOTE(review): iterates the module-level node_list, not self.nodes.
        for i, node in enumerate(node_list):
            node_id = generate_node_id( i, node )
            node_attributes = node.to_dict()
            individual_attributes = {}
            # Run our node through the configured Concern chain, if any.
            if self.concerns:
                self.concerns.update_node(
                    defaults, node, node_attributes, individual_attributes
                )
            print( f"Adding node {node_id}." )
            nodes.append(
                {
                    "NodeID": node_id,
                    "NodeAttributes": node_attributes,
                    "IndividualAttributes": individual_attributes,
                }
            )
        return nodes

    @staticmethod
    def __to_grid_file(
        grid_file_name,
        demographics,
        include_attributes: Optional[List[str]] = None,
        node_attributes: Optional[List[str]] = None,
    ):
        """
        Convert a demographics object (nested dictionary) to a grid csv file.

        Args:
            grid_file_name: Name of grid file to save
            demographics: Demographics dict with a "Nodes" list
            include_attributes: Top-level node keys to include in the export
            node_attributes: Optional list of keys from the NodeAttributes
                sub-dict to flatten into the row

        Returns:
            None. Writes grid_file_name as csv.

        NOTE(review): the top-level filter keys ("Latitude", "Longitude",
        "InitialPopulation") normally live under "NodeAttributes", not at the
        node's top level — callers likely need include_attributes to contain
        "NodeAttributes" for the flattening branch to fire. Confirm intent.
        """
        node_attrs = ["Latitude", "Longitude", "InitialPopulation"]
        if include_attributes is None:
            include_attributes = []
        rows = []
        for node in demographics["Nodes"]:
            row = {
                k: v
                for k, v in node.items()
                if k in include_attributes or k in node_attrs
            }
            if node_attributes and "NodeAttributes" in row:
                # Flatten selected NodeAttributes keys into the csv row.
                other = {
                    k: v
                    for k, v in row["NodeAttributes"].items()
                    if k in node_attributes
                }
                row.update(other)
            rows.append(row)
        pd.DataFrame(rows).to_csv(grid_file_name)

    def generate_demographics(self):
        """
        Return all demographics file components in a single dictionary; a valid
        DTK demographics file when dumped as json.
        """
        print( "Generating demographics dictionary from nodes and defaults." )
        if self.concerns:
            self.concerns.update_defaults(defaults)
        nodes = self.generate_nodes(defaults)
        self.demographics = {
            "Nodes": nodes,
            "Defaults": defaults,
            "Metadata": self.generate_metadata(),
        }
        return self.demographics

    def generate_metadata(self):
        """
        Generate the demographics file "Metadata" section.

        BUG FIX: generate_demographics() called this method but it was never
        defined, so demographics generation raised AttributeError. Field
        values below follow the conventions of sibling IDM demographics
        generators — TODO(review): confirm "IdReference"/"Tool" values match
        the rest of this project.
        """
        return {
            "Author": "idm",
            "Tool": "dtk-tools",
            "IdReference": "Gridded world grump30arcsec",
            "DateCreated": str(datetime.now()),
            "NodeCount": len(node_list),
            "Resolution": int(self.res_in_arcsec),
        }
# MOVE TO demographics/DemographicsInputDataParsers.py
def from_dataframe(
    df,
    demographics_filename: Optional[str] = None,
    concerns: Optional[
        Union[DemographicsGeneratorConcern, List[DemographicsGeneratorConcern]]
    ] = None,
    res_in_arcsec=CUSTOM_RESOLUTION,
    node_id_from_lat_long=True,
    default_population: int = 1000,
    load_other_columns_as_attributes=False,
    include_columns: Optional[List[str]] = None,
    exclude_columns: Optional[List[str]] = None,
    nodeid_column_name: Optional[str] = None,
    latitude_column_name: str = "lat",
    longitude_column_name: str = "lon",
    population_column_name: str = "pop",
):
    """
    Generates a demographics file from a dataframe.

    Args:
        df: pandas DataFrame containing demographics information. Must contain
            the columns named by latitude_column_name and longitude_column_name.
            The population_column_name column is optional; if missing we fall
            back to default_population.
        demographics_filename: file to save the demographics to. Optional; when
            omitted nothing is written and None is returned.
        concerns: DemographicsGeneratorConcern (or list of them) to apply
            during generation.
        res_in_arcsec: Resolution in arcseconds (30, 250 or 'custom').
        node_id_from_lat_long: Determine if we should calculate the node id
            from lat/long. When not using lat/long for ids, the first fallback
            is the node's forced id; failing that, a 1-based index.
        default_population: Default population. Only used if
            population_column_name does not exist.
        load_other_columns_as_attributes: Load additional columns from the
            dataframe as node attributes.
        include_columns: Columns to add as node attributes (used with
            load_other_columns_as_attributes).
        exclude_columns: Columns to ignore as attributes when
            load_other_columns_as_attributes is enabled. Cannot be combined
            with include_columns. The caller's list is not modified.
        nodeid_column_name: Column name to load nodeid values from.
        latitude_column_name: Column name to load latitude values from.
        longitude_column_name: Column name to load longitude values from.
        population_column_name: Column name to load population values from.

    Returns:
        demographics dictionary when demographics_filename is given, else None.

    Raises:
        ValueError: if the latitude or longitude column is missing.
        InvalidResolution: if res_in_arcsec is not supported.
    """
    print( "from_dataframe: Reading data." )
    validate_res_in_arcsec(res_in_arcsec)
    res_in_deg = arcsec_to_deg(VALID_RESOLUTIONS[res_in_arcsec])

    # Latitude and longitude columns are mandatory.
    if latitude_column_name not in df.columns.values:
        raise ValueError(
            f"Column {latitude_column_name} is required in input population file."
        )
    if longitude_column_name not in df.columns.values:
        raise ValueError(
            f"Column {longitude_column_name} is required in input population file."
        )

    # Population column is optional: fall back to default_population.
    # (Removed the dead `warn_no_pop` flag -- it was always False here.)
    if population_column_name not in df.columns.values:
        logger.warning(
            f"Could not locate population column{population_column_name}. Using the default "
            f"population value of {default_population}"
        )
        df[population_column_name] = default_population
    else:
        df[population_column_name] = df[population_column_name].astype(int)

    # Decide where node labels come from: explicit column, lat/lon hash, or index.
    if not node_id_from_lat_long and not nodeid_column_name:
        logger.warning(f"NodeID column not specified. Reverting to csv index + 1")
        df["node_label"] = df.index + 1
    if node_id_from_lat_long and "node_label" not in df.columns.values:
        df["node_label"] = df.apply(
            lambda x: nodeid_from_lat_lon(
                x[latitude_column_name], x[longitude_column_name], res_in_deg
            ),
            axis=1,
        )

    # Keep only include_columns that actually exist in the dataframe.
    if include_columns:
        include_columns = [x for x in include_columns if x in df.columns.values]

    # BUG FIX: build the exclusion set locally instead of `exclude_columns += [...]`,
    # which mutated the caller's list on every call.
    excluded = set(exclude_columns) if exclude_columns else set()
    excluded |= {
        latitude_column_name,
        longitude_column_name,
        population_column_name,
        nodeid_column_name,
    }

    node_label_column = nodeid_column_name if nodeid_column_name else "node_label"
    for _, row in df.iterrows():
        extra_attrs = {}
        if load_other_columns_as_attributes:
            if include_columns:
                extra_attrs = {x: row[x] for x in include_columns}
            elif exclude_columns:
                extra_attrs = {
                    x: row[x] for x in df.columns.values if x not in excluded
                }
        # Append the newly created node to the module-level list.
        node_list.append(
            Node(
                row[latitude_column_name],
                row[longitude_column_name],
                row[population_column_name],
                forced_id=row[node_label_column],
                extra_attributes=extra_attrs,
            )
        )

    # BUG FIX: `demographics` used to be unbound (NameError at return) when no
    # filename was given, and the "Option 1" write below ran unconditionally,
    # crashing with `None + "_DF"` -- it also rebound the `df` parameter.
    demographics = None
    if demographics_filename:  # why would this be left unset? use case?
        # Option 1 to write: a bare Demographics file built straight from the nodes.
        Demographics(nodes=node_list).generate_file(demographics_filename + "_DF")
        # Option 2 to write: build the dict via DemographicsGenerator and dump json.
        # NOTE(review): this could likely be done with module-level statics
        # instead of instantiating the class just to call generate_demographics.
        demo = DemographicsGenerator(
            node_list,
            concerns=concerns,
            res_in_arcsec=res_in_arcsec,
            node_id_from_lat_long=node_id_from_lat_long,
        )
        demographics = demo.generate_demographics()
        print( f"Writing {demographics_filename}." )
        with open(demographics_filename, "w+") as demo_f:
            json.dump(demographics, demo_f, indent=4, sort_keys=True)
    else:
        print( "demographics_filename was not defined. Not written." )
    return demographics
# MOVE TO demographics/DemographicsInputDataParsers.py
def from_file(
    population_input_file: str,
    demographics_filename: Optional[str] = None,
    concerns: Optional[
        Union[DemographicsGeneratorConcern, List[DemographicsGeneratorConcern]]
    ] = None,
    res_in_arcsec=CUSTOM_RESOLUTION,
    node_id_from_lat_long=True,
    default_population: int = 1000,
    load_other_columns_as_attributes=False,
    include_columns: Optional[List[str]] = None,
    exclude_columns: Optional[List[str]] = None,
    nodeid_column_name: Optional[str] = None,
    latitude_column_name: str = "lat",
    longitude_column_name: str = "lon",
    population_column_name: str = "pop",
):
    """
    Generates a demographics file from a CSV population file.

    This is a thin wrapper: it loads the csv into a DataFrame and delegates
    all work to from_dataframe() — see that function for full parameter
    semantics.

    Args:
        population_input_file: CSV population file. Must contain the columns
            named by latitude_column_name and longitude_column_name; the
            population_column_name column is optional (default_population is
            used when absent).
        demographics_filename: file to save the demographics to. Optional.
        concerns: DemographicsGeneratorConcern (or list) to apply.
        res_in_arcsec: Resolution in arcseconds (30, 250 or 'custom').
        node_id_from_lat_long: derive node ids from lat/long when True.
        default_population: fallback population value.
        load_other_columns_as_attributes: load extra csv columns as node attributes.
        include_columns: columns to add as node attributes.
        exclude_columns: columns to ignore as attributes (not combinable with
            include_columns).
        nodeid_column_name: column to load nodeid values from.
        latitude_column_name: column to load latitude values from.
        longitude_column_name: column to load longitude values from.
        population_column_name: column to load population values from.

    Returns:
        demographics file as a dictionary (or None if nothing was written).
    """
    # BUG FIX: message previously said "from_gridfile", the function's old name.
    print( "from_file: Reading data." )
    df = pd.read_csv(population_input_file)
    return from_dataframe(
        df,
        demographics_filename=demographics_filename,
        concerns=concerns,
        res_in_arcsec=res_in_arcsec,
        node_id_from_lat_long=node_id_from_lat_long,
        default_population=default_population,
        load_other_columns_as_attributes=load_other_columns_as_attributes,
        include_columns=include_columns,
        exclude_columns=exclude_columns,
        nodeid_column_name=nodeid_column_name,
        latitude_column_name=latitude_column_name,
        longitude_column_name=longitude_column_name,
        population_column_name=population_column_name,
    )
"""
from_gridfile
from_dataframe
__init__
set_resolution
generate_demographics
generate_nodes
generate_metadata
"""