Source code for emodpy_hiv.demographics.un_world_pop
"""
This module contains methods for extracting data from UN World Population files.
https://population.un.org/wpp/
These methods output dataframes that can be used to initialize Demographic objects in EMOD.
"""
from typing import Union
from pathlib import Path
from importlib import resources
import pandas as pd
from emodpy_hiv.demographics.year_age_rate import YearAgeRate
import emodpy_hiv.countries.un_world_pop_data as un_data
POPULATION_FILES = {
"2012": "WPP2012_POP_F15_1_ANNUAL_POPULATION_BY_AGE_BOTH_SEXES.XLS",
"2015": "WPP2015_POP_F15_1_ANNUAL_POPULATION_BY_AGE_BOTH_SEXES.XLS",
"2019": "WPP2019_POP_F15_1_ANNUAL_POPULATION_BY_AGE_BOTH_SEXES.xlsx",
"2024": "WPP2024_POP_F02_1_POPULATION_5-YEAR_AGE_GROUPS_BOTH_SEXES.xlsx"
}
FERTILITY_FILES = {
"2012": "WPP2012_FERT_F07_AGE_SPECIFIC_FERTILITY.xlsx",
"2015": "WPP2015_FERT_F07_AGE_SPECIFIC_FERTILITY.XLS",
"2019": "WPP2019_FERT_F07_AGE_SPECIFIC_FERTILITY.xlsx",
"2024": "WPP2024_FERT_F02_FERTILITY_RATES_BY_5-YEAR_AGE_GROUPS_OF_MOTHER.xlsx"
}
MORTALITY_FILES = {
"2012": {
"male": "WPP2012_MORT_F17_2_ABRIDGED_LIFE_TABLE_MALE.xlsx",
"female": "WPP2012_MORT_F17_3_ABRIDGED_LIFE_TABLE_FEMALE.xlsx"
},
"2015": {
"male": "WPP2015_MORT_F17_2_ABRIDGED_LIFE_TABLE_MALE.XLS",
"female": "WPP2015_MORT_F17_3_ABRIDGED_LIFE_TABLE_FEMALE.XLS"
},
"2019": {
"male": "WPP2019_MORT_F17_2_ABRIDGED_LIFE_TABLE_MALE.xlsx",
"female": "WPP2019_MORT_F17_3_ABRIDGED_LIFE_TABLE_FEMALE.xlsx"
},
"2024": {
"male": "WPP2024_MORT_F07_2_ABRIDGED_LIFE_TABLE_MALE.xlsx",
"female": "WPP2024_MORT_F07_3_ABRIDGED_LIFE_TABLE_FEMALE.xlsx"
}
}
KNOWN_VERSIONS = ["2012", "2015", "2019", "2024"]
KNOWN_GENDERS = ["male", "female"]
def _check_version(version):
if version not in KNOWN_VERSIONS:
supported_versions = ""
for index, v in enumerate(KNOWN_VERSIONS):
supported_versions += v
if (index + 1) < len(KNOWN_VERSIONS):
supported_versions += ", "
raise ValueError(f"'version'= '{version}' is not supported.\nOnly {supported_versions} formats are supported.")
def _check_gender(gender):
if gender not in KNOWN_GENDERS:
raise ValueError(f"'gender'= '{gender}' is not supported.\nOnly 'male' and 'female' are supported.")
def _get_population_filename(version):
_check_version(version)
un_data_root = resources.files(un_data)
filename = Path(un_data_root, POPULATION_FILES[version])
return filename
def _get_fertility_filename(version):
_check_version(version)
un_data_root = resources.files(un_data)
filename = Path(un_data_root, FERTILITY_FILES[version])
return filename
def _get_mortality_filename(version, gender):
_check_version(version)
_check_gender(gender)
un_data_root = resources.files(un_data)
filename = Path(un_data_root, MORTALITY_FILES[version][gender])
return filename
def _check_country(country, possible_countries, filename):
if country not in possible_countries:
raise ValueError(f"'country'= '{country}' is not supported.\nThe file,\n{filename},\nonly supports the following countries:\n{possible_countries}")
def _check_filename(filename):
if ((isinstance(filename, Path) and not filename.exists()) or
(isinstance(filename, str) and not Path(filename).exists())):
raise ValueError(f"The file does not exist.\n{filename}")
def _check_years(years, possible_years, filename):
for year in years:
if year not in possible_years:
raise ValueError(f"'year'= '{year}' is not supported.\nThe file,\n{filename},\nonly supports the following years:\n{possible_years}")
[docs]def extract_population_by_age(country: str,
version: str,
years: list[float],
filename: Union[str,Path] = None):
"""
This code is for extracting population by age data from the files
downloaded from the UN World Pop (https://population.un.org/wpp/Download/Standard/Population/).
The code assumes that the files is in Strict Open XML Spreadsheet format. This format require
us to use the 'calamine' engine to read the file. The file is expected to be for both sexes
and to have 5 year age bins. (i.e. WPP2019_POP_F15_1_ANNUAL_POPULATION_BY_AGE_BOTH_SEXES.xlsx,
WPP2024_POP_F02_1_POPULATION_5-YEAR_AGE_GROUPS_BOTH_SEXES.xlsx)
Args:
country:
The name of the country used in the spreadsheet for which you want to extract the data.
version:
A string with the year/version of the file. Supported versions are 2012, 2015, 2019, 2024
years:
A list of years to get data for
NOTE: Data files say it is for July 1st of the given year.
filename:
If not provided, the 'version' will be used to select from the known versions. If a filename
is provided, it is assumed to be file from the UN World Pop website. It may have several sheets
but the data will be extracted from the first two. The 'estimates' sheet gives you data for the
past while the 'medium variant' sheet gives you the data for the future. It is expected to be in
the Strict Open XML Spreadsheet format.
Return
It will return a pandas DataFrame where each row is a year and the columns are for an age range
"""
if filename is None:
filename = _get_population_filename(version)
else:
_check_filename(filename)
# ---------------------------------------------------------
# --- Define column names and sheets to read the data from.
# ---------------------------------------------------------
COUNTRY_COL = "Major area, region, country or area *"
PERIOD_COL = "Reference date (as of 1 July)" # noqa: E221
AGE_00_04 = "0-4" # noqa: E221
AGE_05_09 = "5-9" # noqa: E221
AGE_10_14 = "10-14" # noqa: E221
AGE_15_19 = "15-19" # noqa: E221
AGE_20_24 = "20-24" # noqa: E221
AGE_25_29 = "25-29" # noqa: E221
AGE_30_34 = "30-34" # noqa: E221
AGE_35_39 = "35-39" # noqa: E221
AGE_40_44 = "40-44" # noqa: E221
AGE_45_49 = "45-49" # noqa: E221
AGE_50_54 = "50-54" # noqa: E221
AGE_55_59 = "55-59" # noqa: E221
AGE_60_64 = "60-64" # noqa: E221
AGE_65_69 = "65-69" # noqa: E221
AGE_70_74 = "70-74" # noqa: E221
AGE_75_79 = "75-79" # noqa: E221
AGE_80_84 = "80-84" # noqa: E221
AGE_85_89 = "85-89" # noqa: E221
AGE_90_94 = "90-94" # noqa: E221
AGE_95_99 = "95-99" # noqa: E221
AGE_100 = "100+" # noqa: E221
AGE_80_PLUS = "80+" # noqa: E221
sheet_list = []
age_cols = [AGE_00_04, AGE_05_09, AGE_10_14, AGE_15_19,
AGE_20_24, AGE_25_29, AGE_30_34, AGE_35_39,
AGE_40_44, AGE_45_49, AGE_50_54, AGE_55_59,
AGE_60_64, AGE_65_69, AGE_70_74, AGE_75_79,
AGE_80_84, AGE_85_89, AGE_90_94, AGE_95_99,
AGE_100]
medium_cols = age_cols
if version == "2012":
sheet_list = ["ESTIMATES", "MEDIUM FERTILITY"]
COUNTRY_COL = "Major area, region, country or area *"
PERIOD_COL = "Reference date (as of 1 July)" # noqa: E221
medium_cols = age_cols.copy()
age_cols.append(AGE_80_PLUS)
elif version == "2015":
sheet_list = ["ESTIMATES", "MEDIUM VARIANT"]
COUNTRY_COL = "Major area, region, country or area *"
PERIOD_COL = "Reference date (as of 1 July)" # noqa: E221
medium_cols = age_cols.copy()
age_cols.append(AGE_80_PLUS)
elif version == "2019":
sheet_list = ["ESTIMATES", "MEDIUM VARIANT"]
COUNTRY_COL = "Region, subregion, country or area *"
PERIOD_COL = "Reference date (as of 1 July)" # noqa: E221
elif version == "2024":
sheet_list = ["Estimates", "Medium variant"]
COUNTRY_COL = "Region, subregion, country or area *"
PERIOD_COL = "Year" # noqa: E221
else:
raise ValueError("version '{0}' is not supported.\nOnly 2012, 2015, 2019, 2024 formats are supported".format(version))
# --------------------------------
# --- Extract data from the sheets
# --------------------------------
df = pd.DataFrame()
for sheet in sheet_list:
cols_to_read = [COUNTRY_COL, PERIOD_COL]
if "estimate" in sheet.lower():
cols_to_read.extend(age_cols)
else:
cols_to_read.extend(medium_cols)
df_sheet = pd.read_excel(filename,
sheet_name=sheet,
skiprows=16,
usecols=cols_to_read,
engine="calamine")
_check_country(country, df_sheet[COUNTRY_COL].unique(), filename)
df_sheet = df_sheet[df_sheet[COUNTRY_COL] == country]
df = pd.concat([df, df_sheet])
df = df.drop(COUNTRY_COL, axis=1)
# -----------------------------------
# --- Get the rows of data we want
# -----------------------------------
_check_years(years, df[PERIOD_COL].unique(), filename)
df = df[df[PERIOD_COL].isin(years)]
# ---------------------------------------------------------------
# --- In the 2012, 2015 data, the Estimates sheet has this "80+" column
# --- that can be used instead of the 80-84, etc columns.
# --- NOTE: This is crappy logic, but I need to move on.
# ---------------------------------------------------------------
if version == "2012" or version == "2015":
val = df[AGE_80_PLUS].iloc[0]
has_80p = isinstance(val, float) or isinstance(val, int)
if has_80p:
df = df.drop([AGE_80_84, AGE_85_89, AGE_90_94, AGE_95_99, AGE_100], axis=1)
for col in [AGE_80_84, AGE_85_89, AGE_90_94, AGE_95_99, AGE_100]:
age_cols.remove(col)
else:
df = df.drop(AGE_80_PLUS, axis=1)
age_cols.remove(AGE_80_PLUS)
# -----------------------------------------------------------
# --- rename the age columns so they are just the minimum age
# -----------------------------------------------------------
new_col_names = {}
for col in age_cols:
if col == AGE_80_PLUS:
new_col_names[col] = "80"
elif len(col) == 3:
new_col_names[col] = col[0:1]
elif len(col) == 5:
new_col_names[col] = col[0:2]
else:
new_col_names[col] = col[0:3]
df.rename(columns=new_col_names, inplace=True)
# ------------------------------------------------------------------
# --- Some columns can have the "...". This changes those to zeros.
# --- (This could go away with the 80+ handling.)
# ------------------------------------------------------------------
#df[year] = ( pd.to_numeric(df[year], errors='coerce' ).fillna(0) )
# -----------------------------------------------------------------
# --- The data is in thousands of people so multiply by 1000
# --- and convert to integer since we want the value as an integer.
# -----------------------------------------------------------------
for column in df.columns:
if column != PERIOD_COL:
df[column] = df[column] * 1000
return df
[docs]def extract_population_by_age_for_ingest_form(filename, country, version, years, gender):
df = extract_population_by_age(filename=filename, country=country, version=version, years=years)
ret_df = pd.DataFrame()
ret_df["Year" ] = 0 # noqa: E202
ret_df["Gender" ] = 0 # noqa: E202
ret_df["AgeBin" ] = 0 # noqa: E202
ret_df["Population"] = 0
for col_name in df.columns:
if col_name != "Reference date (as of 1 July)":
for index, value in df[col_name].items():
year = df["Reference date (as of 1 July)"][index]
age_bin = f"[{str(col_name)}:{str(int(col_name)+5)})"
ret_df.loc[len(ret_df)] = [year, gender, age_bin, value]
return ret_df
[docs]def extract_population_by_age_and_distribution(country: str,
version: str,
year: int = 1960,
filename: Union[str, Path] = None):
"""
This code is for extracting population by age data from the files
downloaded from the UN World Pop (https://population.un.org/wpp/Download/Standard/Population/).
The code assumes that the files is in Strict Open XML Spreadsheet format. This format require
us to use the 'calamine' engine to read the file. The file is expected to be for both sexes
and to have 5 year age bins. (i.e. WPP2019_POP_F15_1_ANNUAL_POPULATION_BY_AGE_BOTH_SEXES.xlsx,
WPP2024_POP_F02_1_POPULATION_5-YEAR_AGE_GROUPS_BOTH_SEXES.xlsx)
Args:
country:
The name of the country used in the spreadsheet for which you want to extract the data.
version:
A string with the year/version of the file. Supported versions are 2012, 2015, 2019, 2024
year:
The year in the data to get the total population and age distribution for. Default is 1960
NOTE: Data files say it is for July 1st of the given year.
filename:
If not provided, the 'version' will be used to select from the known versions. If a filename
is provided, it is assumed to be file from the UN World Pop website. It may have several sheets
but the data will be only be extracted from the first sheet, 'estimates'. The 'estimates' sheet
gives you data for the past. It is assumed that you are using this function to get the starting
point of an EMOD simulation that starts in the past. It is expected to be in the Strict Open XML
Spreadsheet format.
Return:
It will return the total population for the given year PLUS a YearAgeRate object where
the "rate" column contains the fraction of people in that particular year and age ranges.
"""
if filename is None:
filename = _get_population_filename(version)
else:
_check_filename(filename)
# ---------------------------------------------------------
# --- Define column names and sheets to read the data from.
# ---------------------------------------------------------
COUNTRY_COL = "Major area, region, country or area *"
PERIOD_COL = "Reference date (as of 1 July)" # noqa: E221
AGE_00_04 = "0-4" # noqa: E221
AGE_05_09 = "5-9" # noqa: E221
AGE_10_14 = "10-14" # noqa: E221
AGE_15_19 = "15-19" # noqa: E221
AGE_20_24 = "20-24" # noqa: E221
AGE_25_29 = "25-29" # noqa: E221
AGE_30_34 = "30-34" # noqa: E221
AGE_35_39 = "35-39" # noqa: E221
AGE_40_44 = "40-44" # noqa: E221
AGE_45_49 = "45-49" # noqa: E221
AGE_50_54 = "50-54" # noqa: E221
AGE_55_59 = "55-59" # noqa: E221
AGE_60_64 = "60-64" # noqa: E221
AGE_65_69 = "65-69" # noqa: E221
AGE_70_74 = "70-74" # noqa: E221
AGE_75_79 = "75-79" # noqa: E221
AGE_80_84 = "80-84" # noqa: E221
AGE_85_89 = "85-89" # noqa: E221
AGE_90_94 = "90-94" # noqa: E221
AGE_95_99 = "95-99" # noqa: E221
AGE_100 = "100+" # noqa: E221
AGE_80_PLUS = "80+" # noqa: E221
sheet_list = []
age_cols = [AGE_00_04, AGE_05_09, AGE_10_14, AGE_15_19,
AGE_20_24, AGE_25_29, AGE_30_34, AGE_35_39,
AGE_40_44, AGE_45_49, AGE_50_54, AGE_55_59,
AGE_60_64, AGE_65_69, AGE_70_74, AGE_75_79,
AGE_80_84, AGE_85_89, AGE_90_94, AGE_95_99,
AGE_100]
if version == "2012" or version == "2015":
sheet_list = ["ESTIMATES"] # "MEDIUM FERTILITY"
COUNTRY_COL = "Major area, region, country or area *"
PERIOD_COL = "Reference date (as of 1 July)" # noqa: E221
age_cols.append(AGE_80_PLUS)
elif version == "2019":
sheet_list = ["ESTIMATES"] # "MEDIUM VARIANT"
COUNTRY_COL = "Region, subregion, country or area *"
PERIOD_COL = "Reference date (as of 1 July)" # noqa: E221
elif version == "2024":
sheet_list = ["Estimates"] # "Medium variant"
COUNTRY_COL = "Region, subregion, country or area *"
PERIOD_COL = "Year" # noqa: E221
else:
raise ValueError("version '{0}' is not supported.\nOnly 2012, 2015, 2019, 2024 formats are supported".format(version))
cols_to_read = [COUNTRY_COL, PERIOD_COL]
cols_to_read.extend(age_cols)
# --------------------------------
# --- Extract data from the sheets
# --------------------------------
df = pd.DataFrame()
for sheet in sheet_list:
df_sheet = pd.read_excel(filename,
sheet_name=sheet,
skiprows=16,
usecols=cols_to_read,
engine="calamine")
_check_country(country, df_sheet[COUNTRY_COL].unique(), filename)
df_sheet = df_sheet[df_sheet[COUNTRY_COL] == country]
df = pd.concat([df, df_sheet])
df = df.drop(COUNTRY_COL, axis=1)
# -----------------------------------
# --- Get the one row of data we want
# -----------------------------------
df = df[df[PERIOD_COL] == year]
# ---------------------------------------------------------------
# --- In the 2012, 2015 data, the Estimates sheet has this "80+" column
# --- that can be used instead of the 80-84, etc columns.
# --- NOTE: This is crappy logic, but I need to move on.
# ---------------------------------------------------------------
if version == "2012" or version == "2015":
val = df[AGE_80_PLUS].iloc[0]
has_80p = isinstance(val, float) or isinstance(val, int)
if has_80p:
df = df.drop([AGE_80_84, AGE_85_89, AGE_90_94, AGE_95_99, AGE_100], axis=1)
for col in [AGE_80_84, AGE_85_89, AGE_90_94, AGE_95_99, AGE_100]:
age_cols.remove(col)
else:
df = df.drop(AGE_80_PLUS, axis=1)
age_cols.remove(AGE_80_PLUS)
# -----------------------------------------------------------
# --- rename the age columns so they are just the minimum age
# -----------------------------------------------------------
new_col_names = {}
for col in age_cols:
if col == AGE_80_PLUS:
new_col_names[col] = "80"
elif len(col) == 3:
new_col_names[col] = col[0:1]
elif len(col) == 5:
new_col_names[col] = col[0:2]
else:
new_col_names[col] = col[0:3]
df.rename(columns=new_col_names, inplace=True)
# -------------------------------------------------------------------
# --- Transpose the data so we have rows of min ages for our one Year
# -------------------------------------------------------------------
df = df.set_index(PERIOD_COL, inplace=False)
df = df.transpose()
df = df.rename_axis("tmp_min_age")
# ------------------------------------------------------------------
# --- Some columns can have the "...". This changes those to zeros.
# --- (This could go away with the 80+ handling.)
# ------------------------------------------------------------------
df[year] = (pd.to_numeric(df[year], errors='coerce').fillna(0))
# ---------------------------------------------------------------
# --- Change the data to the fraction of people in that age range
# --- since this is supposed to be an age distribution.
# ---------------------------------------------------------------
total_pop = df[year].sum()
df = df / total_pop
# -----------------------------------------------------------------
# --- The data is in thousands of people so multiply by 1000
# --- and convert to integer since we want the value as an integer.
# -----------------------------------------------------------------
total_pop = int(1000 * total_pop)
# --------------------------------------------------
# --- Convert dataframe into a YearAgeRate dataframe
# --------------------------------------------------
df[YearAgeRate.COL_NAME_MIN_AGE ] = df.index.values # noqa: E202
df[YearAgeRate.COL_NAME_MIN_AGE ] = df[YearAgeRate.COL_NAME_MIN_AGE].astype(float) # noqa: E202
df[YearAgeRate.COL_NAME_NODE_ID ] = 0 # noqa: E202
df[YearAgeRate.COL_NAME_MIN_YEAR] = year
df.rename({year: YearAgeRate.COL_NAME_RATE}, axis=1, inplace=True)
df[YearAgeRate.COL_NAME_RATE] = df[YearAgeRate.COL_NAME_RATE].astype(float).round(8)
df = df[YearAgeRate.COL_NAMES]
df.reset_index()
return total_pop, YearAgeRate(df)
[docs]def extract_fertility(country: str,
version: str,
filename: Union[str, Path] = None):
"""
This code is for extracting fertility rates for the given country from the fertility files
downloaded from the UN World Pop (https://population.un.org/wpp/Download/Standard/Fertility/).
The code assumes that the files is in Strict Open XML Spreadsheet format. This format require
us to use the 'calamine' engine to read the file. The file is expected to be the Age Specific
rates (i.e. WPP2012_FERT_F07_AGE_SPECIFIC_FERTILITY, WPP2024_FERT_F02_FERTILITY_RATES_BY_5-YEAR_AGE_GROUPS_OF_MOTHER)
Args:
country
The name of the country used in the spreadsheet for which you want to extract the data.
version
A string with the year/version of the file. Supported versions are 2012, 2015, 2019, 2024
filename
If not provided, the 'version' will be used to select from the known versions. If a filename
is provided, it is assumed to be file from the UN World Pop website. It may have several sheets
but the data will be extracted from the first two. The 'estimates' sheet gives you data for the
past while the 'medium variant' sheet gives you the data for the future. It is expected to be in
the Strict Open XML Spreadsheet format.
Return
A YearAgeRate object containing the fertility data in the given file.
"""
if filename is None:
filename = _get_fertility_filename(version)
else:
_check_filename(filename)
# ---------------------------------------------------------
# --- Define column names and sheets to read the data from.
# ---------------------------------------------------------
PERIOD_COL = "Period" # noqa: E221
AGE_15_19 = "15-19" # noqa: E221
AGE_20_24 = "20-24" # noqa: E221
AGE_25_29 = "25-29" # noqa: E221
AGE_30_34 = "30-34" # noqa: E221
AGE_35_39 = "35-39" # noqa: E221
AGE_40_44 = "40-44" # noqa: E221
AGE_45_49 = "45-49" # noqa: E221
age_cols = [AGE_15_19, AGE_20_24, AGE_25_29, AGE_30_34, AGE_35_39, AGE_40_44, AGE_45_49]
if version == "2012":
sheet_list = ["ESTIMATES", "MEDIUM FERTILITY"]
COUNTRY_COL = "Major area, region, country or area *"
elif version == "2015":
sheet_list = ["ESTIMATES", "MEDIUM VARIANT"]
COUNTRY_COL = "Major area, region, country or area *"
elif version == "2019":
sheet_list = ["ESTIMATES", "MEDIUM VARIANT"]
COUNTRY_COL = "Region, subregion, country or area *"
elif version == "2024":
sheet_list = ["Estimates", "Medium variant"]
COUNTRY_COL = "Region, subregion, country or area *"
PERIOD_COL = "Year" # noqa: E221
AGE_10_14 = "10-14"
AGE_50_54 = "50-54"
age_cols.insert(0, AGE_10_14)
age_cols.append(AGE_50_54)
else:
raise ValueError("version '{0}' is not supported.\nOnly 2012, 2015, 2019, 2024 formats are supported".format(version))
cols_to_read = [COUNTRY_COL, PERIOD_COL]
cols_to_read.extend(age_cols)
# --------------------------------
# --- Extract data from the sheets
# --------------------------------
df = pd.DataFrame()
for sheet in sheet_list:
df_sheet = pd.read_excel(filename,
sheet_name=sheet,
skiprows=16,
usecols=cols_to_read,
engine="calamine")
_check_country(country, df_sheet[COUNTRY_COL].unique(), filename)
df_sheet = df_sheet[df_sheet[COUNTRY_COL] == country]
df = pd.concat([df, df_sheet])
df = df.drop(COUNTRY_COL, axis=1)
# ------------------------------------------------------------
# --- Change the Period column to only contain min value/year
# ------------------------------------------------------------
if version == "2012" or version == "2015" or version == "2019":
df[PERIOD_COL] = df[PERIOD_COL].str.slice_replace(4, 9, "").astype(float)
# -----------------------------------------------------------
# --- rename the age columns so they are just the minimum age
# -----------------------------------------------------------
new_col_names = {}
for col in age_cols:
new_col_names[col] = col[0:2]
df.rename(columns=new_col_names, inplace=True)
df = pd.melt(df, id_vars=[PERIOD_COL], var_name=YearAgeRate.COL_NAME_MIN_AGE, value_name=YearAgeRate.COL_NAME_RATE)
# ----------------------------------------------------------
# --- Convert dataframe into a YearAgeRate dataframe format
# ----------------------------------------------------------
df.rename({PERIOD_COL: YearAgeRate.COL_NAME_MIN_YEAR}, axis=1, inplace=True)
df[YearAgeRate.COL_NAME_MIN_AGE ] = df[YearAgeRate.COL_NAME_MIN_AGE ].astype(float) # noqa: E202
df[YearAgeRate.COL_NAME_MIN_YEAR] = df[YearAgeRate.COL_NAME_MIN_YEAR].astype(float)
df[YearAgeRate.COL_NAME_NODE_ID ] = 0 # noqa: E202
df = df.sort_values(by=[YearAgeRate.COL_NAME_MIN_YEAR, YearAgeRate.COL_NAME_MIN_AGE], ascending=True)
df[YearAgeRate.COL_NAME_RATE] = df[YearAgeRate.COL_NAME_RATE].astype(float).round(1)
df = df[YearAgeRate.COL_NAMES]
return YearAgeRate(df=df)
[docs]def extract_mortality(country: str,
version: str,
gender: str = None,
filename: Union[str, Path] = None):
"""
This code is for extracting mortality rates for the given country from the mortality files
downloaded from the UN World Pop (https://population.un.org/wpp/Download/Standard/Mortality/).
The code assumes that the files is in Strict Open XML Spreadsheet format. This format require
us to use the 'calamine' engine to read the file. The file is expected to be for one gender
and to be the Abriged Life Table.
Args:
country:
The name of the country used in the spreadsheet for which you want to extract the data.
version:
A string with the year/version of the file. Supported versions are 2012, 2015, 2019, 2024
gender:
The gender of the data to be extracted. Possible values are 'male' and 'female'.
Required if the filename is not provided.
filename:
If not provided, the 'version' will be used to select from the known versions. If a filename
is provided, it is assumed to be file from the UN World Pop website. It may have several sheets
but the data will be extracted from the first one plus the next one or two. The 'estimates'
sheet gives you data for the past while the 'medium XXX' sheets give you the data for the future.
It is expected to be in the Strict Open XML Spreadsheet format.
Return
A YearAgeRate object containing the mortality data in the given file.
"""
if filename is None:
filename = _get_mortality_filename(version, gender)
else:
_check_filename(filename)
# ---------------------------------------------------------
# --- Define column names and sheets to read the data from.
# ---------------------------------------------------------
COUNTRY_COL = "Region" # noqa: E221
PERIOD_COL = "Period" # noqa: E221
AGE_COL = "Age (x)" # noqa: E221
AGE_INTERVAL_COL = "Age interval (n)"
DEATH_RATE_COL = "Central death rate m(x,n)" # noqa: E221
sheet_list = []
if version == "2012":
COUNTRY_COL = "Major area, region, country or area *"
sheet_list = ["ESTIMATES", "MEDIUM_2010-2050", "MEDIUM_2050-2100"]
elif version == "2015":
COUNTRY_COL = "Major area, region, country or area *"
sheet_list = ["ESTIMATES", "MEDIUM 2015-2050", "MEDIUM 2050-2100"]
elif version == "2019":
COUNTRY_COL = "Region, subregion, country or area *"
sheet_list = ["ESTIMATES", "MEDIUM 2020-2050", "MEDIUM 2050-2100"]
elif version == "2024":
COUNTRY_COL = "Region, subregion, country or area *"
PERIOD_COL = "Year" # noqa: E221
sheet_list = ["Estimates", "Medium variant"]
else:
raise ValueError("version '{0}' is not supported.\nOnly 2012, 2015, 2019, 2024 formats are supported".format(version))
cols_to_read = [COUNTRY_COL, PERIOD_COL, AGE_COL, AGE_INTERVAL_COL, DEATH_RATE_COL]
col_rename_dict = {
PERIOD_COL : YearAgeRate.COL_NAME_MIN_YEAR, # noqa: E203
AGE_COL : YearAgeRate.COL_NAME_MIN_AGE, # noqa: E203
DEATH_RATE_COL: YearAgeRate.COL_NAME_RATE
}
# --------------------------------
# --- Extract data from the sheets
# --------------------------------
df = pd.DataFrame()
for sheet in sheet_list:
df_sheet = pd.read_excel(filename,
sheet_name=sheet,
skiprows=16,
usecols=cols_to_read,
engine="calamine")
_check_country(country, df_sheet[COUNTRY_COL].unique(), filename)
df_sheet = df_sheet[df_sheet[COUNTRY_COL] == country]
df = pd.concat([df, df_sheet])
# ------------------------------------------------------------
# --- Change the Period column to only contain min value/year
# ------------------------------------------------------------
if version == "2012" or version == "2015" or version == "2019":
df[PERIOD_COL] = df[PERIOD_COL].str.slice_replace(4, 9, "").astype(float)
# ----------------------------------------------------------
# --- Convert dataframe into a YearAgeRate dataframe format
# ----------------------------------------------------------
df.rename(col_rename_dict, axis=1, inplace=True)
df = df.drop([COUNTRY_COL, AGE_INTERVAL_COL], axis=1)
df[YearAgeRate.COL_NAME_NODE_ID ] = 0 # noqa: E202
df[YearAgeRate.COL_NAME_MIN_YEAR] = df[YearAgeRate.COL_NAME_MIN_YEAR].astype(float)
df[YearAgeRate.COL_NAME_MIN_AGE ] = df[YearAgeRate.COL_NAME_MIN_AGE ].astype(float) # noqa: E202
df[YearAgeRate.COL_NAME_RATE ] = df[YearAgeRate.COL_NAME_RATE ].round(8) # noqa: E202
df = df[YearAgeRate.COL_NAMES]
return YearAgeRate(df=df)