Source code for emodpy_hiv.demographics.un_world_pop

"""
This module contains methods for extracting data from UN World Population files.
https://population.un.org/wpp/

These methods output dataframes that can be used to initialize Demographic objects in EMOD.
"""
from typing import Union
from pathlib import Path
from importlib import resources
import pandas as pd
from emodpy_hiv.demographics.year_age_rate import YearAgeRate
import emodpy_hiv.countries.un_world_pop_data as un_data


POPULATION_FILES = {
    "2012": "WPP2012_POP_F15_1_ANNUAL_POPULATION_BY_AGE_BOTH_SEXES.XLS",
    "2015": "WPP2015_POP_F15_1_ANNUAL_POPULATION_BY_AGE_BOTH_SEXES.XLS",
    "2019": "WPP2019_POP_F15_1_ANNUAL_POPULATION_BY_AGE_BOTH_SEXES.xlsx",
    "2024": "WPP2024_POP_F02_1_POPULATION_5-YEAR_AGE_GROUPS_BOTH_SEXES.xlsx"
}

FERTILITY_FILES = {
    "2012": "WPP2012_FERT_F07_AGE_SPECIFIC_FERTILITY.xlsx",
    "2015": "WPP2015_FERT_F07_AGE_SPECIFIC_FERTILITY.XLS",
    "2019": "WPP2019_FERT_F07_AGE_SPECIFIC_FERTILITY.xlsx",
    "2024": "WPP2024_FERT_F02_FERTILITY_RATES_BY_5-YEAR_AGE_GROUPS_OF_MOTHER.xlsx"
}

MORTALITY_FILES = {
    "2012": {
        "male":   "WPP2012_MORT_F17_2_ABRIDGED_LIFE_TABLE_MALE.xlsx",
        "female": "WPP2012_MORT_F17_3_ABRIDGED_LIFE_TABLE_FEMALE.xlsx"
    },
    "2015": {
        "male":   "WPP2015_MORT_F17_2_ABRIDGED_LIFE_TABLE_MALE.XLS",
        "female": "WPP2015_MORT_F17_3_ABRIDGED_LIFE_TABLE_FEMALE.XLS"
    },
    "2019": {
        "male":   "WPP2019_MORT_F17_2_ABRIDGED_LIFE_TABLE_MALE.xlsx",
        "female": "WPP2019_MORT_F17_3_ABRIDGED_LIFE_TABLE_FEMALE.xlsx"
    },
    "2024": {
        "male":   "WPP2024_MORT_F07_2_ABRIDGED_LIFE_TABLE_MALE.xlsx",
        "female": "WPP2024_MORT_F07_3_ABRIDGED_LIFE_TABLE_FEMALE.xlsx"
    }
}

KNOWN_VERSIONS = ["2012", "2015", "2019", "2024"]
KNOWN_GENDERS = ["male", "female"]

def _check_version(version):
    if version not in KNOWN_VERSIONS:
        supported_versions = ""
        for index, v in enumerate(KNOWN_VERSIONS):
            supported_versions += v
            if (index + 1) < len(KNOWN_VERSIONS):
                supported_versions += ", "
        raise ValueError(f"'version'= '{version}' is not supported.\nOnly {supported_versions} formats are supported.")


def _check_gender(gender):
    if gender not in KNOWN_GENDERS:
        raise ValueError(f"'gender'= '{gender}' is not supported.\nOnly 'male' and 'female' are supported.")


def _get_population_filename(version):
    _check_version(version)
    un_data_root = resources.files(un_data)
    filename = Path(un_data_root, POPULATION_FILES[version])
    return filename


def _get_fertility_filename(version):
    _check_version(version)
    un_data_root = resources.files(un_data)
    filename = Path(un_data_root, FERTILITY_FILES[version])
    return filename


def _get_mortality_filename(version, gender):
    _check_version(version)
    _check_gender(gender)
    un_data_root = resources.files(un_data)
    filename = Path(un_data_root, MORTALITY_FILES[version][gender])
    return filename


def _check_country(country, possible_countries, filename):
    if country not in possible_countries:
        raise ValueError(f"'country'= '{country}' is not supported.\nThe file,\n{filename},\nonly supports the following countries:\n{possible_countries}")


def _check_filename(filename):
    if ((isinstance(filename, Path) and not filename.exists()) or
        (isinstance(filename, str) and not Path(filename).exists())):
        raise ValueError(f"The file does not exist.\n{filename}")


def _check_years(years, possible_years, filename):
    for year in years:
        if year not in possible_years:
            raise ValueError(f"'year'= '{year}' is not supported.\nThe file,\n{filename},\nonly supports the following years:\n{possible_years}")


[docs]def extract_population_by_age(country: str, version: str, years: list[float], filename: Union[str,Path] = None): """ This code is for extracting population by age data from the files downloaded from the UN World Pop (https://population.un.org/wpp/Download/Standard/Population/). The code assumes that the files is in Strict Open XML Spreadsheet format. This format require us to use the 'calamine' engine to read the file. The file is expected to be for both sexes and to have 5 year age bins. (i.e. WPP2019_POP_F15_1_ANNUAL_POPULATION_BY_AGE_BOTH_SEXES.xlsx, WPP2024_POP_F02_1_POPULATION_5-YEAR_AGE_GROUPS_BOTH_SEXES.xlsx) Args: country: The name of the country used in the spreadsheet for which you want to extract the data. version: A string with the year/version of the file. Supported versions are 2012, 2015, 2019, 2024 years: A list of years to get data for NOTE: Data files say it is for July 1st of the given year. filename: If not provided, the 'version' will be used to select from the known versions. If a filename is provided, it is assumed to be file from the UN World Pop website. It may have several sheets but the data will be extracted from the first two. The 'estimates' sheet gives you data for the past while the 'medium variant' sheet gives you the data for the future. It is expected to be in the Strict Open XML Spreadsheet format. Return It will return a pandas DataFrame where each row is a year and the columns are for an age range """ if filename is None: filename = _get_population_filename(version) else: _check_filename(filename) # --------------------------------------------------------- # --- Define column names and sheets to read the data from. # --------------------------------------------------------- COUNTRY_COL = "Major area, region, country or area *" PERIOD_COL = "Reference date (as of 1 July)" # noqa: E221 AGE_00_04 = "0-4" # noqa: E221 AGE_05_09 = "5-9" # noqa: E221 AGE_10_14 = "10-14" # noqa: E221 AGE_15_19 = "15-19" # noqa: E221 AGE_20_24 = "20-24" # noqa: E221 AGE_25_29 = "25-29" # noqa: E221 AGE_30_34 = "30-34" # noqa: E221 AGE_35_39 = "35-39" # noqa: E221 AGE_40_44 = "40-44" # noqa: E221 AGE_45_49 = "45-49" # noqa: E221 AGE_50_54 = "50-54" # noqa: E221 AGE_55_59 = "55-59" # noqa: E221 AGE_60_64 = "60-64" # noqa: E221 AGE_65_69 = "65-69" # noqa: E221 AGE_70_74 = "70-74" # noqa: E221 AGE_75_79 = "75-79" # noqa: E221 AGE_80_84 = "80-84" # noqa: E221 AGE_85_89 = "85-89" # noqa: E221 AGE_90_94 = "90-94" # noqa: E221 AGE_95_99 = "95-99" # noqa: E221 AGE_100 = "100+" # noqa: E221 AGE_80_PLUS = "80+" # noqa: E221 sheet_list = [] age_cols = [AGE_00_04, AGE_05_09, AGE_10_14, AGE_15_19, AGE_20_24, AGE_25_29, AGE_30_34, AGE_35_39, AGE_40_44, AGE_45_49, AGE_50_54, AGE_55_59, AGE_60_64, AGE_65_69, AGE_70_74, AGE_75_79, AGE_80_84, AGE_85_89, AGE_90_94, AGE_95_99, AGE_100] medium_cols = age_cols if version == "2012": sheet_list = ["ESTIMATES", "MEDIUM FERTILITY"] COUNTRY_COL = "Major area, region, country or area *" PERIOD_COL = "Reference date (as of 1 July)" # noqa: E221 medium_cols = age_cols.copy() age_cols.append(AGE_80_PLUS) elif version == "2015": sheet_list = ["ESTIMATES", "MEDIUM VARIANT"] COUNTRY_COL = "Major area, region, country or area *" PERIOD_COL = "Reference date (as of 1 July)" # noqa: E221 medium_cols = age_cols.copy() age_cols.append(AGE_80_PLUS) elif version == "2019": sheet_list = ["ESTIMATES", "MEDIUM VARIANT"] COUNTRY_COL = "Region, subregion, country or area *" PERIOD_COL = "Reference date (as of 1 July)" # noqa: E221 elif version == "2024": sheet_list = ["Estimates", "Medium variant"] COUNTRY_COL = "Region, subregion, country or area *" PERIOD_COL = "Year" # noqa: E221 else: raise ValueError("version '{0}' is not supported.\nOnly 2012, 2015, 2019, 2024 formats are supported".format(version)) # -------------------------------- # --- Extract data from the sheets # -------------------------------- df = pd.DataFrame() for sheet in sheet_list: cols_to_read = [COUNTRY_COL, PERIOD_COL] if "estimate" in sheet.lower(): cols_to_read.extend(age_cols) else: cols_to_read.extend(medium_cols) df_sheet = pd.read_excel(filename, sheet_name=sheet, skiprows=16, usecols=cols_to_read, engine="calamine") _check_country(country, df_sheet[COUNTRY_COL].unique(), filename) df_sheet = df_sheet[df_sheet[COUNTRY_COL] == country] df = pd.concat([df, df_sheet]) df = df.drop(COUNTRY_COL, axis=1) # ----------------------------------- # --- Get the rows of data we want # ----------------------------------- _check_years(years, df[PERIOD_COL].unique(), filename) df = df[df[PERIOD_COL].isin(years)] # --------------------------------------------------------------- # --- In the 2012, 2015 data, the Estimates sheet has this "80+" column # --- that can be used instead of the 80-84, etc columns. # --- NOTE: This is crappy logic, but I need to move on. # --------------------------------------------------------------- if version == "2012" or version == "2015": val = df[AGE_80_PLUS].iloc[0] has_80p = isinstance(val, float) or isinstance(val, int) if has_80p: df = df.drop([AGE_80_84, AGE_85_89, AGE_90_94, AGE_95_99, AGE_100], axis=1) for col in [AGE_80_84, AGE_85_89, AGE_90_94, AGE_95_99, AGE_100]: age_cols.remove(col) else: df = df.drop(AGE_80_PLUS, axis=1) age_cols.remove(AGE_80_PLUS) # ----------------------------------------------------------- # --- rename the age columns so they are just the minimum age # ----------------------------------------------------------- new_col_names = {} for col in age_cols: if col == AGE_80_PLUS: new_col_names[col] = "80" elif len(col) == 3: new_col_names[col] = col[0:1] elif len(col) == 5: new_col_names[col] = col[0:2] else: new_col_names[col] = col[0:3] df.rename(columns=new_col_names, inplace=True) # ------------------------------------------------------------------ # --- Some columns can have the "...". This changes those to zeros. # --- (This could go away with the 80+ handling.) # ------------------------------------------------------------------ #df[year] = ( pd.to_numeric(df[year], errors='coerce' ).fillna(0) ) # ----------------------------------------------------------------- # --- The data is in thousands of people so multiply by 1000 # --- and convert to integer since we want the value as an integer. # ----------------------------------------------------------------- for column in df.columns: if column != PERIOD_COL: df[column] = df[column] * 1000 return df
[docs]def extract_population_by_age_for_ingest_form(filename, country, version, years, gender): df = extract_population_by_age(filename=filename, country=country, version=version, years=years) ret_df = pd.DataFrame() ret_df["Year" ] = 0 # noqa: E202 ret_df["Gender" ] = 0 # noqa: E202 ret_df["AgeBin" ] = 0 # noqa: E202 ret_df["Population"] = 0 for col_name in df.columns: if col_name != "Reference date (as of 1 July)": for index, value in df[col_name].items(): year = df["Reference date (as of 1 July)"][index] age_bin = f"[{str(col_name)}:{str(int(col_name)+5)})" ret_df.loc[len(ret_df)] = [year, gender, age_bin, value] return ret_df
[docs]def extract_population_by_age_and_distribution(country: str, version: str, year: int = 1960, filename: Union[str, Path] = None): """ This code is for extracting population by age data from the files downloaded from the UN World Pop (https://population.un.org/wpp/Download/Standard/Population/). The code assumes that the files is in Strict Open XML Spreadsheet format. This format require us to use the 'calamine' engine to read the file. The file is expected to be for both sexes and to have 5 year age bins. (i.e. WPP2019_POP_F15_1_ANNUAL_POPULATION_BY_AGE_BOTH_SEXES.xlsx, WPP2024_POP_F02_1_POPULATION_5-YEAR_AGE_GROUPS_BOTH_SEXES.xlsx) Args: country: The name of the country used in the spreadsheet for which you want to extract the data. version: A string with the year/version of the file. Supported versions are 2012, 2015, 2019, 2024 year: The year in the data to get the total population and age distribution for. Default is 1960 NOTE: Data files say it is for July 1st of the given year. filename: If not provided, the 'version' will be used to select from the known versions. If a filename is provided, it is assumed to be file from the UN World Pop website. It may have several sheets but the data will be only be extracted from the first sheet, 'estimates'. The 'estimates' sheet gives you data for the past. It is assumed that you are using this function to get the starting point of an EMOD simulation that starts in the past. It is expected to be in the Strict Open XML Spreadsheet format. Return: It will return the total population for the given year PLUS a YearAgeRate object where the "rate" column contains the fraction of people in that particular year and age ranges. """ if filename is None: filename = _get_population_filename(version) else: _check_filename(filename) # --------------------------------------------------------- # --- Define column names and sheets to read the data from. # --------------------------------------------------------- COUNTRY_COL = "Major area, region, country or area *" PERIOD_COL = "Reference date (as of 1 July)" # noqa: E221 AGE_00_04 = "0-4" # noqa: E221 AGE_05_09 = "5-9" # noqa: E221 AGE_10_14 = "10-14" # noqa: E221 AGE_15_19 = "15-19" # noqa: E221 AGE_20_24 = "20-24" # noqa: E221 AGE_25_29 = "25-29" # noqa: E221 AGE_30_34 = "30-34" # noqa: E221 AGE_35_39 = "35-39" # noqa: E221 AGE_40_44 = "40-44" # noqa: E221 AGE_45_49 = "45-49" # noqa: E221 AGE_50_54 = "50-54" # noqa: E221 AGE_55_59 = "55-59" # noqa: E221 AGE_60_64 = "60-64" # noqa: E221 AGE_65_69 = "65-69" # noqa: E221 AGE_70_74 = "70-74" # noqa: E221 AGE_75_79 = "75-79" # noqa: E221 AGE_80_84 = "80-84" # noqa: E221 AGE_85_89 = "85-89" # noqa: E221 AGE_90_94 = "90-94" # noqa: E221 AGE_95_99 = "95-99" # noqa: E221 AGE_100 = "100+" # noqa: E221 AGE_80_PLUS = "80+" # noqa: E221 sheet_list = [] age_cols = [AGE_00_04, AGE_05_09, AGE_10_14, AGE_15_19, AGE_20_24, AGE_25_29, AGE_30_34, AGE_35_39, AGE_40_44, AGE_45_49, AGE_50_54, AGE_55_59, AGE_60_64, AGE_65_69, AGE_70_74, AGE_75_79, AGE_80_84, AGE_85_89, AGE_90_94, AGE_95_99, AGE_100] if version == "2012" or version == "2015": sheet_list = ["ESTIMATES"] # "MEDIUM FERTILITY" COUNTRY_COL = "Major area, region, country or area *" PERIOD_COL = "Reference date (as of 1 July)" # noqa: E221 age_cols.append(AGE_80_PLUS) elif version == "2019": sheet_list = ["ESTIMATES"] # "MEDIUM VARIANT" COUNTRY_COL = "Region, subregion, country or area *" PERIOD_COL = "Reference date (as of 1 July)" # noqa: E221 elif version == "2024": sheet_list = ["Estimates"] # "Medium variant" COUNTRY_COL = "Region, subregion, country or area *" PERIOD_COL = "Year" # noqa: E221 else: raise ValueError("version '{0}' is not supported.\nOnly 2012, 2015, 2019, 2024 formats are supported".format(version)) cols_to_read = [COUNTRY_COL, PERIOD_COL] cols_to_read.extend(age_cols) # -------------------------------- # --- Extract data from the sheets # -------------------------------- df = pd.DataFrame() for sheet in sheet_list: df_sheet = pd.read_excel(filename, sheet_name=sheet, skiprows=16, usecols=cols_to_read, engine="calamine") _check_country(country, df_sheet[COUNTRY_COL].unique(), filename) df_sheet = df_sheet[df_sheet[COUNTRY_COL] == country] df = pd.concat([df, df_sheet]) df = df.drop(COUNTRY_COL, axis=1) # ----------------------------------- # --- Get the one row of data we want # ----------------------------------- df = df[df[PERIOD_COL] == year] # --------------------------------------------------------------- # --- In the 2012, 2015 data, the Estimates sheet has this "80+" column # --- that can be used instead of the 80-84, etc columns. # --- NOTE: This is crappy logic, but I need to move on. # --------------------------------------------------------------- if version == "2012" or version == "2015": val = df[AGE_80_PLUS].iloc[0] has_80p = isinstance(val, float) or isinstance(val, int) if has_80p: df = df.drop([AGE_80_84, AGE_85_89, AGE_90_94, AGE_95_99, AGE_100], axis=1) for col in [AGE_80_84, AGE_85_89, AGE_90_94, AGE_95_99, AGE_100]: age_cols.remove(col) else: df = df.drop(AGE_80_PLUS, axis=1) age_cols.remove(AGE_80_PLUS) # ----------------------------------------------------------- # --- rename the age columns so they are just the minimum age # ----------------------------------------------------------- new_col_names = {} for col in age_cols: if col == AGE_80_PLUS: new_col_names[col] = "80" elif len(col) == 3: new_col_names[col] = col[0:1] elif len(col) == 5: new_col_names[col] = col[0:2] else: new_col_names[col] = col[0:3] df.rename(columns=new_col_names, inplace=True) # ------------------------------------------------------------------- # --- Transpose the data so we have rows of min ages for our one Year # ------------------------------------------------------------------- df = df.set_index(PERIOD_COL, inplace=False) df = df.transpose() df = df.rename_axis("tmp_min_age") # ------------------------------------------------------------------ # --- Some columns can have the "...". This changes those to zeros. # --- (This could go away with the 80+ handling.) # ------------------------------------------------------------------ df[year] = (pd.to_numeric(df[year], errors='coerce').fillna(0)) # --------------------------------------------------------------- # --- Change the data to the fraction of people in that age range # --- since this is supposed to be an age distribution. # --------------------------------------------------------------- total_pop = df[year].sum() df = df / total_pop # ----------------------------------------------------------------- # --- The data is in thousands of people so multiply by 1000 # --- and convert to integer since we want the value as an integer. # ----------------------------------------------------------------- total_pop = int(1000 * total_pop) # -------------------------------------------------- # --- Convert dataframe into a YearAgeRate dataframe # -------------------------------------------------- df[YearAgeRate.COL_NAME_MIN_AGE ] = df.index.values # noqa: E202 df[YearAgeRate.COL_NAME_MIN_AGE ] = df[YearAgeRate.COL_NAME_MIN_AGE].astype(float) # noqa: E202 df[YearAgeRate.COL_NAME_NODE_ID ] = 0 # noqa: E202 df[YearAgeRate.COL_NAME_MIN_YEAR] = year df.rename({year: YearAgeRate.COL_NAME_RATE}, axis=1, inplace=True) df[YearAgeRate.COL_NAME_RATE] = df[YearAgeRate.COL_NAME_RATE].astype(float).round(8) df = df[YearAgeRate.COL_NAMES] df.reset_index() return total_pop, YearAgeRate(df)
[docs]def extract_fertility(country: str, version: str, filename: Union[str, Path] = None): """ This code is for extracting fertility rates for the given country from the fertility files downloaded from the UN World Pop (https://population.un.org/wpp/Download/Standard/Fertility/). The code assumes that the files is in Strict Open XML Spreadsheet format. This format require us to use the 'calamine' engine to read the file. The file is expected to be the Age Specific rates (i.e. WPP2012_FERT_F07_AGE_SPECIFIC_FERTILITY, WPP2024_FERT_F02_FERTILITY_RATES_BY_5-YEAR_AGE_GROUPS_OF_MOTHER) Args: country The name of the country used in the spreadsheet for which you want to extract the data. version A string with the year/version of the file. Supported versions are 2012, 2015, 2019, 2024 filename If not provided, the 'version' will be used to select from the known versions. If a filename is provided, it is assumed to be file from the UN World Pop website. It may have several sheets but the data will be extracted from the first two. The 'estimates' sheet gives you data for the past while the 'medium variant' sheet gives you the data for the future. It is expected to be in the Strict Open XML Spreadsheet format. Return A YearAgeRate object containing the fertility data in the given file. """ if filename is None: filename = _get_fertility_filename(version) else: _check_filename(filename) # --------------------------------------------------------- # --- Define column names and sheets to read the data from. # --------------------------------------------------------- PERIOD_COL = "Period" # noqa: E221 AGE_15_19 = "15-19" # noqa: E221 AGE_20_24 = "20-24" # noqa: E221 AGE_25_29 = "25-29" # noqa: E221 AGE_30_34 = "30-34" # noqa: E221 AGE_35_39 = "35-39" # noqa: E221 AGE_40_44 = "40-44" # noqa: E221 AGE_45_49 = "45-49" # noqa: E221 age_cols = [AGE_15_19, AGE_20_24, AGE_25_29, AGE_30_34, AGE_35_39, AGE_40_44, AGE_45_49] if version == "2012": sheet_list = ["ESTIMATES", "MEDIUM FERTILITY"] COUNTRY_COL = "Major area, region, country or area *" elif version == "2015": sheet_list = ["ESTIMATES", "MEDIUM VARIANT"] COUNTRY_COL = "Major area, region, country or area *" elif version == "2019": sheet_list = ["ESTIMATES", "MEDIUM VARIANT"] COUNTRY_COL = "Region, subregion, country or area *" elif version == "2024": sheet_list = ["Estimates", "Medium variant"] COUNTRY_COL = "Region, subregion, country or area *" PERIOD_COL = "Year" # noqa: E221 AGE_10_14 = "10-14" AGE_50_54 = "50-54" age_cols.insert(0, AGE_10_14) age_cols.append(AGE_50_54) else: raise ValueError("version '{0}' is not supported.\nOnly 2012, 2015, 2019, 2024 formats are supported".format(version)) cols_to_read = [COUNTRY_COL, PERIOD_COL] cols_to_read.extend(age_cols) # -------------------------------- # --- Extract data from the sheets # -------------------------------- df = pd.DataFrame() for sheet in sheet_list: df_sheet = pd.read_excel(filename, sheet_name=sheet, skiprows=16, usecols=cols_to_read, engine="calamine") _check_country(country, df_sheet[COUNTRY_COL].unique(), filename) df_sheet = df_sheet[df_sheet[COUNTRY_COL] == country] df = pd.concat([df, df_sheet]) df = df.drop(COUNTRY_COL, axis=1) # ------------------------------------------------------------ # --- Change the Period column to only contain min value/year # ------------------------------------------------------------ if version == "2012" or version == "2015" or version == "2019": df[PERIOD_COL] = df[PERIOD_COL].str.slice_replace(4, 9, "").astype(float) # ----------------------------------------------------------- # --- rename the age columns so they are just the minimum age # ----------------------------------------------------------- new_col_names = {} for col in age_cols: new_col_names[col] = col[0:2] df.rename(columns=new_col_names, inplace=True) df = pd.melt(df, id_vars=[PERIOD_COL], var_name=YearAgeRate.COL_NAME_MIN_AGE, value_name=YearAgeRate.COL_NAME_RATE) # ---------------------------------------------------------- # --- Convert dataframe into a YearAgeRate dataframe format # ---------------------------------------------------------- df.rename({PERIOD_COL: YearAgeRate.COL_NAME_MIN_YEAR}, axis=1, inplace=True) df[YearAgeRate.COL_NAME_MIN_AGE ] = df[YearAgeRate.COL_NAME_MIN_AGE ].astype(float) # noqa: E202 df[YearAgeRate.COL_NAME_MIN_YEAR] = df[YearAgeRate.COL_NAME_MIN_YEAR].astype(float) df[YearAgeRate.COL_NAME_NODE_ID ] = 0 # noqa: E202 df = df.sort_values(by=[YearAgeRate.COL_NAME_MIN_YEAR, YearAgeRate.COL_NAME_MIN_AGE], ascending=True) df[YearAgeRate.COL_NAME_RATE] = df[YearAgeRate.COL_NAME_RATE].astype(float).round(1) df = df[YearAgeRate.COL_NAMES] return YearAgeRate(df=df)
[docs]def extract_mortality(country: str, version: str, gender: str = None, filename: Union[str, Path] = None): """ This code is for extracting mortality rates for the given country from the mortality files downloaded from the UN World Pop (https://population.un.org/wpp/Download/Standard/Mortality/). The code assumes that the files is in Strict Open XML Spreadsheet format. This format require us to use the 'calamine' engine to read the file. The file is expected to be for one gender and to be the Abriged Life Table. Args: country: The name of the country used in the spreadsheet for which you want to extract the data. version: A string with the year/version of the file. Supported versions are 2012, 2015, 2019, 2024 gender: The gender of the data to be extracted. Possible values are 'male' and 'female'. Required if the filename is not provided. filename: If not provided, the 'version' will be used to select from the known versions. If a filename is provided, it is assumed to be file from the UN World Pop website. It may have several sheets but the data will be extracted from the first one plus the next one or two. The 'estimates' sheet gives you data for the past while the 'medium XXX' sheets give you the data for the future. It is expected to be in the Strict Open XML Spreadsheet format. Return A YearAgeRate object containing the mortality data in the given file. """ if filename is None: filename = _get_mortality_filename(version, gender) else: _check_filename(filename) # --------------------------------------------------------- # --- Define column names and sheets to read the data from. # --------------------------------------------------------- COUNTRY_COL = "Region" # noqa: E221 PERIOD_COL = "Period" # noqa: E221 AGE_COL = "Age (x)" # noqa: E221 AGE_INTERVAL_COL = "Age interval (n)" DEATH_RATE_COL = "Central death rate m(x,n)" # noqa: E221 sheet_list = [] if version == "2012": COUNTRY_COL = "Major area, region, country or area *" sheet_list = ["ESTIMATES", "MEDIUM_2010-2050", "MEDIUM_2050-2100"] elif version == "2015": COUNTRY_COL = "Major area, region, country or area *" sheet_list = ["ESTIMATES", "MEDIUM 2015-2050", "MEDIUM 2050-2100"] elif version == "2019": COUNTRY_COL = "Region, subregion, country or area *" sheet_list = ["ESTIMATES", "MEDIUM 2020-2050", "MEDIUM 2050-2100"] elif version == "2024": COUNTRY_COL = "Region, subregion, country or area *" PERIOD_COL = "Year" # noqa: E221 sheet_list = ["Estimates", "Medium variant"] else: raise ValueError("version '{0}' is not supported.\nOnly 2012, 2015, 2019, 2024 formats are supported".format(version)) cols_to_read = [COUNTRY_COL, PERIOD_COL, AGE_COL, AGE_INTERVAL_COL, DEATH_RATE_COL] col_rename_dict = { PERIOD_COL : YearAgeRate.COL_NAME_MIN_YEAR, # noqa: E203 AGE_COL : YearAgeRate.COL_NAME_MIN_AGE, # noqa: E203 DEATH_RATE_COL: YearAgeRate.COL_NAME_RATE } # -------------------------------- # --- Extract data from the sheets # -------------------------------- df = pd.DataFrame() for sheet in sheet_list: df_sheet = pd.read_excel(filename, sheet_name=sheet, skiprows=16, usecols=cols_to_read, engine="calamine") _check_country(country, df_sheet[COUNTRY_COL].unique(), filename) df_sheet = df_sheet[df_sheet[COUNTRY_COL] == country] df = pd.concat([df, df_sheet]) # ------------------------------------------------------------ # --- Change the Period column to only contain min value/year # ------------------------------------------------------------ if version == "2012" or version == "2015" or version == "2019": df[PERIOD_COL] = df[PERIOD_COL].str.slice_replace(4, 9, "").astype(float) # ---------------------------------------------------------- # --- Convert dataframe into a YearAgeRate dataframe format # ---------------------------------------------------------- df.rename(col_rename_dict, axis=1, inplace=True) df = df.drop([COUNTRY_COL, AGE_INTERVAL_COL], axis=1) df[YearAgeRate.COL_NAME_NODE_ID ] = 0 # noqa: E202 df[YearAgeRate.COL_NAME_MIN_YEAR] = df[YearAgeRate.COL_NAME_MIN_YEAR].astype(float) df[YearAgeRate.COL_NAME_MIN_AGE ] = df[YearAgeRate.COL_NAME_MIN_AGE ].astype(float) # noqa: E202 df[YearAgeRate.COL_NAME_RATE ] = df[YearAgeRate.COL_NAME_RATE ].round(8) # noqa: E202 df = df[YearAgeRate.COL_NAMES] return YearAgeRate(df=df)