Source code for hpvsim.data.downloaders

'''
Download data needed for HPVsim.

Typically, this is done automatically: on load, HPVsim checks if the data are already
downloaded, and if not, downloads them using the quick_download() function. The
"slow download" functions supply the files that are usually zipped and stored in
a separate repository, hpvsim_data.

To ensure the data is updated, update the data_version parameter below.

Running this file as a script will remove and then re-download all data.
'''

import os
import sys
import numpy as np
import pandas as pd
import sciris as sc
from hpvsim.data import loaders as ld

# Set parameters
data_version = '1.3' # Data version; embedded in data_file and compared with the metadata version in check_downloaded()
data_file = f'hpvsim_data_v{data_version}.zip' # Name of the preprocessed archive fetched by quick_download()
quick_url = f'https://github.com/amath-idm/hpvsim_data/blob/main/{data_file}?raw=true' # Full URL of the preprocessed archive
age_stem = 'WPP2022_Population1JanuaryBySingleAgeSex_Medium_' # File stem used by get_age_data()
death_stem = 'WPP2022_Life_Table_Abridged_Medium_' # File stem used by get_death_data() and get_ex_data()
base_url = 'https://population.un.org/wpp/Download/Files/1_Indicators%20(Standard)/CSV_FILES/' # Prefix for the UN download URLs
years = ['1950-2021', '2022-2100'] # Period suffixes; get_UN_data() downloads one file per entry


# Names exported by "from ... import *"; the other functions in this module are not listed here
__all__ = ['get_data', 'quick_download', 'check_downloaded', 'remove_data']


# Define here to optionally be overwritten (see set_filesdir())
filesdir = ld.filesdir

def set_filesdir(path):
    '''
    Point this module at a different data folder.

    Rebinds the module-level ``filesdir`` and prints the old and new values.

    Args:
        path: the new folder; stored as-is, without conversion or validation
    '''
    global filesdir
    previous, filesdir = filesdir, path # Remember the old value for the message
    print(f'Done: filesdir reset from {previous} to {filesdir}')


def get_UN_data(label='', file_stem=None, outfile=None, columns=None, force=None, tidy=None):
    '''
    Download data from UN Population Division

    For each entry in the module-level ``years`` list, the zip archive
    ``{base_url}{file_stem}{year}.zip`` is downloaded and unzipped into ``filesdir``
    (unless the matching CSV is already present), the requested columns are read,
    and the results are concatenated, split by the "Location" column, and saved.

    Args:
        label     (str):  short name used only in progress messages
        file_stem (str):  file name stem, without the year suffix or extension
        outfile   (str):  name of the output file, saved inside ``filesdir``
        columns   (list): columns to read from each CSV; the last one is expected to be numeric.
                          If None, all columns are read and the numeric check is skipped.
        force     (bool): download even if the CSV already exists (default False)
        tidy      (bool): delete the zip and CSV once they have been read (default True)

    Returns:
        An ``sc.objdict`` with one dataframe per location

    Raises:
        ValueError: if the last requested column was read with object dtype
    '''
    if force is None: force = False
    if tidy  is None: tidy  = True

    sc.heading(f'Getting {label} data...')
    T = sc.timer()
    dfs = []

    # Download data if it's not already in the directory
    for year in years:
        url = f'{base_url}{file_stem}{year}.zip'
        local_base = filesdir/f'{file_stem}{year}'
        local_zip = f'{local_base}.zip'
        local_csv = f'{local_base}.csv' # The archive is expected to contain a CSV with the same stem
        if force or not os.path.exists(local_csv):
            print(f'\nDownloading from {url}, this may take a while...')
            sc.download(url, filename=local_zip)
            sc.unzip(local_zip, outfolder=filesdir)
        else:
            print(f'Skipping {local_csv}, already downloaded')

        # Extract the parts used in the model and save
        dfs.append(pd.read_csv(local_csv, usecols=columns))
        if tidy:
            print(f'Removing {local_base}')
            sc.rmpath(local_zip, die=False)
            sc.rmpath(local_csv, die=False)
        T.toctic(label=f'  Done with {label} for {year}')

    df = pd.concat(dfs)

    # Check the value column explicitly rather than with assert, which is skipped under "python -O".
    # Groups produced by groupby share the parent's dtypes, so checking the full dataframe is sufficient.
    if columns:
        if df[columns[-1]].dtype == object:
            errormsg = "Last column should be numeric type, not mixed or string type"
            raise ValueError(errormsg)

    # Parse by location
    dd = sc.objdict({l:d for l,d in df.groupby('Location')})

    sc.save(filesdir/outfile, dd)
    T.toc(f'Done with {label}')

    return dd


def get_age_data(force=None, tidy=None):
    '''
    Import population sizes by age from UNPD

    Thin wrapper around get_UN_data() using the ``age_stem`` files; the result
    is saved as "populations.obj".

    Args:
        force (bool): passed to get_UN_data()
        tidy  (bool): passed to get_UN_data()

    Returns:
        Whatever get_UN_data() returns
    '''
    return get_UN_data(
        label     = 'age',
        file_stem = age_stem,
        outfile   = 'populations.obj',
        columns   = ["Location", "Time", "AgeGrpStart", "PopTotal"],
        force     = force,
        tidy      = tidy,
    )


def get_death_data(force=None, tidy=None):
    '''
    Import age-specific death rates from UNPD

    Thin wrapper around get_UN_data() using the ``death_stem`` files and the
    "mx" column; the result is saved as "mx.obj".

    Args:
        force (bool): passed to get_UN_data()
        tidy  (bool): passed to get_UN_data()

    Returns:
        Whatever get_UN_data() returns
    '''
    return get_UN_data(
        label     = 'death',
        file_stem = death_stem,
        outfile   = 'mx.obj',
        columns   = ["Location", "Time", "Sex", "AgeGrpStart", "mx"],
        force     = force,
        tidy      = tidy,
    )


def get_ex_data(force=None, tidy=None):
    '''
    Import age-specific life expectancy from UNPD

    Thin wrapper around get_UN_data() using the ``death_stem`` files (the same
    ones as get_death_data()) and the "ex" column; the result is saved as "ex.obj".

    Args:
        force (bool): passed to get_UN_data()
        tidy  (bool): passed to get_UN_data()

    Returns:
        Whatever get_UN_data() returns
    '''
    return get_UN_data(
        label     = 'ex',
        file_stem = death_stem,
        outfile   = 'ex.obj',
        columns   = ["Location", "Time", "Sex", "AgeGrpStart", "ex"],
        force     = force,
        tidy      = tidy,
    )


def get_birth_data(start=1960, end=2020, force=None, tidy=None):
    '''
    Import crude birth rates from WB

    Queries the 'SP.DYN.CBRT.IN' indicator via the optional ``wbgapi`` package and
    saves a dict to "birth_rates.obj" in ``filesdir``.

    Args:
        start (int):  first year requested
        end   (int):  end of the requested range (exclusive, as used by range/np.arange)
        force (bool): unused; accepted so all downloaders can be called with the same arguments
        tidy  (bool): unused; this function writes no intermediate files, so there is nothing to remove

    Returns:
        A dict mapping each country name to a float array of birth rates, plus a
        'years' key holding np.arange(start, end)

    Raises:
        ModuleNotFoundError: if wbgapi cannot be imported
    '''
    sc.heading('Downloading World Bank birth rate data...')
    try:
        import wbgapi as wb
    except Exception as E:
        errormsg = 'Could not import wbgapi: cannot download raw data'
        raise ModuleNotFoundError(errormsg) from E
    T = sc.timer()
    birth_rates = wb.data.DataFrame('SP.DYN.CBRT.IN', time=range(start,end), labels=True, skipAggs=True).reset_index()
    d = dict()
    for country in birth_rates['Country'].unique():
        # NOTE(review): assumes the first three columns are identifiers and the rest are yearly values -- confirm
        row = birth_rates.loc[(birth_rates['Country']==country)].values[0,3:]
        d[country] = row.astype(float) # Loaded as an object otherwise!
    d['years'] = np.arange(start, end)
    sc.save(filesdir/'birth_rates.obj', d)

    # Nothing to tidy: the data come straight from the API, so no zip/CSV is left on disk.
    # (A previous cleanup step here referenced an undefined path and raised NameError when tidy was set.)

    T.toc(label='Done with birth data')
    return d


def parallel_downloader(which, **kwargs):
    '''
    Run the downloader that matches a dataset name (for use with a parallel download function)

    Args:
        which (str): 'age'/'ages', 'birth'/'births', 'death'/'deaths', or 'life_expectancy'/'ex';
                     any other value is silently ignored
        kwargs (dict): passed to the selected downloader

    Returns:
        None
    '''
    # The alias groups are disjoint, so at most one branch can ever run
    if which in ('age', 'ages'):
        get_age_data(**kwargs)
    elif which in ('birth', 'births'):
        get_birth_data(**kwargs)
    elif which in ('death', 'deaths'):
        get_death_data(**kwargs)
    elif which in ('life_expectancy', 'ex'):
        get_ex_data(**kwargs)
    return None


def get_data(serial=False, **kwargs):
    '''
    Download data in parallel

    The dataset to download is taken from the first command-line argument if there
    is one; otherwise all datasets are downloaded.

    Args:
        serial (bool): passed to sc.parallelize(); if True, run the downloads one at a time
        kwargs (dict): passed to each downloader via parallel_downloader()

    Raises:
        ValueError: if the command-line selection is not a recognized dataset name
    '''
    sc.heading('Downloading HPVsim data, please be patient...')
    T = sc.timer()

    # Every name parallel_downloader() understands, plus 'all'
    valid = ['all', 'age', 'ages', 'birth', 'births', 'death', 'deaths', 'life_expectancy', 'ex']

    if len(sys.argv) > 1:
        which = sys.argv[1]
        if which not in valid:
            errormsg = f'Invalid selection "{which}": must be all, ages, births, deaths, or life_expectancy'
            raise ValueError(errormsg)
    else:
        which = 'all'

    if which == 'all':
        which = ['age', 'birth', 'death', 'life_expectancy']
    else:
        # Wrap a single selection in a list so it is treated as one task;
        # a bare string would presumably be iterated character by character
        which = [which]

    # Actually download
    sc.parallelize(parallel_downloader, which, kwargs=kwargs, serial=serial)
    T.toc('Done downloading data for HPVsim')
    return
def quick_download(verbose=True, init=False):
    '''
    Download pre-processed data files

    Fetches the archive at ``quick_url`` to a temporary file inside ``filesdir``,
    unzips it there, and removes the temporary file.

    Args:
        verbose (bool): whether to print progress messages (also passed to sc.download())
        init    (bool): whether to also print the note about the one-time automatic download
    '''
    if verbose:
        sc.heading('Downloading preprocessed HPVsim data')
        if init:
            print('Note: this automatic download only happens once, when HPVsim is first run.\n\n')

    tmp_zip = sc.makefilepath(filesdir / f'tmp_{data_file}.zip')
    sc.download(url=quick_url, filename=tmp_zip, convert=False, verbose=verbose)
    sc.unzip(tmp_zip, outfolder=filesdir)
    sc.rmpath(tmp_zip) # The archive is no longer needed once extracted

    if verbose:
        print('\nData downloaded.')
    return
def check_downloaded(verbose=1, check_version=True):
    '''
    Check if data is downloaded.

    Every path in ``ld.files`` must exist; if so and ``check_version`` is set, the
    'version' entry of the metadata file must also equal ``data_version``.

    Note: to update data, update the version here and in data/files/metadata.json.

    Args:
        verbose (int): detail to print (0 = none, 1 = reason for failure, 2 = everything)
        check_version (bool): whether to treat a version mismatch as a failure

    Returns:
        True if all checks pass, False otherwise
    '''
    # Do file checks
    found = dict()
    for key, fn in ld.files.items():
        found[key] = os.path.exists(fn)
        if verbose > 1:
            print(f'HPVsim data: checking {fn}: {found[key]}')
    ok = all(found.values())

    if not ok:
        if verbose:
            print(f'HPVsim data: at least one file missing: {found}')
    elif verbose > 1:
        print('HPVsim data: all files exist')

    # Do version check (if files exist)
    if ok and check_version:
        metadata = sc.loadjson(ld.files.metadata)
        match = metadata['version'] == data_version
        if verbose:
            if not match:
                print(f'HPVsim data: versions do not match ({metadata["version"]} != {data_version})')
            elif verbose > 1:
                print(f'HPVsim data: versions match ({data_version})')
        ok = match # ok is already True here, so the result is just the version match

    return ok
def remove_data(verbose=True, **kwargs):
    '''
    Remove downloaded data; arguments passed to sc.rmpath()

    Args:
        verbose (bool): whether to print progress (also passed to sc.rmpath())
        kwargs (dict): passed to sc.rmpath() for every file
    '''
    if verbose:
        sc.heading('Removing HPVsim data files')

    # Only the paths are needed, not the keys
    for fn in ld.files.values():
        sc.rmpath(fn, verbose=verbose, **kwargs)

    if verbose:
        print('Data files removed.')
    return
if __name__ == '__main__':

    # Running as a script: confirm first, then wipe, re-download, and verify the data
    reply = input('Are you sure you want to remove and redownload data? y/[n] ')
    if reply == 'y':
        remove_data()
        get_data()
        check_downloaded()