'''
Download data needed for HPVsim.

Typically, this is done automatically: on load, HPVsim checks if the data are already
downloaded, and if not, downloads them using the quick_download() function. The
"slow download" functions supply the files that are usually zipped and stored in
a separate repository, hpvsim_data.

To ensure the data is updated, update the data_version parameter below.

Running this file as a script will remove and then re-download all data.
'''

# Standard library
import os
import sys

# Third-party
import numpy as np
import pandas as pd
import sciris as sc

# Local (fixed: the original line was missing the relative-import dot)
from . import loaders as ld

# Set parameters
data_version = '1.3' # Data version; update this to force a data refresh
data_file = f'hpvsim_data_v{data_version}.zip' # Name of the preprocessed data bundle

# URL for the quick download (preprocessed bundle in the hpvsim_data repository)
# NOTE(review): the URL text was missing from this copy of the file; restored from the
# hpvsim_data repository convention -- TODO confirm against the canonical source
quick_url = f'https://github.com/hpvsim/hpvsim_data/blob/main/{data_file}?raw=true'

# UN World Population Prospects 2022 file stems and base URL for the "slow" download
age_stem = 'WPP2022_Population1JanuaryBySingleAgeSex_Medium_'
death_stem = 'WPP2022_Life_Table_Abridged_Medium_'
# NOTE(review): base_url was empty in this copy; restored to the UN WPP CSV file
# location -- TODO confirm
base_url = 'https://population.un.org/wpp/Download/Files/1_Indicators%20(Standard)/CSV_FILES/'
years = ['1950-2021', '2022-2100'] # The two year ranges the WPP data files are split into

# Public API of this module
__all__ = ['get_data', 'quick_download', 'check_downloaded', 'remove_data']

# Define here to optionally be overwritten (see set_filesdir() below);
# defaults to the data folder defined by the loaders module
filesdir = ld.filesdir

def set_filesdir(path):
    ''' Used to change the file folder '''
    global filesdir
    previous = filesdir # Remember the old location for the printout
    filesdir = path
    print(f'Done: filesdir reset from {previous} to {filesdir}')

def get_UN_data(label='', file_stem=None, outfile=None, columns=None, force=None, tidy=None):
    '''
    Download data from UN Population Division.

    Args:
        label     (str):  human-readable dataset name, used in printouts
        file_stem (str):  filename stem of the WPP CSV archive (a year range is appended)
        outfile   (str):  filename (within filesdir) for the processed output object
        columns   (list): CSV columns to keep; the last one must be numeric
        force     (bool): if True, re-download even if the CSV already exists (default False)
        tidy      (bool): if True, delete the downloaded zip/csv after processing (default True)

    Returns:
        sc.objdict mapping location name to its dataframe
    '''
    if force is None: force = False
    if tidy  is None: tidy  = True

    sc.heading(f'Getting {label} data...')
    T = sc.timer()
    dfs = []

    # Download data if it's not already in the directory
    for year in years:
        url = f'{base_url}{file_stem}{year}.zip'
        local_base = filesdir/f'{file_stem}{year}'
        local_zip = f'{local_base}.zip'
        local_csv = f'{local_base}.csv'
        if force or not os.path.exists(local_csv):
            print(f'\nDownloading from {url}, this may take a while...')
            sc.download(url, filename=local_zip) # BUG FIX: the download call was missing
            sc.unzip(local_zip, outfolder=filesdir)
        else: # BUG FIX: the skip message was printed unconditionally after every download
            print(f'Skipping {local_csv}, already downloaded')

        # Extract the parts used in the model and save
        df = pd.read_csv(local_csv, usecols=columns)
        dfs.append(df) # BUG FIX: results were never accumulated, so pd.concat() below received an empty list
        if tidy:
            print(f'Removing {local_base}')
            sc.rmpath(local_zip, die=False)
            sc.rmpath(local_csv, die=False)
        T.toctic(label=f'  Done with {label} for {year}')

    # Parse by location
    df = pd.concat(dfs)
    dd = sc.objdict({l:d for l,d in df.groupby('Location')})
    # Sanity check: if usecols reordered or dropped columns, the values could be strings
    assert dd[0][columns[-1]].dtype != object, "Last column should be numeric type, not mixed or string type"
    sc.save(filesdir/outfile, dd) # BUG FIX: the save call was missing, so data was never stored
    T.toc(f'Done with {label}')

    return dd

def get_age_data(force=None, tidy=None):
    ''' Import population sizes by age from UNPD '''
    return get_UN_data(
        label     = 'age',
        file_stem = age_stem,
        outfile   = 'populations.obj',
        columns   = ["Location", "Time", "AgeGrpStart", "PopTotal"],
        force     = force,
        tidy      = tidy,
    )

def get_death_data(force=None, tidy=None):
    ''' Import age-specific death rates and population distributions from UNPD '''
    return get_UN_data(
        label     = 'death',
        file_stem = death_stem,
        outfile   = 'mx.obj',
        columns   = ["Location", "Time", "Sex", "AgeGrpStart", "mx"],
        force     = force,
        tidy      = tidy,
    )

def get_ex_data(force=None, tidy=None):
    ''' Import age-specific life expectancy and population distributions from UNPD '''
    return get_UN_data(
        label     = 'ex',
        file_stem = death_stem,
        outfile   = 'ex.obj',
        columns   = ["Location", "Time", "Sex", "AgeGrpStart", "ex"],
        force     = force,
        tidy      = tidy,
    )

def get_birth_data(start=1960, end=2020, force=None, tidy=None):
    '''
    Import crude birth rates from WB (the World Bank, via the wbgapi package).

    Args:
        start (int):  first year of data to download (inclusive)
        end   (int):  last year of data to download (exclusive)
        force (bool): unused; accepted for signature compatibility with the other downloaders
        tidy  (bool): unused; wbgapi downloads directly into memory, so there is no
                      temporary file to remove

    Returns:
        dict mapping country name to an array of birth rates, plus a 'years' key

    Raises:
        ModuleNotFoundError: if the optional wbgapi dependency is not installed
    '''
    sc.heading('Downloading World Bank birth rate data...')
    try: # BUG FIX: the try statement was missing, making the except clause a syntax error
        import wbgapi as wb
    except Exception as E:
        errormsg = 'Could not import wbgapi: cannot download raw data'
        raise ModuleNotFoundError(errormsg) from E
    T = sc.timer()
    # BUG FIX: the wb.data.DataFrame( call head was missing
    birth_rates = wb.data.DataFrame('SP.DYN.CBRT.IN', time=range(start,end), labels=True, skipAggs=True).reset_index()
    d = dict()
    for country in birth_rates['Country'].unique():
        d[country] = birth_rates.loc[(birth_rates['Country']==country)].values[0,3:]
        d[country] = d[country].astype(float) # Loaded as an object otherwise!
    d['years'] = np.arange(start, end)
    sc.save(filesdir/'birth_rates.obj', d) # BUG FIX: the save call was missing
    # NOTE(review): the original tidy branch referenced an undefined local_path (a NameError
    # at runtime); removed, since wbgapi creates no local file to clean up
    T.toc(label='Done with birth data')
    return d

def parallel_downloader(which, **kwargs):
    '''
    Function for use with a parallel download function.

    Args:
        which (str): which dataset to download ('age'/'ages', 'birth'/'births',
                     'death'/'deaths', or 'life_expectancy'/'ex')
        kwargs (dict): passed to the individual download functions (e.g. force, tidy)
    '''
    # BUG FIX: the branch bodies were missing; reconstructed dispatch to the download functions
    if which in ['age', 'ages']:
        get_age_data(**kwargs)
    if which in ['birth', 'births']:
        get_birth_data(**kwargs)
    if which in ['death', 'deaths']:
        get_death_data(**kwargs)
    if which in ['life_expectancy', 'ex']:
        get_ex_data(**kwargs)
    return

def get_data(serial=False, **kwargs):
    '''
    Download data in parallel.

    Args:
        serial (bool): if True, download sequentially instead of in parallel
        kwargs (dict): passed to the individual download functions (e.g. force, tidy)

    The dataset(s) to download can be selected via the first command-line argument;
    by default, all datasets are downloaded.
    '''
    sc.heading('Downloading HPVsim data, please be patient...')
    T = sc.timer()

    # Optionally choose a subset from the command line
    if len(sys.argv) > 1:
        which = sys.argv[1]
        if which not in ['all', 'age', 'ages', 'birth', 'births', 'death', 'deaths']:
            errormsg = f'Invalid selection "{which}": must be all, ages, births, or deaths'
            raise ValueError(errormsg)
    else:
        which = 'all'
    if which == 'all':
        which = ['age', 'birth', 'death', 'life_expectancy']

    # Actually download; NOTE(review): if a single non-'all' selection is given, `which`
    # is a string here -- confirm sc.parallelize handles a scalar iterarg as intended
    sc.parallelize(parallel_downloader, which, kwargs=kwargs, serial=serial)
    T.toc('Done downloading data for HPVsim')
    return
def quick_download(verbose=True, init=False):
    '''
    Download pre-processed data files.

    Args:
        verbose (bool): whether to print progress
        init    (bool): whether this is the automatic first-run download (changes printout only)
    '''
    if verbose:
        sc.heading('Downloading preprocessed HPVsim data')
    if init:
        print('Note: this automatic download only happens once, when HPVsim is first run.\n\n')
    filepath = sc.makefilepath(filesdir / f'tmp_{data_file}.zip')
    # BUG FIX: the download call head was missing from this copy of the file
    sc.download(quick_url, filename=filepath, convert=False, verbose=verbose)
    sc.unzip(filepath, outfolder=filesdir)
    sc.rmpath(filepath) # Remove the temporary zip once extracted
    if verbose:
        print('\nData downloaded.')
    return
def check_downloaded(verbose=1, check_version=True):
    '''
    Check if data is downloaded. Note: to update data, update the date here and
    in data/files/metadata.json.

    Args:
        verbose (int): detail to print (0 = none, 1 = reason for failure, 2 = everything)
        check_version (bool): whether to treat a version mismatch as a failure

    Returns:
        bool: True if all data files exist (and, optionally, versions match)
    '''
    # Do file checks
    exists = dict()
    for key,fn in ld.files.items():
        exists[key] = os.path.exists(fn)
        if verbose>1:
            print(f'HPVsim data: checking {fn}: {exists[key]}')
    ok = all(list(exists.values()))
    if not ok and verbose:
        print(f'HPVsim data: at least one file missing: {exists}')
    elif ok and verbose>1:
        print('HPVsim data: all files exist')

    # Do version check (if files exist)
    if ok and check_version:
        metadata = sc.loadjson(ld.files.metadata)
        match = metadata['version'] == data_version
        if not match and verbose:
            print(f'HPVsim data: versions do not match ({metadata["version"]} != {data_version})')
        elif match and verbose>1:
            print(f'HPVsim data: versions match ({data_version})')
        ok = ok and match

    return ok
def remove_data(verbose=True, **kwargs):
    '''
    Remove downloaded data; arguments passed to sc.rmpath().

    Args:
        verbose (bool): whether to print progress
        kwargs (dict): passed to sc.rmpath()
    '''
    if verbose:
        sc.heading('Removing HPVsim data files')
    for key,fn in ld.files.items():
        sc.rmpath(fn, verbose=verbose, **kwargs)
    if verbose:
        print('Data files removed.')
    return
if __name__ == '__main__':
    # Running as a script removes and re-downloads all data (after confirmation)
    ans = input('Are you sure you want to remove and redownload data? y/[n] ')
    if ans == 'y':
        remove_data()
        get_data()
        check_downloaded()