# convert_csv_to_bin_vector_migration.py
# -----------------------------------------------------------------------------
# This script converts a CSV formatted txt file to an EMOD binary-formatted migration file.
# It also creates the required metadata file.
#
# The CSV file can have several configurations of columns:
#
# 1) Headers: FromNodeID, ToNodeID, Rate (Average # of Trips Per Day)
# If the csv/text file does not have column headers and three entries, this is the format we assume.
# This can be used for human and vector migration. The Rate is for any/all agents regardless of sex or age.
#
# 2) Headers: FromNodeID, ToNodeID, RateMales, RateFemales
# If the csv/text file does not have column headers and four entries, this is the format we assume.
# RateMales are rates for male migration, RateFemales for female migration and are Average # of Trips Per Day.
# This can be used for human and vector migration when using sex-based migration without age.
#
# 3) Headers: FromNodeID, ToNodeID, [], arrays denoting Allele_Combinations
# Allele_Combinations example: [["a1", "a1"], ["b1", "b1"]]; [["X1","Y2"]]; [["*", "a0"], ["X1", "Y1"]]
# Due to use of commas in headers, it is best to use Excel to create them (or look at a sample text csv)
# This is to support VECTOR_MIGRATION_BY_GENETICS.
# Headers are required for this csv file.
# The first (empty, []) array is used as a "default rate" if the vector's genetics doesn't match any of the
# Allele_Combinations. The other column headers denote the rate that the vector will travel at if it matches the
# Allele_Combination listed. Vectors are checked against Allele_Combinations from most-specific, to least-specific,
# regardless of the order in the csv file. Allele_Combinations can, but don't have to, include sex-alleles. Without
# specified sex-alleles, any vector that matches the alleles regardless of sex will travel at that rate.
#
# The FromNodeIDs and ToNodeIDs are the external ID's found in the demographics file.
# Each node ID in the migration file must exist in the demographics file.
# One can have node ID's in the demographics that don't exist in the migration file.
#
# The CSV file does not have to have the same number of entries for each FromNodeID.
# The script will find the FromNodeID that has the most and use that for the
# DestinationsPerNode. The binary file will have DestinationsPerNode entries
# per node.
#
# -----------------------------------------------------------------------------
import collections
import datetime
import json
import os
import struct
import sys
import pandas as pd
from enum import Enum
import ast
# (stray "[docs]" link artifact from documentation scrape removed)
class GenderDataType(Enum):
    """
    How the rate columns in the CSV vary: one shared rate, one rate per sex,
    or one rate per Allele_Combination (vector genetics).
    The string values are written into the "GenderDataType" field of the
    output metadata file.
    """
    SAME_FOR_BOTH_GENDERS = "SAME_FOR_BOTH_GENDERS"
    ONE_FOR_EACH_GENDER = "ONE_FOR_EACH_GENDER"
    VECTOR_MIGRATION_BY_GENETICS = "VECTOR_MIGRATION_BY_GENETICS"
# -----------------------------------------------------------------------------
# WriteMetadataFile
# -----------------------------------------------------------------------------
# (stray "[docs]" link artifact from documentation scrape removed)
def is_number(value):
    """
    Return True if ``value`` can be interpreted as a number, else False.

    Used to decide whether the first CSV column header is a node ID
    (i.e. the file has no header row).
    """
    try:
        float(value)
        return True
    except (ValueError, TypeError):
        # TypeError covers non-string/non-numeric inputs (e.g. None, lists),
        # which float() rejects with TypeError rather than ValueError.
        return False
# (stray "[docs]" link artifact from documentation scrape removed)
def get_summary_data(metadata):
    """
    Scan metadata.data_df (the parsed CSV) and fill in the summary fields
    needed by the binary and metadata writers.

    Sets on metadata:
        num_columns               - number of columns in the CSV
        gender_data_type          - GenderDataType inferred from headers / column count
        allele_combinations       - parsed Allele_Combinations headers (genetics files only)
        max_destinations_per_node - largest number of destinations of any FromNodeID
        offset_str                - hex NodeOffsets string (node ID + byte offset per node)
        node_count                - number of unique FromNodeIDs

    Raises:
        ValueError: if the column configuration is not one of the supported
            formats, or a FromNodeID has duplicate ToNodeIDs.
    """
    # ----------------------------
    # collect data from CSV file
    # ----------------------------
    data_df = metadata.data_df
    # Count columns via the column index so a header-only (zero-row) file
    # doesn't raise IndexError the way data_df.iloc[0] would.
    metadata.num_columns = len(data_df.columns)
    if metadata.num_columns < 3:
        raise ValueError(f"There are {metadata.num_columns} columns in the file, but we expect at least three. "
                         f"Please review comments for expected column configurations and try again.")
    if is_number(data_df.columns[0]):  # no column headers
        if metadata.num_columns == 3:
            print(f"File doesn't seem to have headers, and with {metadata.num_columns} columns, "
                  "we are assuming 'FromNodeID', 'ToNodeID', 'Rate' column configuration.")
            metadata.gender_data_type = GenderDataType.SAME_FOR_BOTH_GENDERS
        elif metadata.num_columns == 4:
            print(f"File doesn't seem to have headers, and with {metadata.num_columns} columns, "
                  "we are assuming 'FromNodeID', 'ToNodeID', 'RateMales', 'RateFemales' column configuration.")
            metadata.gender_data_type = GenderDataType.ONE_FOR_EACH_GENDER
        else:
            raise ValueError(f"File doesn't seem to have headers, and with {metadata.num_columns} columns, it is not "
                             f"obvious what the column configuration should be. If you are trying to create a "
                             f"VECTOR_MIGRATION_BY_GENETICS file, please add headers as shown in the comments.")
    else:  # has headers, force user to use one of the three formats
        headers = data_df.columns
        if 'FromNodeID' not in headers[0] or 'ToNodeID' not in headers[1]:
            raise ValueError(f"With headers, we expect first two column headers to be 'FromNodeID', 'ToNodeID', but "
                             f"they are {headers[0]} and {headers[1]}.")
        elif metadata.num_columns == 3:
            if 'Rate' not in headers[2]:
                raise ValueError(f"With headers and {metadata.num_columns} columns, we expect the third column to be "
                                 f"'Rate', but it is {headers[2]}, please check and correct.")
            else:
                metadata.gender_data_type = GenderDataType.SAME_FOR_BOTH_GENDERS
        elif metadata.num_columns > 3:
            # The "default rate" column header is the empty array, '[]'.
            if "[]" in headers[2]:
                metadata.gender_data_type = GenderDataType.VECTOR_MIGRATION_BY_GENETICS
                # Headers are Python-literal arrays, e.g. [["a1", "a1"]]; parse
                # them safely with ast.literal_eval (no code execution).
                for alleles in data_df.columns.tolist()[2:]:
                    metadata.allele_combinations.append(ast.literal_eval(alleles))
            elif metadata.num_columns == 4:
                if 'RateMales' in headers[2] and 'RateFemales' in headers[3]:
                    metadata.gender_data_type = GenderDataType.ONE_FOR_EACH_GENDER
                else:
                    raise ValueError(f"With column headers and {metadata.num_columns} columns and no "
                                     f"Allele_Combinations in headers, we expect 'RateMales' in the third column "
                                     f"and 'RateFemales' in the fourth column, but they are"
                                     f" {headers[2]} and {headers[3]}, please check and correct.")
            else:
                raise ValueError(f"File has column headers with {metadata.num_columns} columns, but does not have the "
                                 f"expected headers. Please review the headers expected in the comments, correct, "
                                 f"and try again.")
    # -------------------------------------------------------------------------
    # Find the list of nodes that individuals can migrate from.
    # Also find the maximum number of nodes that one can go to from a given node.
    # This max is used to determine the layout of the binary data.
    # -------------------------------------------------------------------------
    from_node_id_list = data_df[data_df.columns[0]].unique().tolist()
    for from_node_id in from_node_id_list:
        to_node_id_list = data_df[data_df.iloc[:, 0] == from_node_id].iloc[:, 1]
        if len(to_node_id_list) != len(to_node_id_list.unique()):
            raise ValueError(f"For 'FromNodeID' = {from_node_id}, there are non-unique 'ToNodeIDs'.")
        if len(to_node_id_list) > metadata.max_destinations_per_node:
            metadata.max_destinations_per_node = len(to_node_id_list)
    # -------------------------------------------------------------------
    # Create NodeOffsets string
    # This contains the location of each From Node's data in the bin file
    # -------------------------------------------------------------------
    for from_node_id in from_node_id_list:
        metadata.offset_str += '%0.8X' % from_node_id
        # Each destination entry is 12 bytes: sizeof(uint32_t) + sizeof(double)
        metadata.offset_str += '%0.8X' % (metadata.node_count * metadata.max_destinations_per_node * 12)
        metadata.node_count += 1
# -----------------------------------------------------------------------------
# WriteBinFileGender
# -----------------------------------------------------------------------------
# (stray "[docs]" link artifact from documentation scrape removed)
def write_bin_file(metadata):
    """
    Write the migration rates in metadata.data_df to the EMOD binary file
    metadata.filename_out.

    Layout: for each rate column (column index 2..num_columns-1), and for each
    unique FromNodeID (in order of first appearance), write
    max_destinations_per_node uint32 ToNodeIDs followed by
    max_destinations_per_node doubles (rates), zero-padded when a node has
    fewer destinations.

    Raises:
        ValueError: if a (FromNodeID, ToNodeID) pair appears more than once.
    """
    data_df = metadata.data_df
    from_node_id_list = data_df[data_df.columns[0]].unique().tolist()
    # 'with' guarantees the file handle is closed even when the duplicate-rate
    # ValueError below is raised (the original leaked the handle in that case).
    with open(metadata.filename_out, 'wb') as bin_file:
        for data_index in range(2, metadata.num_columns):
            for from_node_id in from_node_id_list:
                # Fixed-size record: initialize with zeros so nodes with fewer
                # destinations are zero-padded up to max_destinations_per_node.
                to_node_id_array = [0] * metadata.max_destinations_per_node
                rates_array = [0] * metadata.max_destinations_per_node
                # Hoist the per-from-node row selection out of the inner loop.
                from_rows = data_df[data_df.iloc[:, 0] == from_node_id]
                to_node_id_list = from_rows.iloc[:, 1].tolist()
                for index, to_node_id in enumerate(to_node_id_list):
                    rate_list = from_rows[from_rows.iloc[:, 1] == to_node_id].iloc[:, data_index].to_list()
                    if len(rate_list) > 1:  # should only have one entry for the to/from node id combination
                        raise ValueError(f"For FromNodeID {from_node_id} and ToNodeID {to_node_id}, there are multiple "
                                         f"rates in column index {data_index}.")
                    to_node_id_array[index] = to_node_id
                    rates_array[index] = rate_list[0]  # get the one rate
                # Format data into binary: IDs as uint32, rates as float64
                bin_file.write(struct.pack('I' * len(to_node_id_array), *to_node_id_array))
                bin_file.write(struct.pack('d' * len(rates_array), *rates_array))
if __name__ == "__main__":
if not (len(sys.argv) == 2 or len(sys.argv) == 3):
print('\nUsage: %s [input-migration-csv] [idreference(optional)]' % os.path.basename(sys.argv[0]))
exit(0)
filename_in = sys.argv[1]
id_ref = "temp_id_reference"
if len(sys.argv) == 3:
id_ref = sys.argv[2]
meta_data = MetaData()
meta_data.ref_id = id_ref
meta_data.data_df = pd.read_csv(filename_in)
meta_data.filename_out = filename_in.split(".")[0] + ".bin"
get_summary_data(meta_data)
write_bin_file(meta_data)
write_metadata_file(meta_data)
print(f"max_destinations_per_node = {meta_data.max_destinations_per_node}")
print(f"Finished converting {filename_in} to {meta_data.filename_out} and +.json metadata file.")
if len(sys.argv) == 2:
print(f"IdReference in {meta_data.filename_out}.json file is set to a temporary value, please update it"
f" to match your demographics.")