"""Source code for emodpy_hiv.plotting.plot_relationship_end: relationship-duration histograms built from RelationshipEnd.csv files."""

import pandas as pd

import emodpy_hiv.plotting.xy_plot as xy_plot
import emodpy_hiv.plotting.helpers as helpers

# Column headers associated with the RelationshipEnd.csv report.  Only the
# relationship type, termination reason, start time and actual end time
# columns are referenced by the code in this module.
COL_NAME_REL_ID = "Rel_ID"
COL_NAME_NODE_ID = "Node_ID"
COL_NAME_START_TIME = "Rel_start_time"
COL_NAME_END_TIME_EXP = "Rel_scheduled_end_time"
COL_NAME_END_TYPE_ACT = "Rel_actual_end_time"
COL_NAME_REL_TYPE = "Rel_type (0 = TRANSITORY; 1 = INFORMAL; 2 = MARITAL; 3 = COMMERCIAL)"
COL_NAME_OUTSIDE_PFA = "Is_rel_outside_PFA"
COL_NAME_MALE_ID = "male_ID"
COL_NAME_FEMALE_ID = "female_ID"
COL_NAME_MALE_AGE = "male_age"
COL_NAME_FEMALE_AGE = "female_age"
COL_NAME_NUM_ACTS = "num_total_coital_acts"
COL_NAME_TERMINATION = "Termination_Reason"

# Names of columns that this module creates itself: the per-relationship
# duration column and the single averaged column of the histogram dataframe.
TMP_COL_NAME_DURATION = "Rel_duration"
TMP_COL_NAME_AVG_DUR = "Average Duration"

# Termination-reason strings.  Only TR_BROKEUP is compared against the
# termination reason column in this module.
TR_NA = "NA"
TR_BROKEUP = "BROKEUP"
TR_SELF_MIGRATING = "SELF_MIGRATING"
TR_PARTNER_DIED = "PARTNER_DIED"
TR_PARTNER_TERMINATED = "PARTNER_TERMINATED"
TR_PARTNER_MIGRATING = "PARTNER_MIGRATING"


def extract_data_for_relationship(filename: str,
                                  relationship_type: int):
    """
    Extract the relationship duration information for the given relationship type in the given file.
    Please note that only relationships that "broke-up" are considered because those are the relationships
    that went to the completion of the drawn duration.  The relationship could have ended prematurely due
    to things like death or a partner migrating away.

    Args:
        filename (str, required):
            The path and name of the RelationshipEnd.csv to be read.

        relationship_type (int, required):
            The type of relationship. Options: 0 (transitory), 1 (informal), 2 (marital), 3 (commercial).

    Returns:
        Dataframe where the rows must be of the given relationship type and with the extra column
        of the actual relationship duration (actual end time minus start time).  The dataframe is an
        independent copy, so the caller can modify it freely.

    Raises:
        ValueError: If the relationship type column is missing from the file or if no row
            in the file has the requested relationship type.
    """
    df = pd.read_csv(filename)

    if COL_NAME_REL_TYPE not in df.columns:
        raise ValueError(f"'{COL_NAME_REL_TYPE}' column does not exist in the file({filename}).")

    is_requested_type = df[COL_NAME_REL_TYPE] == relationship_type
    if not is_requested_type.any():
        raise ValueError(f"'{relationship_type}' is not a valid relationship type in the file({filename}).")

    is_breakup = df[COL_NAME_TERMINATION] == TR_BROKEUP

    # Select with one combined mask and take an explicit copy so that adding the
    # duration column below writes to our own dataframe rather than to a slice
    # of the one read from disk (avoids pandas' SettingWithCopyWarning).
    df = df[is_requested_type & is_breakup].copy()

    df[TMP_COL_NAME_DURATION] = df[COL_NAME_END_TYPE_ACT] - df[COL_NAME_START_TIME]

    return df


def plot_relationship_duration_histogram(dir_or_filename: str,
                                         relationship_type: int,
                                         bin_size: float,
                                         expected: list[float] = None,
                                         exp_avg: float = None,
                                         heterogeneity: float = None,
                                         scale: float = None,
                                         show_avg_per_run: bool = False,
                                         img_dir: str = None):
    """
    Plot the relationship duration histogram for the given relationship type and
    show information in the title about the expected Weibull distribution.
    Please note that only relationships that "broke-up" are considered because those are the relationships
    that went to the completion of the drawn duration.  The relationship could have ended prematurely due
    to things like death or a partner migrating away.

    Args:
        dir_or_filename (str, required):
            The directory or filename containing the RelationshipEnd.csv files.

        relationship_type (int, required):
            The type of relationship. Options: 0 (transitory), 1 (informal), 2 (marital), 3 (commercial).

        bin_size (float, required):
            The size of the bins for the histogram.  Must be greater than zero.

        expected (list, optional):
            Expected values for the Weibull distribution.  There must be 16 values.

        exp_avg (float, optional):
            Expected average duration in days.  Will be shown in the title when provided.

        heterogeneity (float, optional):
            Heterogeneity parameter for the Weibull distribution.  Will be shown in the title when provided.

        scale (float, optional):
            Scale parameter for the Weibull distribution.  Will be shown in the title when provided.

        show_avg_per_run (bool, optional):
            Whether to plot a single line that is the average over all runs (files) of each bin,
            instead of one line per file.  Will be shown in the title.

        img_dir (str, optional):
            Directory to save the images. If None, the images will not be saved and a window will be opened.

    Returns:
        None - but image will be saved or window opened.

    Raises:
        ValueError: If 'bin_size' is not greater than zero, if 'expected' does not have 16 values,
            if no files are found, or if a file has no "broke-up" relationships of the given type.
    """

    # I selected 16 bins because it seemed like you saw the distribution well.
    num_bins = 16

    if bin_size <= 0:
        raise ValueError(f"'bin_size' must be greater than zero, but got {bin_size}.")

    if (expected is not None) and (len(expected) != num_bins):
        raise ValueError("The 'expected' Weibull distribution histogram is expected to have 16 values.")

    # -------------------------------------------------------
    # Create the array of bins given the bin size
    # The bins are bin_size, 2*bin_size, ..., 16*bin_size,
    # They are the maximum value of the bin.
    # -------------------------------------------------------
    bins = []
    upper_edge = bin_size
    for _ in range(num_bins):
        bins.append(upper_edge)
        upper_edge = upper_edge + bin_size

    # ------------------------------------------------------------------
    # Create the labels for the bins - "<lower edge>-<upper edge>" where
    # the lower edge of the first bin is zero
    # ------------------------------------------------------------------
    lower_edges = [0] + bins[:-1]
    bin_label_list = [f"{lower}-{upper}" for lower, upper in zip(lower_edges, bins)]

    # -----------------------------------------
    # Get the list of files in the directory
    # If a single file is given, use that file
    # -----------------------------------------
    dir_filenames = helpers.get_filenames(dir_or_filename=dir_or_filename,
                                          file_prefix="RelationshipEnd",
                                          file_extension=".csv")
    if len(dir_filenames) == 0:
        raise ValueError(f"No 'RelationshipEnd' .csv files were found for '{dir_or_filename}'.")

    # --------------------------------------------------------------------------------------
    # Extract the duration data out of each file and determine the histogram
    # of relationship duration.  The histogram is the fraction of relationships in each bin.
    # --------------------------------------------------------------------------------------
    last_bin_index = num_bins - 1
    total_duration = 0
    total_count = 0
    histogram_list = []
    for fn in dir_filenames:
        df = extract_data_for_relationship(filename=fn,
                                           relationship_type=relationship_type)
        durations = df[TMP_COL_NAME_DURATION]
        file_count = len(durations)
        if file_count == 0:
            # Without this check the fractions below would divide by zero.
            raise ValueError(f"No relationships of type '{relationship_type}' ended with termination"
                             f" reason '{TR_BROKEUP}' in the file({fn}).")

        count_histogram = [0] * num_bins
        for duration in durations:
            total_duration = total_duration + duration
            # First bin whose upper edge is above the duration; anything at or
            # beyond the largest edge is collected in the last bin.
            bin_index = next((index for index, upper in enumerate(bins) if duration < upper),
                             last_bin_index)
            count_histogram[bin_index] += 1

        histogram_list.append([count / file_count for count in count_histogram])
        total_count = total_count + file_count

    # Calculate average for all relationships in all files
    act_avg = total_duration / total_count

    # ------------------------------------------------------------------------------------
    # Create the dataframe to plot with the bin labels as the index.  If not showing the
    # average, there is one column for each file.  If showing the average, there is one
    # column with the average over the files of each bin.
    # ------------------------------------------------------------------------------------
    if show_avg_per_run:
        num_runs = len(histogram_list)
        columns = {
            TMP_COL_NAME_AVG_DUR: [sum(fractions) / num_runs for fractions in zip(*histogram_list)]
        }
    else:
        columns = {"Duration-" + str(hist_index): histogram
                   for hist_index, histogram in enumerate(histogram_list)}
    df_hist = pd.DataFrame(columns, index=pd.Index(bin_label_list, name="Label"))

    # -------------------------------------------------------------------------
    # Create the expected dataframe - Show Weibull distribution that the model
    # should have duplicated.
    # -------------------------------------------------------------------------
    expected_df = None
    if expected is not None:
        expected_df = pd.DataFrame({"Expected Duration": list(expected)}, index=df_hist.index)

    # ------------------------------
    # Create the title for the plot
    # ------------------------------
    rel_str = {1: "INFORMAL", 2: "MARITAL", 3: "COMMERCIAL"}.get(relationship_type, "TRANSITORY")

    title = ""
    if show_avg_per_run:
        title = title + "Average Duration per Run - "
    title = title + f"Relationship Duration Histogram - {rel_str}"

    # The Weibull parameters are optional, so only the ones that were provided
    # are formatted (formatting None with ':0.2f' raises a TypeError).
    title2_parts = []
    if scale is not None:
        title2_parts.append(f"Scale={scale:0.2f}")
    if heterogeneity is not None:
        title2_parts.append(f"Hetero={heterogeneity:0.2f}")
    if exp_avg is not None:
        title2_parts.append(f"Exp Avg={exp_avg:0.2f}")
    if title2_parts:
        title2_parts.insert(0, "Weibull Distribution")
    title2_parts.append(f"Act Avg={act_avg:0.2f}")
    title2 = " - ".join(title2_parts)

    # ---------------
    # Create the plot
    # ---------------
    xy_plot.xy_plot(img_dir=img_dir,
                    df=df_hist,
                    expected_df=expected_df,
                    title_1=title,
                    title_2=title2,
                    x_axis_name="Duration (days)",
                    y_axis_name="Fraction of Relationships",
                    show_legend=show_avg_per_run,
                    show_markers=show_avg_per_run,
                    fraction_of_total=False,
                    min_x=None, max_x=None, min_y=None, max_y=None,
                    x_axis_as_log_scale=False,
                    y_axis_as_log_scale=False)


# ------------------------------------------------------------------------------------------------
# Per relationship type: the integer type code, the histogram bin size, the expected histogram
# and the parameters shown in the plot title.
# Expected array are the values of the Weibull distribution with the given scale and heterogeneity
# One can generate these expected values using the C++ code in PrngTest.cpp
# ------------------------------------------------------------------------------------------------
_EXPECTED_WEIBULL_BY_TYPE = {
    "transitory": {
        "rel_type": 0,
        "bin_size": 200,
        "expected": [0.401084, 0.291050, 0.160738, 0.080045,
                     0.037918, 0.016957, 0.007118, 0.003026,
                     0.001270, 0.000482, 0.000210, 0.000071,
                     0.000020, 0.000006, 0.000000, 0.000000],
        "hetero": 0.833333333,
        "scale": 0.956774771214,
        "exp_avg": 328,
    },
    "informal": {
        "rel_type": 1,
        "bin_size": 200,
        "expected": [0.159120, 0.196236, 0.174445, 0.140089,
                     0.105469, 0.075556, 0.052367, 0.035005,
                     0.023288, 0.015160, 0.009357, 0.005755,
                     0.003228, 0.002049, 0.001199, 0.000774],
        "hetero": 0.75,
        "scale": 2.03104913138,
        "exp_avg": 681,
    },
    "marital": {
        "rel_type": 2,
        "bin_size": 1500,
        "expected": [0.076216, 0.125540, 0.137884, 0.132489,
                     0.119088, 0.100110, 0.081648, 0.063388,
                     0.048465, 0.035221, 0.025370, 0.018314,
                     0.012766, 0.008317, 0.005764, 0.003480],
        "hetero": 0.666666667,
        "scale": 22.154455184937,
        "exp_avg": 7299,
    },
    "commercial": {
        "rel_type": 3,
        "bin_size": 3,
        "expected": [0.348380, 0.227677, 0.147723, 0.096667,
                     0.062807, 0.040239, 0.026340, 0.017745,
                     0.011341, 0.007405, 0.004911, 0.002995,
                     0.001882, 0.001295, 0.000901, 0.000646],
        "hetero": 1.0,
        "scale": 0.01917808219,
        "exp_avg": 7.0,
    },
}


def plot_relationship_duration_histogram_with_expected(dir_or_filename: str,
                                                       relationship_type: str = "transitory",
                                                       show_avg_per_run: bool = False,
                                                       show_expected: bool = False,
                                                       img_dir: str = None):
    """
    Plot the relationship duration histogram for the given relationship type.
    Please note that only relationships that "broke-up" are considered because those are the relationships
    that went to the completion of the drawn duration.  The relationship could have ended prematurely due
    to things like death or a partner migrating away.

    Args:
        dir_or_filename (str, required):
            The directory or filename containing the RelationshipEnd.csv files.

        relationship_type (str, optional):
            The type of relationship. Options: transitory, informal, marital, commercial.
            Default is "transitory".

        show_avg_per_run (bool, optional):
            Whether to show the average duration per run.
            Default is False.

        show_expected (bool, optional):
            Whether to show the expected Weibull distribution.
            Default is False.

        img_dir (str, optional):
            Directory to save the images.  If None, the images will not be saved and a window will be opened.
            Default is none - don't save image and open a window.

    Returns:
        None - but image will be saved or window opened.

    Raises:
        ValueError: If 'relationship_type' is not one of the supported options.
    """
    try:
        params = _EXPECTED_WEIBULL_BY_TYPE[relationship_type]
    except (KeyError, TypeError):
        # TypeError covers unhashable arguments so every bad value gives the same error.
        raise ValueError(f"Unknown relationship type = {relationship_type}.") from None

    # Hand out a copy so the shared table can never be modified by the callee.
    expected = list(params["expected"]) if show_expected else None

    plot_relationship_duration_histogram(dir_or_filename=dir_or_filename,
                                         relationship_type=params["rel_type"],
                                         bin_size=params["bin_size"],
                                         expected=expected,
                                         exp_avg=params["exp_avg"],
                                         heterogeneity=params["hetero"],
                                         scale=params["scale"],
                                         show_avg_per_run=show_avg_per_run,
                                         img_dir=img_dir)


if __name__ == '__main__':
    import argparse

    # Command line interface: one positional path, an optional output directory,
    # the relationship type name and two on/off flags.
    cli = argparse.ArgumentParser()
    cli.add_argument('dir_or_filename', type=str, nargs=1,
                     help='A directory with RelationshipEnd.csv files or a single file.')
    cli.add_argument('-o', '--output', default=None,
                     help='If provided, a directory will be created and images saved to the folder.  If not provided, it opens windows.')
    cli.add_argument('-t', '--type_of_relationship', default='marital',
                     help='Options: transitory, informal, marital, commercial')
    cli.add_argument('-m', '--mean', action='store_true',
                     help='Gives the average/mean of each run for that bin.')
    cli.add_argument('-x', '--expected', action='store_true',
                     help='Show the expected Weibull distribution.')

    options = cli.parse_args()

    # nargs=1 makes argparse store the positional argument as a one-element list.
    (input_path,) = options.dir_or_filename

    # Reject unknown type names before doing any work.
    if options.type_of_relationship not in ("transitory", "informal", "marital", "commercial"):
        raise ValueError("Unknown relationship type = " + options.type_of_relationship)

    plot_relationship_duration_histogram_with_expected(dir_or_filename=input_path,
                                                       relationship_type=options.type_of_relationship,
                                                       show_avg_per_run=options.mean,
                                                       show_expected=options.expected,
                                                       img_dir=options.output)