Source code for emodpy_malaria.weather.weather_data

#!/usr/bin/env python3

"""
Weather data module implementing functionality for working with binary weather files (.bin.json).
"""

from __future__ import annotations

import numpy as np
import pandas as pd

from pathlib import Path
from typing import Dict, Iterable, List, NoReturn, Tuple, Union


from emodpy_malaria.weather.weather_utils import hash_series, invert_dict, make_path
from emodpy_malaria.weather.weather_variable import WeatherVariable
from emodpy_malaria.weather.weather_metadata import WeatherMetadata, WeatherAttributes, SERIES_BYTE_VALUE_SIZE


[docs]class WeatherData:
    """
    Functionality for working with binary weather files (.bin.json).
    """

    def __init__(self, data: np.ndarray, metadata: WeatherMetadata = None):
        """
        Instantiate a weather object from a data numpy array and a weather metadata object.

        Args:
            data: Numpy array of unique weather time series, in the order they appear in a .bin file.
                The shape can either be a single-dimension array or a 2d array with series stored as rows.
                This means that the number of rows corresponds to the number of unique series and the
                number of columns corresponds to the series length (e.g., 365).
            metadata: (Optional) WeatherMetadata object containing metadata from .bin.json.
        """
        data = self._ensure_data_type(data)
        self._data: np.ndarray = data
        if metadata is not None:
            # If metadata is provided, ensure the data shape matches metadata info.
            self._metadata = metadata
            expected_shape = self._expected_shape()
            if data.shape != expected_shape:
                self._data = data.reshape(expected_shape)
        else:
            # If a metadata object is not provided, data must already be in the correct 2d shape.
            self._metadata = WeatherMetadata(node_ids=list(range(1, data.shape[0] + 1)),
                                             series_len=data.shape[1])

        self.validate()

    def __eq__(self, other: WeatherData):
        """Equality operator for WeatherData objects."""
        meta_eq = self.metadata == other.metadata
        data_eq = np.array_equal(self.data, other.data)
        return meta_eq and data_eq

    def _expected_shape(self) -> Tuple[int, int]:
        """Return the expected shape of the data numpy array, based on series count and length."""
        return self.metadata.series_unique_count, self.metadata.series_len
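
    # Illustrative usage sketch (not part of the original module): constructing a
    # WeatherData object directly from a 2d float array. With no metadata supplied,
    # node ids 1..N are inferred from the row count and the series length from the
    # column count, per __init__ above.
    #
    #   >>> import numpy as np
    #   >>> raw = np.array([[25.1, 25.3, 24.9], [30.0, 30.2, 29.8]], dtype=np.float32)
    #   >>> wd = WeatherData(data=raw)
    #   >>> wd.metadata.series_len
    #   3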

[docs]    def validate(self):
        """Validate the data and metadata relationship."""
        expected_shape = self._expected_shape()
        assert self._data.shape == expected_shape, "Data numpy array shape does not match metadata counts."

    @property
    def metadata(self) -> WeatherMetadata:
        """Metadata property, exposing the weather metadata object."""
        return self._metadata

    @property
    def data(self) -> np.ndarray:
        """Raw data, reshaped into one row per node weather time series."""
        return self._data

    # Import/Export members

[docs]    @classmethod
    def from_dict(cls,
                  node_series: Dict[int, Union[np.ndarray[np.float32], List[float]]],
                  same_nodes: Dict[int, List[int]] = None,
                  attributes: WeatherAttributes = None) -> WeatherData:
        """
        Create a WeatherData object from a dictionary mapping nodes to node weather time series.
        The method identifies unique node weather time series and produces a corresponding
        node-offset dictionary.

        Args:
            node_series: Dictionary with node ids as keys and weather time series as values
                (the series don't have to be unique).
            same_nodes: (Optional) Dictionary mapping nodes from the 'node_series' dictionary to
                additional nodes whose series are the same. Keys are node ids, values are lists of node ids.
            attributes: (Optional) Attributes used to initiate weather metadata. If not provided, defaults are used.

        Returns:
            WeatherData object.
        """
        # Initialize
        if not isinstance(node_series, Dict) or len(node_series) == 0:
            exception = TypeError if not isinstance(node_series, Dict) else ValueError
            raise exception("The node time series argument must be a non-empty dictionary of node ids and time series.")

        # Check weather time series by converting to an array and validating the shape.
        try:
            series_values = np.array(list(node_series.values()), dtype=np.float32)
        except ValueError as ex:
            raise ValueError("Time series contains values which are not numbers.") from ex

        if np.any(np.isinf(np.abs(series_values))):
            raise ValueError("Time series contains 'inf' values which indicates failed conversion into np.float32.")

        if len(series_values.shape) != 2:
            raise ValueError("Time series must be non-empty lists or arrays of float or integer values. "
                             "All time series must be of the same length.")

        # Check there are no NaN values in node ids.
        if any(np.isnan(list(node_series))):
            raise ValueError("Node id list contains 'NaN' values.")

        # Check there are no NaN values in weather time series.
        if any(np.isnan(series_values.reshape(-1))):
            raise ValueError("Time series contains 'NaN' values.")

        same_nodes = same_nodes or {}

        # Identify unique node weather time series; make sure node ids are int.
        node_series_hashes = {int(n): hash_series(s) for n, s in node_series.items()}   # Create node->hash dict
        unique_nodes = {h: nn[0] for h, nn in invert_dict(node_series_hashes).items()}  # Invert into hash->nodes
        unique_series = [node_series[n] for n in unique_nodes.values()]                 # List unique time series

        # Calculate the offset increment per node as time series length x number of bytes per value.
        offset_increment = series_values.shape[1] * SERIES_BYTE_VALUE_SIZE

        # Create a node->offset dict for nodes with unique weather time series.
        node_offsets = {n: (i * offset_increment) for i, n in enumerate(unique_nodes.values())}
        # Update the node->offset dict, adding nodes sharing the same offsets.
        node_offsets.update({n: node_offsets[unique_nodes[h]] for n, h in node_series_hashes.items()})

        # Add other nodes, if specified.
        # Invert dict from "unique node"->"list of nodes with that same offset" to "...same..."->"unique node".
        same_nodes = invert_dict(same_nodes, single_value=True)
        node_offsets.update({same: node_offsets[unique] for same, unique in same_nodes.items()})

        # Sort by node, offset.
        node_offsets = dict(sorted(node_offsets.items()))

        # Convert the list of weather time series into a NumPy array and init WeatherMetadata and WeatherData objects.
        data = np.array(unique_series, dtype=np.float32)
        wm = WeatherMetadata(node_ids=node_offsets, series_len=data.shape[1], attributes=attributes)
        wd = WeatherData(data=data, metadata=wm)

        return wd
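
    # Illustrative usage sketch (not part of the original module): nodes 1 and 2
    # share an identical series, so from_dict stores it once and points both node
    # ids at the same offset; node 4 is mapped onto node 1's series via 'same_nodes'.
    #
    #   >>> series = {1: [20.0, 21.0], 2: [20.0, 21.0], 3: [25.0, 26.0]}
    #   >>> wd = WeatherData.from_dict(node_series=series, same_nodes={1: [4]})
    #   >>> wd.data.shape   # only the two unique series are stored
    #   (2, 2)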

[docs]    def to_dict(self, only_unique_series=False, copy_data: bool = True) -> Dict[int, np.ndarray[np.float32]]:
        """
        Create a node-to-series dictionary from the current object. This method can be used to edit weather data.

        Args:
            only_unique_series: (Optional) Flag controlling whether the output dictionary contains
                only the unique series (if set to True) or series for all nodes (the default).
            copy_data: (Optional) Flag indicating whether to copy the data numpy array to prevent
                unintentional changes.

        Returns:
            A dictionary with node ids as keys and node weather time series as values.
        """
        data_dict = {}
        node_groups = self.metadata.offset_nodes.values()
        series_list = np.copy(self._data) if copy_data else self._data
        for ng, s in zip(node_groups, series_list):
            ng = ng[:1] if only_unique_series else ng
            data_dict.update(dict(zip(ng, [s] * len(ng))))

        data_dict = dict(sorted(data_dict.items()))
        return data_dict
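
    # Illustrative round-trip sketch (not part of the original module), given a
    # WeatherData object 'wd' (e.g., built via from_dict above): to_dict exposes
    # the series for editing and from_dict rebuilds a WeatherData object. Assumes
    # hash_series accepts the numpy rows that to_dict returns.
    #
    #   >>> d = wd.to_dict()        # node id -> series (copied by default)
    #   >>> d[3] = d[3] + 1.0       # e.g., shift one node's series
    #   >>> wd2 = WeatherData.from_dict(d)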

[docs]    @classmethod
    def from_csv(cls,
                 file_path: Union[str, Path],
                 info: DataFrameInfo = None,
                 attributes: WeatherAttributes = None) -> WeatherData:
        """
        Create a WeatherData object from a csv file. Used for creating or editing weather files.
        The method identifies unique node weather time series and produces a corresponding
        node-offset dictionary.

        Args:
            file_path: The csv file path from which weather data is loaded (expected columns: node, step, value).
            info: (Optional) Dataframe info object describing dataframe columns and content.
            attributes: (Optional) Attributes used to initiate weather metadata. If not provided, defaults are used.

        Returns:
            WeatherData object.
        """
        assert Path(file_path).is_file(), f"Weather file not found: {file_path}."
        df = pd.read_csv(file_path)
        wd = cls.from_dataframe(df, info=info, attributes=attributes)
        return wd

[docs]    def to_csv(self, file_path: Union[str, Path], info: DataFrameInfo = None) -> pd.DataFrame:
        """
        Create a csv file storing node ids, time steps, and node weather time series as separate columns.

        Args:
            file_path: The csv file path into which weather data will be stored.
            info: (Optional) Dataframe info object describing dataframe columns and content.

        Returns:
            Dataframe created as an intermediate object used to save data to a csv file.
        """
        make_path(Path(file_path).parent)
        df = self.to_dataframe(info=info)
        df.to_csv(file_path, index=False)
        return df
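
    # Illustrative csv round-trip sketch (not part of the original module), given a
    # WeatherData object 'wd'. The default column names come from DataFrameInfo
    # ("nodes", "steps", "values"); from_csv auto-detects them on load.
    #
    #   >>> wd.to_csv("rainfall.csv")
    #   >>> wd2 = WeatherData.from_csv("rainfall.csv")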

[docs]    @classmethod
    def from_dataframe(cls,
                       df: pd.DataFrame,
                       info: DataFrameInfo = None,
                       attributes: WeatherAttributes = None) -> WeatherData:
        """
        Create a WeatherData object from a Pandas dataframe. The dataframe is expected to contain
        node ids, time steps, and node weather time series as separate columns.

        Args:
            df: Dataframe containing nodes and weather time series (expected columns: node, step, value).
            info: (Optional) Dataframe info object describing dataframe columns and content.
            attributes: (Optional) Attributes used to initiate weather metadata. If not provided, defaults are used.

        Returns:
            WeatherData object.
        """
        if not isinstance(df, pd.DataFrame) or len(df) == 0:
            exception = TypeError if not isinstance(df, pd.DataFrame) else ValueError
            raise exception("df argument must be a non-empty pandas DataFrame")

        info = info or DataFrameInfo.detect_columns(df=df)
        nc, sc, vc = [info.node_column, info.step_column, info.value_column]

        # Test for 'NaN' values in target columns.
        for c in [nc, sc, vc]:
            if df[c].hasnans:
                raise ValueError(f"Column {c} contains 'NaN' values.")

        df = df[[nc, sc, vc]].sort_values(by=[nc, sc])
        df = df[[nc, vc]].set_index(nc)
        node_series = df.groupby(nc).apply(lambda r: r.to_dict("records")).to_dict()
        node_series = {node: [list(d.values())[0] for d in rw] for node, rw in node_series.items()}
        wd = cls.from_dict(node_series=node_series, attributes=attributes)
        return wd

[docs]    def to_dataframe(self, info: DataFrameInfo = None) -> pd.DataFrame:
        """
        Create a dataframe containing node ids, time steps, and weather time series as separate columns.

        Args:
            info: (Optional) Dataframe info object describing dataframe columns and content.

        Returns:
            Dataframe containing node ids and weather time series.
        """
        info = info or DataFrameInfo()
        data_dict = self.to_dict(only_unique_series=info.only_unique_series)
        actual_nodes = list(data_dict.keys())
        series_len = self.metadata.series_len

        nodes = np.repeat(actual_nodes, series_len)
        steps = list(range(1, series_len + 1)) * len(actual_nodes)
        values = np.array(list(data_dict.values())).reshape(len(data_dict) * self.metadata.series_len)

        assert len(nodes) == len(steps) == len(values), "Dataframe series lengths don't match"
        if len(data_dict) > 1:  # Skip validation in the case of a single node.
            assert steps[:series_len] == steps[series_len:series_len * 2], "Steps series is not valid."

        column_series_dict = {info.node_column: nodes, info.step_column: steps, info.value_column: values}
        df = pd.DataFrame(column_series_dict)

        # Set data types.
        df[info.node_column] = df[info.node_column].astype(int)
        df[info.step_column] = df[info.step_column].astype(int)
        df[info.value_column] = df[info.value_column].astype(np.float32)

        df = df.sort_values(by=[info.node_column, info.step_column])
        return df
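
    # Illustrative sketch (not part of the original module): the resulting frame is
    # in long format, one row per (node, step) pair. E.g., for a WeatherData object
    # 'wd' with 2 nodes x 3 steps and default column names:
    #
    #   >>> df = wd.to_dataframe()
    #   >>> list(df.columns)
    #   ['nodes', 'steps', 'values']
    #   >>> len(df)     # node count x series length
    #   6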

[docs]    @classmethod
    def from_file(cls, file_path: Union[str, Path]) -> WeatherData:
        """
        Create a WeatherData object by reading weather data from binary (.bin) and metadata (.bin.json) files.

        Args:
            file_path: The weather binary (.bin) file path. The metadata file path is constructed by appending ".json".

        Returns:
            WeatherData object.
        """
        file_path = str(file_path)
        wm: WeatherMetadata = WeatherMetadata.from_file(f"{file_path}.json")

        assert Path(file_path).is_file(), f"Data file not found: {file_path}."
        data = np.fromfile(file_path, dtype=np.float32)
        data_len = len(data)

        msg = f"Data length {data_len} doesn't match metadata"
        msg += f" ({wm.series_count} * {wm.series_len} = {wm.total_value_count})"
        assert wm.total_value_count == data_len, msg

        wd = WeatherData(data=data, metadata=wm)
        return wd

[docs]    def to_file(self, file_path: Union[str, Path]) -> NoReturn:
        """
        Create weather binary (.bin) and metadata (.json) files, containing weather data and metadata.

        Args:
            file_path: The weather binary (.bin) file path. The metadata file path is constructed by appending ".json".

        Returns:
            None.
        """
        file_path = str(file_path)
        self.validate()
        make_path(Path(file_path).parent)
        self._ensure_data_type(self._data)
        with open(file_path, "wb") as bf:
            self._data.reshape(self.metadata.total_value_count).tofile(bf)

        self._metadata.to_file(f"{file_path}.json")
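
    # Illustrative file round-trip sketch (not part of the original module), given a
    # WeatherData object 'wd'. to_file writes the flat float32 series to the .bin
    # file and the metadata to '<path>.json' alongside it.
    #
    #   >>> wd.to_file("output/weather.bin")   # also writes output/weather.bin.json
    #   >>> wd2 = WeatherData.from_file("output/weather.bin")
    #   >>> np.array_equal(wd.data, wd2.data)
    #   True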

    @classmethod
    def _ensure_data_type(cls, data: Iterable) -> np.ndarray[np.float32]:
        """
        Ensure node weather time series are of a type compatible with the weather binary file format.
        The method validates that the data object is iterable and, if needed, converts it to a NumPy float32 array.

        Args:
            data: Iterable object containing node weather time series. Usually a list or array of float values.

        Returns:
            Node weather time series as a NumPy float32 array.
        """
        is_iter_ok = isinstance(data, Iterable) and len(list(data)) > 0
        assert data is not None and is_iter_ok, "Data must have at least one item"
        data = np.array(data, dtype=np.float32)
        return data

[docs]class DataFrameInfo:
    """
    The object containing info about dataframe columns and content.
    Used to pass dataframe info between methods working with weather dataframes.
    """
    _variable_values = [str(v.value).lower() for v in WeatherVariable.list()]
    _default_column_candidates = {
        "node": ["nodes", "node", "node_id", "node_ids", "nodeid", "id", "ids"],
        "step": ["steps", "step", "time"],
        "value": ["values", "value", "series", "data"] + _variable_values}

    def __init__(self,
                 node_column: str = None,
                 step_column: str = None,
                 value_column: str = None,
                 only_unique_series: bool = False):
        """
        Initialize a dataframe info object. If no info is provided, the defaults are used.

        Args:
            node_column: (Optional) Node column name. The default is "nodes".
            step_column: (Optional) Step column name. The default is "steps".
            value_column: (Optional) Value column name. The default is "values".
            only_unique_series: (Optional) Flag indicating whether only distinct weather time series are needed.
        """
        self._node_column: str = node_column
        self._step_column: str = step_column
        self._value_column: str = value_column
        self.only_unique_series = only_unique_series
        self._set_defaults()

    def __str__(self) -> str:
        """String representation used to print or debug DataFrameInfo objects."""
        return str(self.__dict__.values())

    def __eq__(self, other: DataFrameInfo):
        """Equality operator for DataFrameInfo objects."""
        if other is None:
            return False

        cols_eq = self._node_column == other.node_column and self._step_column == other.step_column
        cols_eq = cols_eq and self._value_column == other.value_column
        is_eq = cols_eq and self.only_unique_series == other.only_unique_series
        return is_eq

    @property
    def node_column(self):
        return self._node_column

    @property
    def step_column(self):
        return self._step_column

    @property
    def value_column(self):
        return self._value_column

    def _set_defaults(self) -> DataFrameInfo:
        """Initialize any unset column names with the defaults."""
        self._node_column = self.node_column or self._default_column_candidates["node"][0]
        self._step_column = self.step_column or self._default_column_candidates["step"][0]
        self._value_column = self.value_column or self._default_column_candidates["value"][0]
        return self
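
    # Illustrative usage sketch (not part of the original module): a DataFrameInfo
    # for a dataframe whose column names deviate from the defaults; unset names
    # fall back to the defaults via _set_defaults.
    #
    #   >>> info = DataFrameInfo(node_column="node_id", value_column="rainfall")
    #   >>> info.step_column
    #   'steps'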

[docs]    @classmethod
    def detect_columns(cls, df, column_candidates: Dict[str, List[str]] = None) -> DataFrameInfo:
        """
        Auto-detect the required column names (nodes, time steps, and weather time series) for a DataFrameInfo object.

        Args:
            df: The dataframe containing nodes, time steps, and weather time series.
            column_candidates: (Optional) Dictionary of candidate column names to be used instead of the defaults.

        Returns:
            DataFrameInfo object with the detected column names.
        """
        column_candidates = column_candidates or cls._default_column_candidates

        # Detect columns.
        column_types = ["node", "step", "value"]
        columns = [cls._detect_column(df, column_candidates[name]) for name in column_types]
        if not all(columns):
            not_found = [name for name, col in zip(column_types, columns) if col is None]
            raise NameError(f"Unable to detect columns {not_found}.")

        info = DataFrameInfo(*columns)
        return info
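
    # Illustrative sketch (not part of the original module): detection is
    # case-insensitive and matches any candidate name, e.g. "id"/"time"/"data".
    #
    #   >>> df = pd.DataFrame({"ID": [1, 1], "Time": [1, 2], "Data": [1.0, 2.0]})
    #   >>> info = DataFrameInfo.detect_columns(df)
    #   >>> info.node_column, info.step_column, info.value_column
    #   ('ID', 'Time', 'Data')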

    @staticmethod
    def _detect_column(df, column_candidates):
        """
        Detect which of the candidate column names is used in the given dataframe.

        Args:
            df: The dataframe containing nodes, time steps, and weather time series.
            column_candidates: List of candidate names for a single column.

        Returns:
            The detected column name.
        """
        cols = [c for c in df.columns.values if str(c).strip().lower() in column_candidates]
        found_col = None if len(cols) == 0 else cols[0]
        assert found_col is not None and found_col in df.columns.values, "Unable to detect column."
        return found_col