# Source code for covid19_data_analyzer.data_functions.analysis.factory_functions

from typing import Callable, Dict, Iterable, Tuple, Union
import itertools

import numpy as np
import pandas as pd

import lmfit

from covid19_data_analyzer.data_functions.data_utils import (
    get_fit_param_results_row,
    params_to_df,
)

from covid19_data_analyzer.data_functions.scrapers import ALLOWED_SOURCES, get_data
from covid19_data_analyzer.data_functions.data_utils import (
    get_data_path,
    get_infectious,
)


def fit_data_model(
    covid19_region_data: pd.DataFrame,
    model: lmfit.Model,
    data_set: str = "confirmed",
    init_params: dict = {},
    free_var_name: str = "x",
) -> Dict[str, Union[lmfit.model.ModelResult, pd.DataFrame]]:
    """
    Fit an lmfit model to one data column of a single region's covid19 data.

    The free variable handed to the model is the row position
    (``0 .. n_rows - 1``) of the data after its index has been reset.

    Parameters
    ----------
    covid19_region_data : pd.DataFrame
        covid19 DataFrame for one region. It is copied, the caller's
        DataFrame is not modified.
    model : lmfit.Model
        Model to fit. It is called as
        ``model.fit(y, **{free_var_name: x, **init_params})``.
    data_set : str, optional
        Name of the column which should be fitted, e.g.
        "confirmed", "recovered" or "deaths", by default "confirmed"
    init_params : dict, optional
        Initial parameters for the fit, passed to ``model.fit`` as keyword
        arguments. They depend on the model, by default {}
    free_var_name : str, optional
        Name of the free variable used by the model, by default "x"

    Returns
    -------
    Dict[str, Union[lmfit.model.ModelResult, pd.DataFrame]]
        Result dict with keys "model_result" and "plot_data".

        model_result: lmfit.model.ModelResult
            result of the fit, with optimized parameters
        plot_data: pd.DataFrame
            Copy of covid19_region_data with a reset index and an added
            column ``f"fitted_{data_set}"`` holding the best fit values.

    Raises
    ------
    KeyError
        If ``data_set`` is not a column of ``covid19_region_data``.
    """
    # Copy + fresh 0..n-1 index, so the caller's frame stays untouched.
    region_data = covid19_region_data.copy().reset_index(drop=True)
    # Row positions act as the free variable of the model.
    x = np.arange(region_data.shape[0])
    y = region_data[data_set].values
    # init_params is only unpacked (never mutated); entries in it take
    # precedence over the free variable if the names collide.
    fit_kwargs = {free_var_name: x, **init_params}
    result = model.fit(y, **fit_kwargs)
    region_data[f"fitted_{data_set}"] = result.best_fit
    return {"model_result": result, "plot_data": region_data}


def calc_extrema(
    x: np.ndarray,
    func: Callable,
    param_df: pd.DataFrame,
    func_options: dict = {},
    brute_force_extrema: bool = False,
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Calculates the supremum and infimum of a given function func,
    with the parameters and their errors given by param_df, over the values of x.

    Parameters
    ----------
    x : np.ndarray
        Values the supremum and infimum should be calculated over
    func : Callable
        Function used to calculate the supremum and infimum. It is called as
        ``func(x, **parameters, **func_options)``.
    param_df : pd.DataFrame
        DataFrame indexed by parameter name, with the columns "value" and
        "stderr".
    func_options : dict, optional
        Additional keyword arguments for func; they take precedence over
        parameters of the same name, by default {}
    brute_force_extrema : bool, optional
        Whether or not to calculate supremum and infimum from all combinations
        of adding and subtracting the errors from the parameters.
        For some functions, i.e. the logistic curve, this is needed, since simply
        adding or subtracting the errors from the parameters can lead to the
        supremum and/or infimum crossing the result with the exact
        parameters, by default False

    Returns
    -------
    Tuple[np.ndarray, np.ndarray]
        supremum, infimum

        With ``brute_force_extrema=True`` both are numpy arrays holding the
        pointwise maximum/minimum over all ``2 ** n_params`` sign
        combinations. Otherwise they are the plain return values of
        ``func`` evaluated at ``value + stderr`` and ``value - stderr``.
    """
    if brute_force_extrema:
        names = list(param_df.index)
        values = param_df["value"].to_numpy()
        errors = param_df["stderr"].to_numpy()
        curves = []
        # Every parameter is shifted by +stderr or -stderr; evaluate func for
        # each of the 2**n sign combinations.
        for signs in itertools.product((1, -1), repeat=len(names)):
            shifted_params = {
                name: value + sign * error
                for name, value, error, sign in zip(names, values, errors, signs)
            }
            curves.append(np.asarray(func(x, **{**shifted_params, **func_options})))
        # One row per combination, one column per x value. The pandas
        # reductions skip NaN entries, exactly like the column-wise max/min
        # this replaces.
        curve_df = pd.DataFrame(curves)
        # Convert to numpy so both branches hand back the documented type.
        supremum = curve_df.max().to_numpy()
        infimum = curve_df.min().to_numpy()
    else:
        supremum_params = (param_df.value + param_df.stderr).to_dict()
        infimum_params = (param_df.value - param_df.stderr).to_dict()
        supremum = func(x, **{**supremum_params, **func_options})
        infimum = func(x, **{**infimum_params, **func_options})
    return supremum, infimum


def predict_trend(
    fit_result: Dict[str, Union[lmfit.model.ModelResult, pd.DataFrame]],
    days_to_predict: int = 30,
    func_options: dict = {},
    param_inverted_stderr: Iterable[str] = [],
    brute_force_extrema: bool = False,
) -> pd.DataFrame:
    """
    Generic function to predict a trend from fitted data

    The model function is evaluated at the free-variable values directly
    following the fitted range (``ndata .. ndata + days_to_predict - 1``),
    and those values are mapped onto the calendar days following the last
    date in the fitted data.

    Parameters
    ----------
    fit_result : Dict[str, Union[lmfit.model.ModelResult, pd.DataFrame]]
        result of fit_data_model or its implementation
    days_to_predict : int, optional
        number of days to predict a trend for, by default 30
    func_options : dict, optional
        options for the function of model, by default {}
    param_inverted_stderr : Iterable[str], optional
        iterable of parameter names whose stderr should be inverted
        to calculate the extrema; passed on to params_to_df, by default []
    brute_force_extrema : bool, optional
        Whether or not to calculate supremum and infimum from all combinations
        of adding and subtracting the errors from the parameters.
        For some functions, i.e. the logistic curve, this is needed, since simply
        adding or subtracting the errors from the parameters can lead to the
        supremum and/or infimum crossing the result with the exact
        parameters, by default False


    Returns
    -------
    pd.DataFrame
        DataFrame with columns "date", "trend", "trend_sup" and "trend_inf",
        indexed by the free-variable values used for the prediction.

        date: pd.Datetime
            date of the values, starting one day after the last fitted date
        trend: float
            predicted trend
        trend_sup: float
            supremum of the trend
        trend_inf: float
            infimum of the trend

    See Also
    --------
    fit_data_model
    calc_extrema
    params_to_df
    """
    model_result = fit_result["model_result"]
    func = model_result.model.func
    n_fitted = model_result.ndata
    # Continue the free variable right where the fitted data stopped.
    x = np.arange(n_fitted, n_fitted + days_to_predict)
    param_df = params_to_df(
        model_result.params, param_inverted_stderr=param_inverted_stderr
    )
    params = param_df.value.to_dict()
    # x[0] == n_fitted is the first day AFTER the last observation, so the
    # calendar offsets are 1..days_to_predict. Using ``x + 1`` as the offset
    # would push every predicted date a further ``n_fitted`` days ahead.
    # NOTE(review): assumes one row per day in the fitted data - confirm.
    last_date = fit_result["plot_data"].date.max()
    date = last_date + pd.to_timedelta(np.arange(1, days_to_predict + 1), unit="D")
    trend = func(x, **{**params, **func_options})
    sup, inf = calc_extrema(
        x,
        func,
        param_df,
        func_options=func_options,
        brute_force_extrema=brute_force_extrema,
    )
    # Plain arrays plus an explicit index: no pandas index alignment can
    # reorder or NaN-fill the columns, whatever container calc_extrema returns.
    return pd.DataFrame(
        {
            "date": date,
            "trend": np.asarray(trend),
            "trend_sup": np.asarray(sup),
            "trend_inf": np.asarray(inf),
        },
        index=x,
    )


def fit_subsets(
    fit_function: Callable,
    covid19_data: pd.DataFrame,
    row: pd.Series,
    subsets: Iterable,
    data_source: str,
    fit_func_kwargs: dict = {},
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Function to fit the subsets of a regional covid data

    Each subset is fitted on its own. A subset whose fit raises a
    ``ValueError`` or ``TypeError`` is reported via ``print`` and skipped.

    Parameters
    ----------
    fit_function : Callable
        Implementation of a model with fit_data_model. It is called as
        ``fit_function(covid19_data, parent_region, region, subset,
        **fit_func_kwargs)`` and has to return a dict with a "plot_data"
        DataFrame containing the columns "date", "region", "parent_region"
        and ``f"fitted_{subset}"``.
    covid19_data : pd.DataFrame
        Full covid19 data from a data_source
    row : pd.Series
        Row of a dataframe only containing unique value pairs for
        "region" and "parent_region"
    subsets : Iterable
        Iterable of subset names
    data_source : str
        name of the data source, only needed to print debug information
    fit_func_kwargs : dict, optional
        Additional kwargs passed to fit_function, by default {}

    Returns
    -------
    Tuple[pd.DataFrame, pd.DataFrame]
        fitted_region_plot_data, fitted_param_subset

        fitted_region_plot_data:
            actual fitted data, which can be used for plotting, with one
            column per successfully fitted subset
        fitted_param_subset:
            fit parameters for a region, one row per fitted subset

        Two empty DataFrames are returned if no subset could be fitted.

    See Also
    --------
    fit_data_model
    fit_regions
    batch_fit_model
    """
    region = row.region
    parent_region = row.parent_region
    key_columns = ["date", "region", "parent_region"]
    # DataFrame.append was removed in pandas 2.0, so the parameter rows are
    # collected here and concatenated once at the end.
    param_frames = []
    fitted_region_plot_data = None
    print(f"Fitting data for: {region}, from {data_source}")
    for subset in subsets:
        try:
            fit_result = fit_function(
                covid19_data, parent_region, region, subset, **fit_func_kwargs
            )
            fit_param_row = get_fit_param_results_row(
                region, parent_region, subset, fit_result
            )
            # presumably a dict or Series describing one row (verify against
            # get_fit_param_results_row); a DataFrame is passed through as is.
            if isinstance(fit_param_row, pd.DataFrame):
                param_frames.append(fit_param_row)
            else:
                param_frames.append(pd.DataFrame([fit_param_row]))
            fitted_data_column = f"fitted_{subset}"
            # Expose the fitted values under the plain subset name.
            plot_data = fit_result["plot_data"][
                key_columns + [fitted_data_column]
            ].rename(columns={fitted_data_column: subset})
            if fitted_region_plot_data is None:
                fitted_region_plot_data = plot_data
            else:
                fitted_region_plot_data = pd.merge(
                    fitted_region_plot_data, plot_data, on=key_columns,
                )
        except (ValueError, TypeError):
            print(f"Error fitting data for: {region} {subset}, from {data_source}")
    if fitted_region_plot_data is None:
        return pd.DataFrame(), pd.DataFrame()
    fitted_param_subset = pd.concat(param_frames, ignore_index=True)
    return fitted_region_plot_data, fitted_param_subset


def _concat_non_empty(frames: Iterable[pd.DataFrame]) -> pd.DataFrame:
    """Stack all non-empty frames with a fresh index; empty DataFrame if none."""
    non_empty = [frame for frame in frames if not frame.empty]
    if not non_empty:
        return pd.DataFrame()
    return pd.concat(non_empty, ignore_index=True)


def fit_regions(
    covid19_data: pd.DataFrame,
    fit_function: Callable,
    data_source: str,
    fit_func_kwargs: dict = {},
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Function to fit all regions of a covid dataset

    Every column of covid19_data named "confirmed", "deaths" or "recovered"
    is fitted for every region via fit_subsets, and the per-region results
    are stacked and sorted.

    Parameters
    ----------
    covid19_data : pd.DataFrame
        Full covid19 data from a data_source, needs the columns
        "region" and "parent_region"
    fit_function : Callable
        Implementation of a model with fit_data_model
    data_source : str
        name of the data source, only needed to print debug information
    fit_func_kwargs : dict, optional
        Additional kwargs passed to fit_function, by default {}

    Returns
    -------
    Tuple[pd.DataFrame, pd.DataFrame]
        fitted_plot_data, fitted_param_results

        fitted_plot_data:
            actual fitted data, which can be used for plotting,
            sorted by "date", "parent_region" and "region"
        fitted_param_results:
            fit parameters, sorted by "parent_region" and "region"

    See Also
    --------
    fit_subsets
    batch_fit_model
    """
    subset_selector = covid19_data.columns.isin(["confirmed", "deaths", "recovered"])
    subsets = covid19_data.columns[subset_selector]
    # NOTE(review): duplicates are dropped on "region" alone, so regions which
    # share a name under different parent regions collapse to the first
    # occurrence - confirm this is intended.
    regions_df = covid19_data[["region", "parent_region"]].drop_duplicates("region")
    # DataFrame.append was removed in pandas 2.0; collect the per-region
    # frames and concatenate them once after the loop instead.
    param_frames = []
    plot_frames = []
    for _, row in regions_df.iterrows():
        fitted_region_plot_data, fitted_param_subset = fit_subsets(
            fit_function=fit_function,
            covid19_data=covid19_data,
            row=row,
            subsets=subsets,
            data_source=data_source,
            fit_func_kwargs=fit_func_kwargs,
        )
        param_frames.append(fitted_param_subset)
        plot_frames.append(fitted_region_plot_data)

    fitted_param_results = _concat_non_empty(param_frames)
    fitted_plot_data = _concat_non_empty(plot_frames)

    # The return value is not used; presumably get_infectious extends
    # fitted_plot_data in place - verify against its implementation.
    get_infectious(fitted_plot_data)

    fitted_param_results.sort_values(["parent_region", "region"], inplace=True)
    fitted_plot_data.sort_values(["date", "parent_region", "region"], inplace=True)
    return fitted_plot_data, fitted_param_results


def batch_fit_model(
    fit_function: Callable, model_name: str, fit_func_kwargs: dict = {},
) -> None:
    """
    Generic function to fit a fit_function to the data of all data sources and
    save them to file

    For every entry of ALLOWED_SOURCES the data is loaded with get_data,
    fitted with fit_regions and written to two csv files (without index),
    whose locations are resolved by get_data_path:

    - ``{data_source}/{model_name}_model_fit_plot_data.csv``
    - ``{data_source}/{model_name}_model_fit_params.csv``

    Parameters
    ----------
    fit_function : Callable
        Implementation of a model with fit_data_model
    model_name : str
        Name of the model which is fitted, used to generate the path
    fit_func_kwargs : dict, optional
        Additional kwargs passed to fit_function, by default {}

    See Also
    --------
    fit_subsets
    fit_regions
    """
    for data_source in ALLOWED_SOURCES:
        covid19_data = get_data(data_source)
        fitted_plot_data, fitted_param_results = fit_regions(
            covid19_data=covid19_data,
            fit_function=fit_function,
            data_source=data_source,
            fit_func_kwargs=fit_func_kwargs,
        )
        # Both output files of a source live next to each other and are
        # named after the fitted model.
        fitted_plot_data_path = get_data_path(
            f"{data_source}/{model_name}_model_fit_plot_data.csv"
        )
        fitted_param_results_path = get_data_path(
            f"{data_source}/{model_name}_model_fit_params.csv"
        )

        fitted_param_results.to_csv(fitted_param_results_path, index=False)
        fitted_plot_data.to_csv(fitted_plot_data_path, index=False)