The Two Stages Procedure of Meir and Gorfine (2023) - Exact

pydts.fitters.TwoStagesFitterExact() ¤

Bases: TwoStagesFitter

Source code in src/pydts/fitters.py
def __init__(self):
    super().__init__()
    self.beta_models_params_attr = 'params'
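
A minimal usage sketch (illustrative only, not taken from the package documentation). It assumes a simulated discrete-time competing-risks dataframe with two event types coded 1 and 2, right censoring coded 0, and the default column names:

import numpy as np
import pandas as pd
from pydts.fitters import TwoStagesFitterExact

# Purely illustrative data: 'pid' is the sample ID, 'X' the discrete last follow-up time,
# 'J' the event type (0 = right censored), and Z1, Z2 are covariates.
rng = np.random.default_rng(0)
n = 500
df = pd.DataFrame({
    'pid': np.arange(n),
    'Z1': rng.binomial(1, 0.5, size=n),
    'Z2': rng.normal(size=n),
    'X': rng.integers(low=1, high=6, size=n),
    'J': rng.choice([0, 1, 2], size=n),
})

fitter = TwoStagesFitterExact()
fitter.fit(df=df, event_type_col='J', duration_col='X', pid_col='pid')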

alpha_df = pd.DataFrame() instance-attribute ¤

beta_models = {} instance-attribute ¤

beta_models_params_attr = 'params' instance-attribute ¤

covariates = None instance-attribute ¤

duration_col = None instance-attribute ¤

event_models = {} instance-attribute ¤

event_type_col = None instance-attribute ¤

events = None instance-attribute ¤

expanded_df = pd.DataFrame() instance-attribute ¤

formula = None instance-attribute ¤

pid_col = None instance-attribute ¤

times = None instance-attribute ¤

_alpha_jt(x, df, y_t, beta_j, n_jt, t, event) ¤

Source code in src/pydts/fitters.py
def _alpha_jt(self, x, df, y_t, beta_j, n_jt, t, event):
    # Alpha_jt optimization objective
    partial_df = df[df[self.duration_col] >= t]
    if isinstance(self.covariates, list):
        expit_add = np.dot(partial_df[self.covariates], beta_j)
    elif isinstance(self.covariates, dict):
        expit_add = np.dot(partial_df[self.covariates[event]], beta_j)
    else:
        raise ValueError
    return ((1 / y_t) * np.sum(expit(x + expit_add)) - (n_jt / y_t)) ** 2
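
In words, the returned value is the squared estimating equation for $ \alpha_{jt} $ given the current $ \beta_{j} $: the optimizer chooses $ x = \alpha_{jt} $ to minimize $ \left( \frac{1}{y_t} \sum_{i: X_i \geq t} \mathrm{expit}(x + Z_i^{T} \beta_{j}) - \frac{n_{jt}}{y_t} \right)^2 $, where $ y_t $ is the number of samples still at risk at time $ t $ and $ n_{jt} $ is the number of observed events of type $ j $ at time $ t $.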

_expand_data(df, event_type_col, duration_col, pid_col) ¤

This method expands the raw data as explained in Lee et al. 2018

Parameters:

Name Type Description Default
df DataFrame

Dataframe to expand.

required
event_type_col str

The event type column name (must be a column in df). A right-censored sample i is indicated by event value 0, i.e. df.loc[i, event_type_col] = 0.

required
duration_col str

Last follow up time column name (must be a column in df).

required
pid_col str

Sample ID column name (must be a column in df).

required

Returns:

Type Description
DataFrame

Expanded df (pandas.DataFrame): the expanded dataframe.

Source code in src/pydts/base_fitters.py
def _expand_data(self,
                 df: pd.DataFrame,
                 event_type_col: str,
                 duration_col: str,
                 pid_col: str) -> pd.DataFrame:
    """
    This method expands the raw data as explained in Lee et al. 2018

    Args:
        df (pandas.DataFrame): Dataframe to expand.
        event_type_col (str): The event type column name (must be a column in df),
                              Right censored sample (i) is indicated by event value 0, df.loc[i, event_type_col] = 0.
        duration_col (str): Last follow up time column name (must be a column in df).
        pid_col (str): Sample ID column name (must be a column in df).

    Returns:
        Expanded df (pandas.DataFrame): the expanded dataframe.
    """
    self._validate_cols(df, event_type_col, duration_col, pid_col)
    return get_expanded_df(df=df, event_type_col=event_type_col, duration_col=duration_col, pid_col=pid_col)
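
For reference, the same expansion can be obtained directly from the utility function used here (a sketch, assuming a dataframe df with the default column names):

from pydts.utils import get_expanded_df

expanded_df = get_expanded_df(df=df, event_type_col='J', duration_col='X', pid_col='pid')
# The expanded dataframe has one row per (sample, discrete time at risk) and the binary
# indicator columns j_1, j_2, ... used below for the conditional estimation of beta_j.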

_fit_beta(expanded_df, events, model=CoxPHFitter, model_kwargs={}, model_fit_kwargs={}) ¤

Source code in src/pydts/fitters.py
def _fit_beta(self, expanded_df, events, model=CoxPHFitter, model_kwargs={}, model_fit_kwargs={}):
    # Model fitting for conditional estimation of Beta_j for all events
    _model_kwargs_per_event = np.any([event in model_kwargs.keys() for event in events])
    _model_fit_kwargs_per_event = np.any([event in model_fit_kwargs.keys() for event in events])
    beta_models = {}
    for event in events:
        _model_kwargs = model_kwargs[event] if _model_kwargs_per_event else model_kwargs
        _model_fit_kwargs = model_fit_kwargs[event] if _model_fit_kwargs_per_event else model_fit_kwargs
        beta_models[event] = self._fit_event_beta(expanded_df=expanded_df, event=event,
                                                  model=model, model_kwargs=_model_kwargs,
                                                  model_fit_kwargs=_model_fit_kwargs)
    return beta_models

_fit_event_beta(expanded_df, event, model=ConditionalLogit, model_kwargs={}, model_fit_kwargs={}) ¤

Source code in src/pydts/fitters.py
def _fit_event_beta(self, expanded_df, event, model=ConditionalLogit, model_kwargs={}, model_fit_kwargs={}):
    # Model fitting for conditional estimation of Beta_j for specific event
    if isinstance(self.covariates, dict):
        _covs = self.covariates[event]
    else:
        _covs = self.covariates

    beta_j_model = ConditionalLogit(endog=expanded_df[f'j_{event}'],
                                    exog=expanded_df[_covs],
                                    groups=expanded_df[self.duration_col],
                                    **model_kwargs)

    if ('alpha' in model_fit_kwargs.keys()):
        # Use 0 <= L1_wt <= 1 parameter to switch between L2 (L1_wt = 0) and L1 (L1_wt = 1) or elastic net.
        # alpha is the penalty weight.
        beta_j_model = beta_j_model.fit_regularized(**model_fit_kwargs)
    else:
        beta_j_model = beta_j_model.fit(**model_fit_kwargs)

    return beta_j_model
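
Regularized estimation of the $ \beta_{j} $ coefficients can be requested through the fit_beta_kwargs argument of fit(); the keys mirror the signature above and the penalty values below are illustrative only (a sketch, continuing the earlier example):

# Ridge-type (L2) penalization of each beta_j, forwarded to ConditionalLogit.fit_regularized
# through model_fit_kwargs; alpha is the penalty weight, L1_wt=0 selects the L2 penalty.
fit_beta_kwargs = {
    'model_fit_kwargs': {'alpha': 0.01, 'L1_wt': 0.0},
}
regularized_fitter = TwoStagesFitterExact()
regularized_fitter.fit(df=df, fit_beta_kwargs=fit_beta_kwargs)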

_hazard_inverse_transformation(a) ¤

This function defines the inverse transformation of the hazard function such that \(\lambda_j (t | Z) = h^{-1} ( \alpha_{jt} + Z^{T} \beta_{j} )\)

Parameters:

Name Type Description Default
a Union[int, array, Series, DataFrame]
required

Returns:

Name Type Description
i Union[int, array, Series, DataFrame]

the inverse function applied on a. $ h^{-1} (a) $

Source code in src/pydts/fitters.py
def _hazard_inverse_transformation(self, a: Union[int, np.array, pd.Series, pd.DataFrame]) -> \
        Union[int, np.array, pd.Series, pd.DataFrame]:
    """
    This function defines the inverse transformation of the hazard function such that $\lambda_j (t | Z) = h^{-1} ( \alpha_{jt} + Z^{T} \beta_{j} )$

    Args:
        a (Union[int, np.array, pd.Series, pd.DataFrame]):

    Returns:
        i (Union[int, np.array, pd.Series, pd.DataFrame]): the inverse function applied on a. $ h^{-1} (a) $
    """
    i = expit(a)
    return i

_hazard_transformation(a) ¤

This function defines the transformation of the hazard function such that $ h ( \lambda_j (t | Z) ) = \alpha_{jt} + Z^{T} \beta_{j} $

Parameters:

Name Type Description Default
a Union[int, array, Series, DataFrame]
required

Returns:

Name Type Description
i Union[int, array, Series, DataFrame]

the transformation applied on a: $ h(a) $

Source code in src/pydts/fitters.py
def _hazard_transformation(self, a: Union[int, np.array, pd.Series, pd.DataFrame]) -> \
        Union[int, np.array, pd.Series, pd.DataFrame]:
    """
    This function defines the transformation of the hazard function such that $ h ( \lambda_j (t | Z) ) = \alpha_{jt} + Z^{T} \beta_{j} $

    Args:
        a (Union[int, np.array, pd.Series, pd.DataFrame]):

    Returns:
        i (Union[int, np.array, pd.Series, pd.DataFrame]): the transformation applied on a. $ h(a) $
    """

    i = logit(a)
    return i
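
Together with _hazard_inverse_transformation above, this fixes the link used throughout: $ h(\lambda) = \mathrm{logit}(\lambda) = \log\left( \lambda / (1 - \lambda) \right) $ and $ h^{-1}(a) = \mathrm{expit}(a) = 1 / (1 + e^{-a}) $, so the discrete hazard is modeled as $ \lambda_j (t | Z) = \mathrm{expit}( \alpha_{jt} + Z^{T} \beta_{j} ) $.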

_validate_cols(df, event_type_col, duration_col, pid_col) ¤

Source code in src/pydts/base_fitters.py
def _validate_cols(self, df, event_type_col, duration_col, pid_col):
    assert event_type_col in df.columns, f'Event type column is missing from df: {event_type_col}'
    assert duration_col in df.columns, f'Duration column is missing from df: {duration_col}'
    assert pid_col in df.columns, f'Observation ID column is missing from df: {pid_col}'

_validate_covariates_in_df(df) ¤

Source code in src/pydts/base_fitters.py
def _validate_covariates_in_df(self, df):
    cov_not_fitted = []
    if isinstance(self.covariates, list):
        cov_not_fitted = [cov for cov in self.covariates if cov not in df.columns]
    elif isinstance(self.covariates, dict):
        for event in self.events:
            event_cov_not_fitted = [cov for cov in self.covariates[event] if cov not in df.columns]
            cov_not_fitted.extend(event_cov_not_fitted)
    assert len(cov_not_fitted) == 0, \
        f"Cannot predict - required covariates are missing from df: {cov_not_fitted}"

_validate_t(t, return_iter=True) ¤

Source code in src/pydts/base_fitters.py
def _validate_t(self, t, return_iter=True):
    _t = np.array([t]) if not isinstance(t, Iterable) else t
    t_i_not_fitted = [t_i for t_i in _t if (t_i not in self.times)]
    assert len(t_i_not_fitted) == 0, \
        f"Cannot predict for times which were not included during .fit(): {t_i_not_fitted}"
    if return_iter:
        return _t
    return t

evaluate(test_df, oracle_col='T', **kwargs) ¤

Source code in src/pydts/base_fitters.py
def evaluate(self, test_df: pd.DataFrame, oracle_col: str = 'T', **kwargs) -> float:
    raise NotImplementedError

fit(df, covariates=None, event_type_col='J', duration_col='X', pid_col='pid', skip_expansion=False, x0=0, fit_beta_kwargs={}, verbose=2, nb_workers=WORKERS) ¤

This method fits a model to the discrete data.

Parameters:

Name Type Description Default
df DataFrame

training data for fitting the model

required
covariates list

list of covariates to be used in estimating the regression coefficients

None
event_type_col str

The event type column name (must be a column in df). A right-censored sample i is indicated by event value 0, i.e. df.loc[i, event_type_col] = 0.

'J'
duration_col str

Last follow up time column name (must be a column in df).

'X'
pid_col str

Sample ID column name (must be a column in df).

'pid'
skip_expansion boolean

Skips the dataframe expansion step. Use this option only if the provided dataframe (df) is already correctly expanded. When set to True, the df is expected to be in the format produced by the pydts.utils.get_expanded_df() method, as if it were applied to the unexpanded data.

False
x0 (Union[array, int], Optional)

initial guess to pass to scipy.optimize.minimize function

0
fit_beta_kwargs (dict, Optional)

Keyword arguments to pass on to the estimation procedure.

{}
verbose (int, Optional)

The verbosity level of pandarallel

2
nb_workers (int, Optional)

The number of workers for pandarallel. If not specified, defaults to the number of available workers.

WORKERS

Returns:

Name Type Description
event_models dict

Fitted models dictionary. Keys - event names, Values - fitted models for the event.

Source code in src/pydts/fitters.py
def fit(self,
        df: pd.DataFrame,
        covariates: Union[list, dict] = None,
        event_type_col: str = 'J',
        duration_col: str = 'X',
        pid_col: str = 'pid',
        skip_expansion: bool = False,
        x0: Union[np.array, int] = 0,
        fit_beta_kwargs: dict = {},
        verbose: int = 2,
        nb_workers: int = WORKERS) -> dict:
    """
    This method fits a model to the discrete data.

    Args:
        df (pd.DataFrame): training data for fitting the model
        covariates (list): list of covariates to be used in estimating the regression coefficients
        event_type_col (str): The event type column name (must be a column in df), Right-censored sample (i) is indicated by event value 0, df.loc[i, event_type_col] = 0.
        duration_col (str): Last follow up time column name (must be a column in df).
        pid_col (str): Sample ID column name (must be a column in df).
        skip_expansion (boolean): Skips the dataframe expansion step. Use this option only if the provided dataframe (df) is already correctly expanded. When set to True, the df is expected to be in the format produced by the pydts.utils.get_expanded_df() method, as if it were applied to the unexpanded data.
        x0 (Union[numpy.array, int], Optional): initial guess to pass to scipy.optimize.minimize function
        fit_beta_kwargs (dict, Optional): Keyword arguments to pass on to the estimation procedure.
        verbose (int, Optional): The verbosity level of pandarallel
        nb_workers (int, Optional): The number of workers for pandarallel. If not specified, defaults to the number of available workers.

    Returns:
        event_models (dict): Fitted models dictionary. Keys - event names, Values - fitted models for the event.
    """

    self._validate_cols(df, event_type_col, duration_col, pid_col)
    self.events = [c for c in sorted(df[event_type_col].unique()) if c != 0]
    if (covariates is not None):
        cov_not_in_df = []
        if isinstance(covariates, list):
            cov_not_in_df = [cov for cov in covariates if cov not in df.columns]
        elif isinstance(covariates, dict):
            for event in self.events:
                event_cov_not_in_df = [cov for cov in covariates[event] if cov not in df.columns]
                cov_not_in_df.extend(event_cov_not_in_df)
        if len(cov_not_in_df) > 0:
            raise ValueError(f"Error during fit - missing covariates from df: {cov_not_in_df}")

    #pandarallel.initialize(verbose=verbose, nb_workers=nb_workers)
    if covariates is None:
        covariates = [col for col in df if col not in [event_type_col, duration_col, pid_col]]
    self.covariates = covariates
    self.event_type_col = event_type_col
    self.duration_col = duration_col
    self.pid_col = pid_col
    self.times = sorted(df[duration_col].unique())

    if not skip_expansion:
        expanded_df = self._expand_data(df=df, event_type_col=event_type_col, duration_col=duration_col,
                                             pid_col=pid_col)
    else:
        print('Skipping data expansion step, only use this option if the provided dataframe (df) is already correctly expanded.')
        expanded_df = df

    self.beta_models = self._fit_beta(expanded_df, self.events, **fit_beta_kwargs)

    y_t = (df[duration_col]
           .value_counts()
           .sort_index(ascending=False)  # each event count for its occurring time and the times before
           .cumsum()
           .sort_index()
           )
    n_jt = df.groupby([event_type_col, duration_col]).size().to_frame().reset_index()
    n_jt.columns = [event_type_col, duration_col, 'n_jt']

    for event in self.events:

        n_et = n_jt[n_jt[event_type_col] == event].copy()

        if isinstance(self.beta_models[event], CoxPHFitter):
            self.beta_models_params_attr = 'params_'
            _res = Parallel(n_jobs=nb_workers)(delayed(minimize)(self._alpha_jt, x0=x0,
                                                                 args=(df, y_t.loc[row[duration_col]],
                                                                       getattr(self.beta_models[event],
                                                                               self.beta_models_params_attr),
                                                                       row['n_jt'],
                                                                       row[duration_col], event),
                                                                 method='BFGS',
                                                                 options={'gtol': 1e-7, 'eps': 1.5e-08,
                                                                          'maxiter': 200})
                                                                 for _, row in n_et.iterrows())
            n_et['success'] = Parallel(n_jobs=nb_workers)(delayed(lambda row: row.success)(val)
                                                          for val in _res)
            n_et['alpha_jt'] = Parallel(n_jobs=nb_workers)(delayed(lambda row: row.x[0])(val)
                                                           for val in _res)

        elif isinstance(self.beta_models[event], ConditionalResultsWrapper) or \
                isinstance(self.beta_models[event], RegularizedResultsWrapper):
            self.beta_models_params_attr = 'params'
            for idx, row in n_et.iterrows():
                _res = minimize(self._alpha_jt,
                                x0=x0,
                                args=(df,
                                      y_t.loc[row[duration_col]],
                                      getattr(self.beta_models[event], self.beta_models_params_attr),
                                      row['n_jt'],
                                      row[duration_col],
                                      event),
                                method='BFGS',
                                options={'gtol': 1e-7, 'eps': 1.5e-08, 'maxiter': 200})
                n_et.loc[idx, 'success'] = _res.success
                n_et.loc[idx, 'alpha_jt'] = _res.x[0]
        else:
            raise ValueError

        # n_et['opt_res'] = n_et.parallel_apply(lambda row: minimize(self._alpha_jt, x0=x0,
        #                         args=(df, y_t.loc[row[duration_col]], event_beta_params, row['n_jt'],
        #                         row[duration_col], event), method='BFGS',
        #                         options={'gtol': 1e-7, 'eps': 1.5e-08, 'maxiter': 200}), axis=1)
        # n_et['success'] = n_et['opt_res'].parallel_apply(lambda val: val.success)
        # n_et['alpha_jt'] = n_et['opt_res'].parallel_apply(lambda val: val.x[0])

        assert_fit(n_et, self.times[:-1], event_type_col=event_type_col, duration_col=duration_col)  # todo move basic input validation before any optimization
        self.event_models[event] = [self.beta_models[event], n_et]
        self.alpha_df = pd.concat([self.alpha_df, n_et], ignore_index=True)
    return self.event_models
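
Event-specific covariate sets can be passed as a dict keyed by event; the column names below are illustrative and continue the earlier sketch:

# Hypothetical: use Z1 and Z2 for event 1, but only Z2 for event 2.
covariates = {1: ['Z1', 'Z2'], 2: ['Z2']}
event_models = TwoStagesFitterExact().fit(
    df=df,
    covariates=covariates,
    event_type_col='J',
    duration_col='X',
    pid_col='pid',
)
# event_models maps each event to [fitted beta model, alpha_jt dataframe for that event].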

get_alpha_df() ¤

This function returns the Alpha coefficients for all the events.

Returns:

Name Type Description
alpha_df DataFrame

Alpha coefficients Dataframe

Source code in src/pydts/fitters.py
def get_alpha_df(self):
    """
    This function returns the Alpha coefficients for all the events.

    Returns:
        alpha_df (pandas.DataFrame): Alpha coefficients Dataframe
    """

    alpha_df = pd.DataFrame()
    for event, model in self.event_models.items():
        model_alpha_df = model[1].set_index([self.event_type_col, self.duration_col])
        model_alpha_df.columns = pd.MultiIndex.from_product([[event], model_alpha_df.columns])
        alpha_df = pd.concat([alpha_df, model_alpha_df], axis=1)

    return alpha_df
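
For example, continuing the earlier sketch:

alpha_df = fitter.get_alpha_df()
# Columns form a MultiIndex of (event, column name); rows are indexed by
# (event_type_col, duration_col).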

get_beta_SE() ¤

This function returns the Beta coefficients and their Standard Errors for all the events.

Returns:

Name Type Description
se_df DataFrame

Beta coefficients and Standard Errors Dataframe

Source code in src/pydts/fitters.py
def get_beta_SE(self):
    """
    This function returns the Beta coefficients and their Standard Errors for all the events.

    Returns:
        se_df (pandas.DataFrame): Beta coefficients and Standard Errors Dataframe
    """

    full_table = pd.DataFrame()
    for event in self.events:
        if isinstance(self.beta_models[event], RegularizedResultsWrapper):
            _p = self.beta_models[event].params.copy()
            _p.name = 'coef'
            full_table = pd.concat([full_table,
                                    pd.concat([_p], keys=[event], axis=1)],
                                   axis=1)
        else:
            summary = self.beta_models[event].summary()
            summary_df = pd.DataFrame([x.split(',') for x in summary.tables[1].as_csv().split('\n')])
            summary_df.columns = summary_df.iloc[0]
            summary_df = summary_df.iloc[1:].set_index(summary_df.columns[0])
            summary_df.columns = pd.MultiIndex.from_product([[event], summary_df.columns])
            full_table = pd.concat([full_table, summary_df.iloc[-len(self.covariates):]], axis=1)
    return full_table
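
For example, continuing the earlier sketch:

se_df = fitter.get_beta_SE()
# One column block per event; for unregularized fits it contains the coefficients and
# standard errors parsed from the statsmodels summary table.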

plot_all_events_alpha(ax=None, scatter_kwargs={}, colors=COLORS, show=True, title=None, xlabel='t', ylabel='$\\alpha_{jt}$', fontsize=18, ticklabelsize=15) ¤

This function plots a scatter plot of the $ \alpha_{jt} $ coefficients of all the events.

Parameters:

Name Type Description Default
ax (Axes, Optional)

ax to use

None
scatter_kwargs (dict, Optional)

keywords to pass to the scatter function

{}
colors (list, Optional)

colors names

COLORS
show (bool, Optional)

whether to call plt.show()

True
title (str, Optional)

axes title

None
xlabel (str, Optional)

axes xlabel

't'
ylabel (str, Optional)

axes ylabel

'$\\alpha_{jt}$'
fontsize (int, Optional)

axes title, xlabel, ylabel fontsize

18

Returns:

Name Type Description
ax Axes

output figure

Source code in src/pydts/fitters.py
def plot_all_events_alpha(self, ax: plt.Axes = None, scatter_kwargs: dict = {}, colors: list = COLORS,
                          show: bool = True, title: Union[str, None] = None, xlabel: str = 't',
                          ylabel: str = r'$\alpha_{jt}$', fontsize: int = 18, ticklabelsize: int = 15) -> plt.Axes:
    """
    This function plots a scatter plot of the $ \alpha_{jt} $ coefficients of all the events.

    Args:
        ax (matplotlib.pyplot.Axes, Optional): ax to use
        scatter_kwargs (dict, Optional): keywords to pass to the scatter function
        colors (list, Optional): colors names
        show (bool, Optional): whether to call plt.show()
        title (str, Optional): axes title
        xlabel (str, Optional): axes xlabel
        ylabel (str, Optional): axes ylabel
        fontsize (int, Optional): axes title, xlabel, ylabel fontsize

    Returns:
        ax (matplotlib.pyplot.Axes): output figure
    """
    if ax is None:
        fig, ax = plt.subplots(1, 1)
    ax.tick_params(axis='both', which='major', labelsize=ticklabelsize)
    ax.tick_params(axis='both', which='minor', labelsize=ticklabelsize)
    title = r'$\alpha_{jt}$' + f' for all events' if title is None else title
    for idx, (event, model) in enumerate(self.event_models.items()):
        label = f'{event}'
        color = colors[idx % len(colors)]
        self.plot_event_alpha(event=event, ax=ax, scatter_kwargs=scatter_kwargs, show=False, title=title,
                              ylabel=ylabel, xlabel=xlabel, fontsize=fontsize, label=label, color=color,
                              ticklabelsize=ticklabelsize)
    ax.legend()
    if show:
        plt.show()
    return ax

plot_all_events_beta(ax=None, colors=COLORS, show=True, title=None, xlabel='Value', ylabel='$\\beta_{j}$', fontsize=18, ticklabelsize=15) ¤

This function plots the $ \beta_{j} $ coefficients and standard errors of all the events.

Parameters:

Name Type Description Default
ax (Axes, Optional)

ax to use

None
colors (list, Optional)

colors names

COLORS
show (bool, Optional)

whether to call plt.show()

True
title (str, Optional)

axes title

None
xlabel (str, Optional)

axes xlabel

'Value'
ylabel (str, Optional)

axes ylabel

'$\\beta_{j}$'
fontsize (int, Optional)

axes title, xlabel, ylabel fontsize

18
ticklabelsize (int, Optional)

axes xticklabels, yticklabels fontsize

15

Returns:

Name Type Description
ax Axes

output figure

Source code in src/pydts/fitters.py
def plot_all_events_beta(self, ax: plt.Axes = None, colors: list = COLORS, show: bool = True,
                         title: Union[str, None] = None, xlabel: str = 'Value',  ylabel: str = r'$\beta_{j}$',
                         fontsize: int = 18, ticklabelsize: int = 15) -> plt.Axes:
    """
    This function plots the $ \beta_{j} $ coefficients and standard errors of all the events.

    Args:
        ax (matplotlib.pyplot.Axes, Optional): ax to use
        colors (list, Optional): colors names
        show (bool, Optional): whether to call plt.show()
        title (str, Optional): axes title
        xlabel (str, Optional): axes xlabel
        ylabel (str, Optional): axes ylabel
        fontsize (int, Optional): axes title, xlabel, ylabel fontsize
        ticklabelsize (int, Optional): axes xticklabels, yticklabels fontsize

    Returns:
        ax (matplotlib.pyplot.Axes): output figure
    """
    if ax is None:
        fig, ax = plt.subplots(1, 1)
    title = r'$\beta_{j}$' + f' for all events' if title is None else title
    ax.tick_params(axis='both', which='major', labelsize=ticklabelsize)
    ax.tick_params(axis='both', which='minor', labelsize=ticklabelsize)
    se_df = self.get_beta_SE()

    for idx, col in enumerate(se_df.columns):
        if idx % 2 == 1:
            continue
        y = np.arange((idx//2)*len(se_df), (1+(idx//2))*len(se_df))
        ax.errorbar(x=se_df.iloc[:, idx].values, y=y,
                   color=colors[idx % len(colors)], xerr=se_df.iloc[:, idx+1].values, label=f'{col}',
                   markersize=6, ls='', marker='o')

    yt = list(se_df.index) * (len(se_df.columns) // 2)
    ax.set_yticks(np.arange(0, len(yt)))
    ax.set_yticklabels(yt)
    ax.set_title(title, fontsize=fontsize)
    ax.set_xlabel(xlabel, fontsize=fontsize)
    ax.set_ylabel(ylabel, fontsize=fontsize)
    ax.grid()
    plt.gca().invert_yaxis()
    ax.legend()
    if show:
        plt.show()
    return ax
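
A plotting sketch, assuming a fitted fitter as in the earlier example:

import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
fitter.plot_all_events_alpha(ax=axes[0], show=False)
fitter.plot_all_events_beta(ax=axes[1], show=False)
fig.tight_layout()
plt.show()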

plot_event_alpha(event, ax=None, scatter_kwargs={}, show=True, title=None, xlabel='t', ylabel='$\\alpha_{jt}$', fontsize=18, color=None, label=None, ticklabelsize=15) ¤

This function plots a scatter plot of the $ \alpha_{jt} $ coefficients of a specific event.

Parameters:

Name Type Description Default
event Union[str, int]

event name

required
ax (Axes, Optional)

ax to use

None
scatter_kwargs (dict, Optional)

keywords to pass to the scatter function

{}
show (bool, Optional)

whether to call plt.show()

True
title (str, Optional)

axes title

None
xlabel (str, Optional)

axes xlabel

't'
ylabel (str, Optional)

axes ylabel

'$\\alpha_{jt}$'
fontsize (int, Optional)

axes title, xlabel, ylabel fontsize

18
color (str, Optional)

color name to use

None
label (str, Optional)

label name

None

Returns:

Name Type Description
ax Axes

output figure

Source code in src/pydts/fitters.py
def plot_event_alpha(self, event: Union[str, int], ax: plt.Axes = None, scatter_kwargs: dict = {},
                     show=True, title=None, xlabel='t', ylabel=r'$\alpha_{jt}$', fontsize=18,
                     color: str = None, label: str = None, ticklabelsize: int = 15) -> plt.Axes:
    """
    This function plots a scatter plot of the $ \alpha_{jt} $ coefficients of a specific event.

    Args:
        event (Union[str, int]): event name
        ax (matplotlib.pyplot.Axes, Optional): ax to use
        scatter_kwargs (dict, Optional): keywords to pass to the scatter function
        show (bool, Optional): whether to call plt.show()
        title (str, Optional): axes title
        xlabel (str, Optional): axes xlabel
        ylabel (str, Optional): axes ylabel
        fontsize (int, Optional): axes title, xlabel, ylabel fontsize
        color (str, Optional): color name to use
        label (str, Optional): label name

    Returns:
        ax (matplotlib.pyplot.Axes): output figure
    """

    assert event in self.events, f"Cannot plot event {event} alpha - it was not included during .fit()"

    if ax is None:
        fig, ax = plt.subplots(1, 1)
    ax.tick_params(axis='both', which='major', labelsize=ticklabelsize)
    ax.tick_params(axis='both', which='minor', labelsize=ticklabelsize)
    title = r'$\alpha_{jt}$' + f' for event {event}' if title is None else title
    label = f'{event}' if label is None else label
    color = 'tab:blue' if color is None else color
    alpha_df = self.event_models[event][1]
    ax.scatter(alpha_df[self.duration_col].values, alpha_df['alpha_jt'].values, label=label,
               color=color, **scatter_kwargs)
    ax.set_title(title, fontsize=fontsize)
    ax.set_xlabel(xlabel, fontsize=fontsize)
    ax.set_ylabel(ylabel, fontsize=fontsize)
    if show:
        plt.show()
    return ax

predict(df, **kwargs) ¤

Source code in src/pydts/base_fitters.py
def predict(self, df: pd.DataFrame, **kwargs) -> pd.DataFrame:
    raise NotImplementedError

predict_cumulative_incident_function(df) ¤

This function adds columns of the predicted hazard function, overall survival, probabilities of event occurrence, and cumulative incidence function (CIF) to the given dataframe.

Parameters:

Name Type Description Default
df DataFrame

dataframe with covariates columns included

required

Returns:

Name Type Description
df DataFrame

dataframe with additional prediction columns

Source code in src/pydts/base_fitters.py
def predict_cumulative_incident_function(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    This function adds columns of the predicted hazard function, overall survival, probabilities of event occurrence,
    and cumulative incidence function (CIF) to the given dataframe.

    Args:
        df (pandas.DataFrame): dataframe with covariates columns included

    Returns:
        df (pandas.DataFrame): dataframe with additional prediction columns

    """
    self._validate_covariates_in_df(df.head())

    for event in self.events:
        if f'cif_j{event}_at_t{self.times[-2]}' not in df.columns:
            df = self.predict_event_cumulative_incident_function(df=df, event=event)
    return df
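
A prediction sketch, reusing the covariate columns from the earlier illustrative example:

new_df = df[['Z1', 'Z2']].head()
pred_df = fitter.predict_cumulative_incident_function(new_df)
# Adds hazard_j{j}_t{t}, overall_survival_t{t}, prob_j{j}_at_t{t} and cif_j{j}_at_t{t}
# columns for every event j and every time t seen during fit (except the last one).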

predict_event_cumulative_incident_function(df, event) ¤

This function adds, for a specific event, columns of the predicted hazard function, overall survival, probabilities of event occurrence, and cumulative incidence function (CIF) to the given dataframe.

Parameters:

Name Type Description Default
df DataFrame

dataframe with covariates columns included

required
event Union[str, int]

event name

required

Returns:

Name Type Description
df DataFrame

dataframe with additional prediction columns

Source code in src/pydts/base_fitters.py
def predict_event_cumulative_incident_function(self, df: pd.DataFrame, event: Union[str, int]) -> pd.DataFrame:
    """
    This function adds, for a specific event, columns of the predicted hazard function, overall survival,
    probabilities of event occurrence, and cumulative incidence function (CIF) to the given dataframe.

    Args:
        df (pandas.DataFrame): dataframe with covariates columns included
        event (Union[str, int]): event name

    Returns:
        df (pandas.DataFrame): dataframe with additional prediction columns

    """
    assert event in self.events, \
        f"Cannot predict for event {event} - it was not included during .fit()"
    self._validate_covariates_in_df(df.head())

    if f'prob_j{event}_at_t{self.times[-2]}' not in df.columns:
        df = self.predict_prob_events(df=df)
    cols = [f'prob_j{event}_at_t{t}' for t in self.times[:-1]]
    cif_df = df[cols].cumsum(axis=1)
    cif_df.columns = [f'cif_j{event}_at_t{t}' for t in self.times[:-1]]
    df = pd.concat([df, cif_df], axis=1)
    return df

predict_hazard_all(df) ¤

This function calculates the hazard for all the events at all time values included in the training set for each event.

Parameters:

Name Type Description Default
df DataFrame

samples to predict for

required

Returns:

Name Type Description
df DataFrame

samples with the prediction columns

Source code in src/pydts/base_fitters.py
def predict_hazard_all(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    This function calculates the hazard for all the events at all time values included in the training set for each
    event.

    Args:
        df (pd.DataFrame): samples to predict for

    Returns:
        df (pd.DataFrame): samples with the prediction columns

    """
    self._validate_covariates_in_df(df.head())
    df = self.predict_hazard_t(df, t=self.times[:-1])
    return df

predict_hazard_jt(df, event, t) ¤

This method calculates the hazard for the given event at the given time values if they were included in the training set of the event.

Parameters:

Name Type Description Default
df DataFrame

samples to predict for

required
event Union[str, int]

event name

required
t Union[Iterable, int]

times to calculate the hazard for

required

Returns:

Name Type Description
df DataFrame

samples with the prediction columns

Source code in src/pydts/fitters.py
def predict_hazard_jt(self,
                      df: pd.DataFrame,
                      event: Union[str, int],
                      t: Union[Iterable, int]) -> pd.DataFrame:
    """
    This method calculates the hazard for the given event at the given time values if they were included in the training set of the event.

    Args:
        df (pd.DataFrame): samples to predict for
        event (Union[str, int]): event name
        t (Union[Iterable, int]): times to calculate the hazard for

    Returns:
        df (pd.DataFrame): samples with the prediction columns
    """
    self._validate_covariates_in_df(df.head())
    t = self._validate_t(t, return_iter=True)
    assert event in self.events, \
        f"Cannot predict for event {event} - it was not included during .fit()"

    model = self.event_models[event]
    alpha_df = model[1].set_index(self.duration_col)['alpha_jt'].copy()

    _t = np.array([t_i for t_i in t if (f'hazard_j{event}_t{t_i}' not in df.columns)])
    if len(_t) == 0:
        return df
    temp_df = df.copy()
    if isinstance(self.covariates, list):
        beta_j_x = temp_df[self.covariates].dot(getattr(model[0], self.beta_models_params_attr))
    elif isinstance(self.covariates, dict):
        beta_j_x = temp_df[self.covariates[event]].dot(getattr(model[0], self.beta_models_params_attr))
    temp_df[[f'hazard_j{event}_t{c}' for c in _t]] = pd.concat(
        [self._hazard_inverse_transformation(alpha_df[c] + beta_j_x) for c in _t], axis=1).values
    return temp_df
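
For example, continuing the earlier sketch:

hazards_df = fitter.predict_hazard_jt(df=new_df, event=1, t=[1, 2, 3])
# Adds hazard_j1_t1, hazard_j1_t2 and hazard_j1_t3, each computed as
# expit(alpha_1t + Z^T beta_1) per row.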

predict_hazard_t(df, t) ¤

This function calculates the hazard for all the events at the requested time values if they were included in the training set of each event.

Parameters:

Name Type Description Default
df DataFrame

samples to predict for

required
t (int, array)

times to calculate the hazard for

required

Returns:

Name Type Description
df DataFrame

samples with the prediction columns

Source code in src/pydts/base_fitters.py
def predict_hazard_t(self, df: pd.DataFrame, t: Union[int, np.array]) -> pd.DataFrame:
    """
    This function calculates the hazard for all the events at the requested time values if they were included in
    the training set of each event.

    Args:
        df (pd.DataFrame): samples to predict for
        t (int, np.array): times to calculate the hazard for

    Returns:
        df (pd.DataFrame): samples with the prediction columns
    """
    t = self._validate_t(t)
    self._validate_covariates_in_df(df.head())

    for event, model in self.event_models.items():
        df = self.predict_hazard_jt(df=df, event=event, t=t)
    return df

predict_marginal_prob_all_events(df) ¤

This function calculates the marginal probability per event given the covariates for all the events.

Parameters:

Name Type Description Default
df DataFrame

dataframe with covariates columns included

required

Returns:

Name Type Description
df DataFrame

dataframe with additional prediction columns

Source code in src/pydts/base_fitters.py
def predict_marginal_prob_all_events(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    This function calculates the marginal probability per event given the covariates for all the events.

    Args:
        df (pandas.DataFrame): dataframe with covariates columns included

    Returns:
        df (pandas.DataFrame): dataframe with additional prediction columns
    """
    self._validate_covariates_in_df(df.head())
    for event in self.events:
        df = self.predict_marginal_prob_event_j(df=df, event=event)
    return df
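
For example, continuing the earlier sketch:

marg_df = fitter.predict_marginal_prob_all_events(new_df)
# Adds one marginal_prob_j{j} column per event, obtained by summing prob_j{j}_at_t{t}
# over all fitted times (except the last one).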

predict_marginal_prob_event_j(df, event) ¤

This function calculates the marginal probability of an event given the covariates.

Parameters:

Name Type Description Default
df DataFrame

dataframe with covariates columns included

required
event Union[str, int]

event name

required

Returns:

Name Type Description
df DataFrame

dataframe with additional prediction columns

Source code in src/pydts/base_fitters.py
def predict_marginal_prob_event_j(self, df: pd.DataFrame, event: Union[str, int]) -> pd.DataFrame:
    """
    This function calculates the marginal probability of an event given the covariates.

    Args:
        df (pandas.DataFrame): dataframe with covariates columns included
        event (Union[str, int]): event name

    Returns:
        df (pandas.DataFrame): dataframe with additional prediction columns
    """

    assert event in self.events, \
        f"Cannot predict for event {event} - it was not included during .fit()"
    self._validate_covariates_in_df(df.head())

    if f'prob_j{event}_at_t{self.times[-2]}' not in df.columns:
        df = self.predict_prob_event_j_all(df=df, event=event)
    cols = [f'prob_j{event}_at_t{_t}' for _t in self.times[:-1]]
    marginal_prob = df[cols].sum(axis=1)
    marginal_prob.name = f'marginal_prob_j{event}'
    return pd.concat([df, marginal_prob], axis=1)

predict_overall_survival(df, t=None, return_hazards=False) ¤

This function adds columns of the overall survival until time t.

Parameters:

Name Type Description Default
df DataFrame

dataframe with covariates columns

required
t (int, Optional)

time

None
return_hazards (bool, Optional)

whether to keep the hazard columns

False

Returns:

Name Type Description
df DataFrame

dataframe with the additional overall survival columns

Source code in src/pydts/base_fitters.py
def predict_overall_survival(self,
                             df: pd.DataFrame,
                             t: int = None,
                             return_hazards: bool = False) -> pd.DataFrame:
    """
    This function adds columns of the overall survival until time t.
    Args:
        df (pandas.DataFrame): dataframe with covariates columns
        t (int): time
        return_hazards (bool): if to keep the hazard columns

    Returns:
        df (pandas.DataFrame): dataframe with the additional overall survival columns

    """
    if t is not None:
        self._validate_t(t, return_iter=False)
    self._validate_covariates_in_df(df.head())

    all_hazards = self.predict_hazard_all(df)
    _times = self.times[:-1] if t is None else [_t for _t in self.times[:-1] if _t <= t]
    overall = pd.DataFrame()
    for t_i in _times:
        cols = [f'hazard_j{e}_t{t_i}' for e in self.events]
        t_i_hazard = 1 - all_hazards[cols].sum(axis=1)
        t_i_hazard.name = f'overall_survival_t{t_i}'
        overall = pd.concat([overall, t_i_hazard], axis=1)
    overall = pd.concat([df, overall.cumprod(axis=1)], axis=1)

    if return_hazards:
        cols = all_hazards.columns[all_hazards.columns.str.startswith("hazard_")]
        cols = cols.difference(overall.columns)
        if len(cols) > 0:
            overall = pd.concat([overall, all_hazards[cols]], axis=1)
    return overall
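
For example, continuing the earlier sketch:

surv_df = fitter.predict_overall_survival(new_df, t=3)
# Adds overall_survival_t1, overall_survival_t2 and overall_survival_t3, i.e. the cumulative
# product over t of (1 - sum of the event-specific hazards at t).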

predict_prob_event_j_all(df, event) ¤

This function adds columns of a specific event's occurrence probabilities.

Parameters:

Name Type Description Default
df DataFrame

dataframe with covariates columns

required
event Union[str, int]

event name

required

Returns:

Name Type Description
df DataFrame

dataframe with probabilities columns

Source code in src/pydts/base_fitters.py
def predict_prob_event_j_all(self, df: pd.DataFrame, event: Union[str, int]) -> pd.DataFrame:
    """
    This function adds columns of a specific event's occurrence probabilities.

    Args:
        df (pandas.DataFrame): dataframe with covariates columns
        event (Union[str, int]): event name

    Returns:
        df (pandas.DataFrame): dataframe with probabilities columns

    """
    assert event in self.events, \
        f"Cannot predict for event {event} - it was not included during .fit()"
    self._validate_covariates_in_df(df.head())

    if f'overall_survival_t{self.times[-2]}' not in df.columns:
        df = self.predict_overall_survival(df, return_hazards=True)
    for t in self.times[:-1]:
        if f'prob_j{event}_at_t{t}' not in df.columns:
            df = self.predict_prob_event_j_at_t(df=df, event=event, t=t)
    return df

predict_prob_event_j_at_t(df, event, t) ¤

This function adds a column with the probability of occurrence of a specific event at a specific time.

Parameters:

Name Type Description Default
df DataFrame

dataframe with covariates columns

required
event Union[str, int]

event name

required
t int

time

required

Returns:

Name Type Description
df DataFrame

dataframe with an additional probability column

Source code in src/pydts/base_fitters.py
def predict_prob_event_j_at_t(self, df: pd.DataFrame, event: Union[str, int], t: int) -> pd.DataFrame:
    """
    This function adds a column with the probability of occurrence of a specific event at a specific time.

    Args:
        df (pandas.DataFrame): dataframe with covariates columns
        event (Union[str, int]): event name
        t (int): time

    Returns:
        df (pandas.DataFrame): dataframe with an additional probability column

    """
    assert event in self.events, \
        f"Cannot predict for event {event} - it was not included during .fit()"
    self._validate_t(t, return_iter=False)
    self._validate_covariates_in_df(df.head())

    if f'prob_j{event}_at_t{t}' not in df.columns:
        if t == 1:
            if f'hazard_j{event}_t{t}' not in df.columns:
                df = self.predict_hazard_jt(df=df, event=event, t=t)
            df[f'prob_j{event}_at_t{t}'] = df[f'hazard_j{event}_t{t}']
            return df
        elif not f'overall_survival_t{t - 1}' in df.columns:
            df = self.predict_overall_survival(df, t=t, return_hazards=True)
        elif not f'hazard_j{event}_t{t}' in df.columns:
            df = self.predict_hazard_t(df, t=np.array([_t for _t in self.times[:-1] if _t <= t]))
        df[f'prob_j{event}_at_t{t}'] = df[f'overall_survival_t{t - 1}'] * df[f'hazard_j{event}_t{t}']
    return df

predict_prob_events(df) ¤

This function adds columns of all the events' occurrence probabilities.

Parameters:

Name Type Description Default
df DataFrame

dataframe with covariates columns

required

Returns:

Name Type Description
df DataFrame

dataframe with probabilities columns

Source code in src/pydts/base_fitters.py
def predict_prob_events(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    This function adds columns of all the events' occurrence probabilities.
    Args:
        df (pandas.DataFrame): dataframe with covariates columns

    Returns:
        df (pandas.DataFrame): dataframe with probabilities columns

    """
    self._validate_covariates_in_df(df.head())

    for event in self.events:
        df = self.predict_prob_event_j_all(df=df, event=event)
    return df

print_summary(summary_func='print_summary', summary_kwargs={}) ¤

This method prints the summary of the fitted models for all the events.

Parameters:

Name Type Description Default
summary_func (str, Optional)

print summary method of the fitted model type ("summary", "print_summary").

'print_summary'
summary_kwargs (dict, Optional)

Keyword arguments to pass to the model summary function.

{}

Returns:

Type Description
None

None

Source code in src/pydts/fitters.py
def print_summary(self,
                  summary_func: str = "print_summary",
                  summary_kwargs: dict = {}) -> None:
    """
    This method prints the summary of the fitted models for all the events.

    Args:
        summary_func (str, Optional): print summary method of the fitted model type ("summary", "print_summary").
        summary_kwargs (dict, Optional): Keyword arguments to pass to the model summary function.

    Returns:
        None
    """
    from IPython.display import display
    display(self.get_beta_SE())

    for event, model in self.event_models.items():
        print(f'\n\nModel summary for event: {event}')
        display(model[1].set_index([self.event_type_col, self.duration_col]))
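
For example, continuing the earlier sketch:

fitter.print_summary()
# Displays the beta coefficients table followed by the per-event alpha_jt tables.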