The Two Stages Procedure of Meir and Gorfine (2023) - Efron

`pydts.fitters.TwoStagesFitter()` ¤

Bases: ExpansionBasedFitter

This class implements the two-stage estimation approach of Meir et al. (2022).

Source code in src/pydts/fitters.py

def __init__(self):
    """Create an unfitted fitter with empty alpha/beta containers."""
    super().__init__()
    # Accumulates the per-event alpha_jt tables produced by fit().
    self.alpha_df = pd.DataFrame()
    # Filled by fit() with the result of _fit_beta().
    self.beta_models = {}
    # Name of the attribute holding the coefficients on a fitted beta model;
    # fit() switches it between 'params_' and 'params' depending on the model type.
    self.beta_models_params_attr = 'params_'

`alpha_df = pd.DataFrame()` `instance-attribute` ¤

`beta_models = {}` `instance-attribute` ¤

`beta_models_params_attr = 'params_'` `instance-attribute` ¤

`covariates = None` `instance-attribute` ¤

`duration_col = None` `instance-attribute` ¤

`event_models = {}` `instance-attribute` ¤

`event_type_col = None` `instance-attribute` ¤

`events = None` `instance-attribute` ¤

`expanded_df = pd.DataFrame()` `instance-attribute` ¤

`formula = None` `instance-attribute` ¤

`pid_col = None` `instance-attribute` ¤

`times = None` `instance-attribute` ¤

`_alpha_jt(x, df, y_t, beta_j, n_jt, t, event)` ¤

Source code in src/pydts/fitters.py

def _alpha_jt(self, x, df, y_t, beta_j, n_jt, t, event):
    """
    Objective minimized to estimate alpha_jt for one (event, time) pair.

    Args:
        x: candidate alpha_jt value (added to the linear predictor).
        df: data containing the duration column and the covariates.
        y_t: normalizing count used for both terms of the objective.
        beta_j: coefficients multiplied with the covariates of `event`.
        n_jt: count subtracted (after normalizing by y_t) from the mean predicted hazard.
        t: time value; only rows with duration >= t are used.
        event: event name, used to select covariates when self.covariates is a dict.

    Returns:
        The squared difference between the normalized sum of expit(x + Z beta_j) and n_jt / y_t.

    Raises:
        ValueError: if self.covariates is neither a list nor a dict.
    """
    still_at_risk = df[df[self.duration_col] >= t]

    if isinstance(self.covariates, list):
        event_covariates = self.covariates
    elif isinstance(self.covariates, dict):
        event_covariates = self.covariates[event]
    else:
        raise ValueError

    linear_predictor = np.dot(still_at_risk[event_covariates], beta_j)
    hazard_sum = np.sum(expit(x + linear_predictor))
    # Keep the exact operation order so the optimizer sees identical floating point values.
    return ((1 / y_t) * hazard_sum - (n_jt / y_t)) ** 2

`_expand_data(df, event_type_col, duration_col, pid_col)` ¤

This method expands the raw data as explained in Lee et al. 2018

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	Dataframe to expand.	required
`event_type_col`	`str`	The event type column name (must be a column in df), Right censored sample (i) is indicated by event value 0, df.loc[i, event_type_col] = 0.	required
`duration_col`	`str`	Last follow up time column name (must be a column in df).	required
`pid_col`	`str`	Sample ID column name (must be a column in df).	required

Returns:

Type	Description
`DataFrame`	Expanded df (pandas.DataFrame): the expanded dataframe.

Source code in src/pydts/base_fitters.py

def _expand_data(self, df: pd.DataFrame, event_type_col: str, duration_col: str, pid_col: str) -> pd.DataFrame:
    """
    Validate the required columns and expand the raw data as explained in Lee et al. 2018.

    Args:
        df (pandas.DataFrame): Dataframe to expand.
        event_type_col (str): The event type column name (must be a column in df),
                              Right censored sample (i) is indicated by event value 0, df.loc[i, event_type_col] = 0.
        duration_col (str): Last follow up time column name (must be a column in df).
        pid_col (str): Sample ID column name (must be a column in df).

    Returns:
        Expanded df (pandas.DataFrame): the expanded dataframe, as returned by get_expanded_df.
    """
    # Fail early if any of the three required columns is absent.
    self._validate_cols(df, event_type_col, duration_col, pid_col)
    expanded = get_expanded_df(df=df,
                               event_type_col=event_type_col,
                               duration_col=duration_col,
                               pid_col=pid_col)
    return expanded

`_fit_beta(expanded_df, events, model=CoxPHFitter, model_kwargs={}, model_fit_kwargs={})` ¤

Source code in src/pydts/fitters.py

def _fit_beta(self, expanded_df, events, model=CoxPHFitter, model_kwargs={}, model_fit_kwargs={}):
    """
    Fit one beta model per event by delegating to _fit_event_beta.

    Args:
        expanded_df: expanded dataframe passed through to _fit_event_beta.
        events: iterable of event names to fit.
        model: model class instantiated for every event.
        model_kwargs: constructor keyword arguments; if any event name is a key, the dict is
            treated as per-event and indexed by event.
        model_fit_kwargs: fit keyword arguments; same per-event convention as model_kwargs.

    Returns:
        dict mapping each event to its fitted model.
    """
    # A dict is considered "per event" as soon as one of the events appears among its keys.
    init_kwargs_are_per_event = np.any([event in model_kwargs.keys() for event in events])
    fit_kwargs_are_per_event = np.any([event in model_fit_kwargs.keys() for event in events])

    return {
        event: self._fit_event_beta(
            expanded_df=expanded_df,
            event=event,
            model=model,
            model_kwargs=model_kwargs[event] if init_kwargs_are_per_event else model_kwargs,
            model_fit_kwargs=model_fit_kwargs[event] if fit_kwargs_are_per_event else model_fit_kwargs,
        )
        for event in events
    }

`_fit_event_beta(expanded_df, event, model=CoxPHFitter, model_kwargs={}, model_fit_kwargs={})` ¤

Source code in src/pydts/fitters.py

def _fit_event_beta(self, expanded_df, event, model=CoxPHFitter, model_kwargs={}, model_fit_kwargs={}):
    """
    Fit the beta model of a single event on the expanded data.

    The model is fitted with the event indicator column `j_{event}` as the event column, a
    constant column of ones as the duration column, and the real duration column as strata.

    Args:
        expanded_df: expanded dataframe containing the covariates, `j_{event}` and the duration column.
        event: event name.
        model: model class to instantiate.
        model_kwargs: keyword arguments for the model constructor.
        model_fit_kwargs: extra keyword arguments for the model's fit method.

    Returns:
        The fitted model instance.

    Raises:
        TypeError: if self.covariates is neither a list nor a dict.
    """
    if isinstance(self.covariates, list):
        event_covariates = self.covariates
    elif isinstance(self.covariates, dict):
        event_covariates = self.covariates[event]
    else:
        raise TypeError

    event_indicator = f'j_{event}'
    constant_duration = f'{self.duration_col}_copy'

    strata_df = expanded_df[event_covariates + [event_indicator, self.duration_col]].copy()
    # Every row gets the same artificial duration; the real duration only defines the strata.
    strata_df.loc[:, constant_duration] = np.ones_like(expanded_df[self.duration_col])

    beta_j_model = model(**model_kwargs)
    fit_columns = event_covariates + [f'{self.duration_col}', constant_duration, event_indicator]
    beta_j_model.fit(df=strata_df[fit_columns],
                     duration_col=constant_duration, event_col=event_indicator, strata=self.duration_col,
                     **model_fit_kwargs, batch_mode=False)
    return beta_j_model

`_hazard_inverse_transformation(a)` ¤

This function defines the inverse transformation of the hazard function such that $\lambda_j (t | Z) = h^{-1} ( \alpha_{jt} + Z^{T} \beta_{j} )$

Parameters:

Name	Type	Description	Default
`a`	`Union[int, array, Series, DataFrame]`		required

Returns:

Name	Type	Description
`i`	`Union[int, array, Series, DataFrame]`	the inverse function applied on a. $ h^{-1} (a) $

Source code in src/pydts/fitters.py

def _hazard_inverse_transformation(self, a: Union[int, np.array, pd.Series, pd.DataFrame]) -> \
        Union[int, np.array, pd.Series, pd.DataFrame]:
    r"""
    This function defines the inverse transformation of the hazard function such that $\lambda_j (t | Z) = h^{-1} ( \alpha_{jt} + Z^{T} \beta_{j} )$

    The docstring is a raw string so that the LaTeX commands (\alpha, \beta, \lambda) are kept
    verbatim instead of being interpreted as string escape sequences.

    Args:
        a (Union[int, np.array, pd.Series, pd.DataFrame]): value(s) of the linear predictor.

    Returns:
        i (Union[int, np.array, pd.Series, pd.DataFrame]): the inverse function applied on a, $ h^{-1} (a) $, computed with `expit`.
    """
    return expit(a)

`_hazard_transformation(a)` ¤

This function defines the transformation of the hazard function such that $ h ( \lambda_j (t | Z) ) = \alpha_{jt} + Z^{T} \beta_{j} $

Parameters:

Name	Type	Description	Default
`a`	`Union[int, array, Series, DataFrame]`		required

Returns:

Name	Type	Description
`i`	`Union[int, array, Series, DataFrame]`	the transformation applied on a. $ h (a) $

Source code in src/pydts/fitters.py

def _hazard_transformation(self, a: Union[int, np.array, pd.Series, pd.DataFrame]) -> \
        Union[int, np.array, pd.Series, pd.DataFrame]:
    r"""
    This function defines the transformation of the hazard function such that $ h ( \lambda_j (t | Z) ) = \alpha_{jt} + Z^{T} \beta_{j} $

    The docstring is a raw string so that the LaTeX commands (\alpha, \beta, \lambda) are kept
    verbatim instead of being interpreted as string escape sequences.

    Args:
        a (Union[int, np.array, pd.Series, pd.DataFrame]): hazard value(s) to transform.

    Returns:
        i (Union[int, np.array, pd.Series, pd.DataFrame]): the transformation applied on a, $ h (a) $, computed with `logit`.
    """
    return logit(a)

`_validate_cols(df, event_type_col, duration_col, pid_col)` ¤

Source code in src/pydts/base_fitters.py

def _validate_cols(self, df, event_type_col, duration_col, pid_col):
    """
    Assert that the event type, duration and observation ID columns exist in df.

    Raises:
        AssertionError: naming the first missing column (checked in the order event type,
            duration, observation ID).
    """
    required = (
        ('Event type', event_type_col),
        ('Duration', duration_col),
        ('Observation ID', pid_col),
    )
    for description, column in required:
        assert column in df.columns, f'{description} column is missing from df: {column}'

`_validate_covariates_in_df(df)` ¤

Source code in src/pydts/base_fitters.py

def _validate_covariates_in_df(self, df):
    """
    Assert that every covariate stored on the fitter is a column of df.

    For list covariates the list itself is checked; for dict covariates the lists of all
    events in self.events are checked. Any other covariates type is not checked at all.

    Raises:
        AssertionError: listing the covariates that are missing from df.
    """
    if isinstance(self.covariates, list):
        cov_not_fitted = [cov for cov in self.covariates if cov not in df.columns]
    elif isinstance(self.covariates, dict):
        cov_not_fitted = [cov
                          for event in self.events
                          for cov in self.covariates[event]
                          if cov not in df.columns]
    else:
        cov_not_fitted = []
    assert len(cov_not_fitted) == 0, \
        f"Cannot predict - required covariates are missing from df: {cov_not_fitted}"

`_validate_t(t, return_iter=True)` ¤

Source code in src/pydts/base_fitters.py

def _validate_t(self, t, return_iter=True):
    """
    Assert that the requested time(s) were seen during fit.

    Args:
        t: a single time value or an iterable of time values.
        return_iter: if True, return the iterable form of t (a one-element numpy array when t
            is not iterable); otherwise return t unchanged.

    Returns:
        The iterable form of t, or t itself, depending on return_iter.

    Raises:
        AssertionError: if any requested time is not in self.times.
    """
    if isinstance(t, Iterable):
        requested = t
    else:
        requested = np.array([t])
    t_i_not_fitted = [t_i for t_i in requested if (t_i not in self.times)]
    assert not t_i_not_fitted, \
        f"Cannot predict for times which were not included during .fit(): {t_i_not_fitted}"
    return requested if return_iter else t

`evaluate(test_df, oracle_col='T', **kwargs)` ¤

Source code in src/pydts/base_fitters.py

def evaluate(self, test_df: pd.DataFrame, oracle_col: str = 'T', **kwargs) -> float:
    """Not implemented here: calling this method always raises NotImplementedError."""
    raise NotImplementedError

`fit(df, covariates=None, event_type_col='J', duration_col='X', pid_col='pid', skip_expansion=False, x0=0, fit_beta_kwargs={}, verbose=2, nb_workers=WORKERS)` ¤

This method fits a model to the discrete data.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	training data for fitting the model	required
`covariates`	`list`	list of covariates to be used in estimating the regression coefficients	`None`
`event_type_col`	`str`	The event type column name (must be a column in df), Right-censored sample (i) is indicated by event value 0, df.loc[i, event_type_col] = 0.	`'J'`
`duration_col`	`str`	Last follow up time column name (must be a column in df).	`'X'`
`pid_col`	`str`	Sample ID column name (must be a column in df).	`'pid'`
`skip_expansion`	`boolean`	Skips the dataframe expansion step. Use this option only if the provided dataframe (df) is already correctly expanded. When set to True, the df is expected to be in the format produced by the pydts.utils.get_expanded_df() method, as if it were applied to the unexpanded data.	`False`
`x0`	`(Union[array, int], Optional)`	initial guess to pass to scipy.optimize.minimize function	`0`
`fit_beta_kwargs`	`(dict, Optional)`	Keyword arguments to pass on to the estimation procedure.	`{}`
`verbose`	`(int, Optional)`	The verbosity level of pandarallel	`2`
`nb_workers`	`(int, Optional)`	The number of workers to pandarallel. If not specified, defaults to the number of workers available.	`WORKERS`

Returns:

Name	Type	Description
`event_models`	`dict`	Fitted models dictionary. Keys - event names, Values - fitted models for the event.

Source code in src/pydts/fitters.py

def fit(self,
        df: pd.DataFrame,
        covariates: Union[list, dict] = None,
        event_type_col: str = 'J',
        duration_col: str = 'X',
        pid_col: str = 'pid',
        skip_expansion: bool = False,
        x0: Union[np.array, int] = 0,
        fit_beta_kwargs: dict = {},
        verbose: int = 2,
        nb_workers: int = WORKERS) -> dict:
    """
    This method fits a model to the discrete data.

    The beta coefficients of every event are estimated first (see _fit_beta). Then, for each event
    and each time at which that event was observed, alpha_jt is estimated by minimizing the
    _alpha_jt objective with the BFGS method.

    Args:
        df (pd.DataFrame): training data for fitting the model
        covariates (Union[list, dict]): covariates to be used in estimating the regression coefficients. Either a list shared by all events, or a dict mapping each event to its own list. If None, all columns of df except event_type_col, duration_col and pid_col are used.
        event_type_col (str): The event type column name (must be a column in df), Right-censored sample (i) is indicated by event value 0, df.loc[i, event_type_col] = 0.
        duration_col (str): Last follow up time column name (must be a column in df).
        pid_col (str): Sample ID column name (must be a column in df).
        skip_expansion (boolean): Skips the dataframe expansion step. Use this option only if the provided dataframe (df) is already correctly expanded. When set to True, the df is expected to be in the format produced by the pydts.utils.get_expanded_df() method, as if it were applied to the unexpanded data.
        x0 (Union[numpy.array, int], Optional): initial guess to pass to scipy.optimize.minimize function
        fit_beta_kwargs (dict, Optional): Keyword arguments to pass on to the estimation procedure.
        verbose (int, Optional): The verbosity level of pandarallel. Currently not used by this method.
        nb_workers (int, Optional): The number of parallel workers used for the alpha optimization. If not specified, defaults to the number of workers available.

    Returns:
        event_models (dict): Fitted models dictionary. Keys - event names, Values - fitted models for the event.

    Raises:
        AssertionError: if one of the required columns is missing from df.
        ValueError: if covariates are missing from df, or a fitted beta model is of an unsupported type.
    """

    self._validate_cols(df, event_type_col, duration_col, pid_col)
    self.events = [c for c in sorted(df[event_type_col].unique()) if c != 0]
    if covariates is not None:
        cov_not_in_df = []
        if isinstance(covariates, list):
            cov_not_in_df = [cov for cov in covariates if cov not in df.columns]
        elif isinstance(covariates, dict):
            for event in self.events:
                event_cov_not_in_df = [cov for cov in covariates[event] if cov not in df.columns]
                cov_not_in_df.extend(event_cov_not_in_df)
        if len(cov_not_in_df) > 0:
            raise ValueError(f"Error during fit - missing covariates from df: {cov_not_in_df}")

    if covariates is None:
        covariates = [col for col in df if col not in [event_type_col, duration_col, pid_col]]
    self.covariates = covariates
    self.event_type_col = event_type_col
    self.duration_col = duration_col
    self.pid_col = pid_col
    self.times = sorted(df[duration_col].unique())

    # Start from a clean state so that calling fit() again does not keep alpha rows or event
    # models that belong to a previous fit.
    self.event_models = {}
    self.alpha_df = pd.DataFrame()

    if not skip_expansion:
        expanded_df = self._expand_data(df=df, event_type_col=event_type_col, duration_col=duration_col,
                                        pid_col=pid_col)
    else:
        print('Skipping data expansion step, only use this option if the provided dataframe (df) is already correctly expanded.')
        expanded_df = df

    self.beta_models = self._fit_beta(expanded_df, self.events, **fit_beta_kwargs)

    y_t = (df[duration_col]
           .value_counts()
           .sort_index(ascending=False)  # each event count for its occurring time and the times before
           .cumsum()
           .sort_index()
           )
    n_jt = df.groupby([event_type_col, duration_col]).size().to_frame().reset_index()
    n_jt.columns = [event_type_col, duration_col, 'n_jt']

    minimize_options = {'gtol': 1e-7, 'eps': 1.5e-08, 'maxiter': 200}

    for event in self.events:

        n_et = n_jt[n_jt[event_type_col] == event].copy()
        beta_model = self.beta_models[event]

        if isinstance(beta_model, CoxPHFitter):
            self.beta_models_params_attr = 'params_'
            beta_params = getattr(beta_model, self.beta_models_params_attr)
            # One independent optimization per observed time of this event, run in parallel.
            _res = Parallel(n_jobs=nb_workers)(
                delayed(minimize)(self._alpha_jt, x0=x0,
                                  args=(df, y_t.loc[row[duration_col]], beta_params, row['n_jt'],
                                        row[duration_col], event),
                                  method='BFGS',
                                  options=minimize_options)
                for _, row in n_et.iterrows())
            # Reading attributes off the results is trivial work; no need for a parallel pool here.
            n_et['success'] = [res.success for res in _res]
            n_et['alpha_jt'] = [res.x[0] for res in _res]

        elif isinstance(beta_model, (ConditionalResultsWrapper, RegularizedResultsWrapper)):
            self.beta_models_params_attr = 'params'
            beta_params = getattr(beta_model, self.beta_models_params_attr)
            for idx, row in n_et.iterrows():
                _res = minimize(self._alpha_jt,
                                x0=x0,
                                args=(df,
                                      y_t.loc[row[duration_col]],
                                      beta_params,
                                      row['n_jt'],
                                      row[duration_col],
                                      event),
                                method='BFGS',
                                options=minimize_options)
                n_et.loc[idx, 'success'] = _res.success
                n_et.loc[idx, 'alpha_jt'] = _res.x[0]
        else:
            raise ValueError(f"Unsupported beta model type for event {event}: {type(beta_model).__name__}")

        assert_fit(n_et, self.times[:-1], event_type_col=event_type_col, duration_col=duration_col)  # todo move basic input validation before any optimization
        self.event_models[event] = [beta_model, n_et]
        self.alpha_df = pd.concat([self.alpha_df, n_et], ignore_index=True)
    return self.event_models

`get_alpha_df()` ¤

This function returns the Alpha coefficients for all the events.

Returns:

Name	Type	Description
`alpha_df`	`DataFrame`	Alpha coefficients Dataframe

Source code in src/pydts/fitters.py

def get_alpha_df(self):
    """
    This function returns the Alpha coefficients for all the events.

    The alpha table stored for each event is indexed by (event type, duration) and placed side
    by side with the others, under a top column level holding the event name.

    Returns:
        alpha_df (pandas.DataFrame): Alpha coefficients Dataframe
    """
    index_cols = [self.event_type_col, self.duration_col]
    combined = pd.DataFrame()
    for event, fitted in self.event_models.items():
        # fitted[1] is the per-event alpha table stored next to the beta model.
        event_alpha = fitted[1].set_index(index_cols)
        event_alpha.columns = pd.MultiIndex.from_product([[event], event_alpha.columns])
        combined = pd.concat([combined, event_alpha], axis=1)
    return combined

`get_beta_SE()` ¤

This function returns the Beta coefficients and their Standard Errors for all the events.

Returns:

Name	Type	Description
`se_df`	`DataFrame`	Beta coefficients and Standard Errors Dataframe

Source code in src/pydts/fitters.py

def get_beta_SE(self):
    """
    This function returns the Beta coefficients and their Standard Errors for all the events.

    Returns:
        se_df (pandas.DataFrame): Beta coefficients and Standard Errors Dataframe, with the columns
            j{event}_params and j{event}_SE for every event.
    """
    # NOTE(review): reads `params_` / `standard_errors_` directly, whereas fit() may set
    # beta_models_params_attr to 'params' for other model types - confirm those are supported here.
    combined = pd.DataFrame()
    for event, beta_model in self.beta_models.items():
        event_table = pd.concat([beta_model.params_, beta_model.standard_errors_], axis=1)
        event_table.columns = [f'j{event}_params', f'j{event}_SE']
        combined = pd.concat([combined, event_table], axis=1)
    return combined

`plot_all_events_alpha(ax=None, scatter_kwargs={}, colors=COLORS, show=True, title=None, xlabel='t', ylabel='$\\alpha_{jt}$', fontsize=18, ticklabelsize=15)` ¤

This function plots a scatter plot of the $ \alpha_{jt} $ coefficients of all the events.

Parameters:

Name	Type	Description	Default
`ax`	`(Axes, Optional)`	ax to use	`None`
`scatter_kwargs`	`(dict, Optional)`	keywords to pass to the scatter function	`{}`
`colors`	`(list, Optional)`	colors names	`COLORS`
`show`	`(bool, Optional)`	if to use plt.show()	`True`
`title`	`(str, Optional)`	axes title	`None`
`xlabel`	`(str, Optional)`	axes xlabel	`'t'`
`ylabel`	`(str, Optional)`	axes ylabel	`'$\\alpha_{jt}$'`
`fontsize`	`(int, Optional)`	axes title, xlabel, ylabel fontsize	`18`

Returns:

Name	Type	Description
`ax`	`Axes`	output figure

Source code in src/pydts/fitters.py

def plot_all_events_alpha(self, ax: plt.Axes = None, scatter_kwargs: dict = {}, colors: list = COLORS,
                          show: bool = True, title: Union[str, None] = None, xlabel: str = 't',
                          ylabel: str = r'$\alpha_{jt}$', fontsize: int = 18, ticklabelsize: int = 15) -> plt.Axes:
    r"""
    This function plots a scatter plot of the $ \alpha_{jt} $ coefficients of all the events.

    Each event is drawn on the same axes through plot_event_alpha, labelled by the event name
    and coloured by cycling through `colors`.

    Args:
        ax (matplotlib.pyplot.Axes, Optional): ax to use
        scatter_kwargs (dict, Optional): keywords to pass to the scatter function
        colors (list, Optional): colors names
        show (bool, Optional): if to use plt.show()
        title (str, Optional): axes title
        xlabel (str, Optional): axes xlabel
        ylabel (str, Optional): axes ylabel
        fontsize (int, Optional): axes title, xlabel, ylabel fontsize
        ticklabelsize (int, Optional): axes xticklabels, yticklabels fontsize

    Returns:
        ax (matplotlib.pyplot.Axes): output figure
    """
    if ax is None:
        _, ax = plt.subplots(1, 1)
    for tick_kind in ('major', 'minor'):
        ax.tick_params(axis='both', which=tick_kind, labelsize=ticklabelsize)
    if title is None:
        title = r'$\alpha_{jt}$' + ' for all events'
    for position, event in enumerate(self.event_models):
        self.plot_event_alpha(event=event, ax=ax, scatter_kwargs=scatter_kwargs, show=False, title=title,
                              ylabel=ylabel, xlabel=xlabel, fontsize=fontsize, label=f'{event}',
                              color=colors[position % len(colors)], ticklabelsize=ticklabelsize)
    ax.legend()
    if show:
        plt.show()
    return ax

`plot_all_events_beta(ax=None, colors=COLORS, show=True, title=None, xlabel='Value', ylabel='$\\beta_{j}$', fontsize=18, ticklabelsize=15)` ¤

This function plots the $ \beta_{j} $ coefficients and standard errors of all the events.

Parameters:

Name	Type	Description	Default
`ax`	`(Axes, Optional)`	ax to use	`None`
`colors`	`(list, Optional)`	colors names	`COLORS`
`show`	`(bool, Optional)`	if to use plt.show()	`True`
`title`	`(str, Optional)`	axes title	`None`
`xlabel`	`(str, Optional)`	axes xlabel	`'Value'`
`ylabel`	`(str, Optional)`	axes ylabel	`'$\\beta_{j}$'`
`fontsize`	`(int, Optional)`	axes title, xlabel, ylabel fontsize	`18`
`ticklabelsize`	`(int, Optional)`	axes xticklabels, yticklabels fontsize	`15`

Returns:

Name	Type	Description
`ax`	`Axes`	output figure

Source code in src/pydts/fitters.py

def plot_all_events_beta(self, ax: plt.Axes = None, colors: list = COLORS, show: bool = True,
                         title: Union[str, None] = None, xlabel: str = 'Value',  ylabel: str = r'$\beta_{j}$',
                         fontsize: int = 18, ticklabelsize: int = 15) -> plt.Axes:
    r"""
    This function plots the $ \beta_{j} $ coefficients and standard errors of all the events.

    The table returned by get_beta_SE() holds a (params, SE) column pair per event. Each pair is
    drawn as points with horizontal error bars, and the events are stacked one below the other
    along the y axis.

    Args:
        ax (matplotlib.pyplot.Axes, Optional): ax to use
        colors (list, Optional): colors names
        show (bool, Optional): if to use plt.show()
        title (str, Optional): axes title
        xlabel (str, Optional): axes xlabel
        ylabel (str, Optional): axes ylabel
        fontsize (int, Optional): axes title, xlabel, ylabel fontsize
        ticklabelsize (int, Optional): axes xticklabels, yticklabels fontsize

    Returns:
        ax (matplotlib.pyplot.Axes): output figure
    """
    if ax is None:
        fig, ax = plt.subplots(1, 1)
    title = r'$\beta_{j}$' + ' for all events' if title is None else title
    ax.tick_params(axis='both', which='major', labelsize=ticklabelsize)
    ax.tick_params(axis='both', which='minor', labelsize=ticklabelsize)
    se_df = self.get_beta_SE()
    n_coefficients = len(se_df)

    for idx, col in enumerate(se_df.columns):
        if idx % 2 == 1:
            # Odd positions hold the SE columns, which are consumed together with the params column.
            continue
        event_position = idx // 2
        y = np.arange(event_position * n_coefficients, (1 + event_position) * n_coefficients)
        ax.errorbar(x=se_df.iloc[:, idx].values, y=y,
                    color=colors[idx % len(colors)], xerr=se_df.iloc[:, idx+1].values, label=f'{col}',
                    markersize=6, ls='', marker='o')

    yt = list(se_df.index) * (len(se_df.columns) // 2)
    ax.set_yticks(np.arange(0, len(yt)))
    ax.set_yticklabels(yt)
    ax.set_title(title, fontsize=fontsize)
    ax.set_xlabel(xlabel, fontsize=fontsize)
    ax.set_ylabel(ylabel, fontsize=fontsize)
    ax.grid()
    # Invert the axes that was actually drawn on; the current pyplot axes may be a different one
    # when the caller supplies `ax`.
    ax.invert_yaxis()
    ax.legend()
    if show:
        plt.show()
    return ax

`plot_event_alpha(event, ax=None, scatter_kwargs={}, show=True, title=None, xlabel='t', ylabel='$\\alpha_{jt}$', fontsize=18, color=None, label=None, ticklabelsize=15)` ¤

This function plots a scatter plot of the $ \alpha_{jt} $ coefficients of a specific event.

Parameters:

Name	Type	Description	Default
`event`	`Union[str, int]`	event name	required
`ax`	`(Axes, Optional)`	ax to use	`None`
`scatter_kwargs`	`(dict, Optional)`	keywords to pass to the scatter function	`{}`
`show`	`(bool, Optional)`	if to use plt.show()	`True`
`title`	`(str, Optional)`	axes title	`None`
`xlabel`	`(str, Optional)`	axes xlabel	`'t'`
`ylabel`	`(str, Optional)`	axes ylabel	`'$\\alpha_{jt}$'`
`fontsize`	`(int, Optional)`	axes title, xlabel, ylabel fontsize	`18`
`color`	`(str, Optional)`	color name to use	`None`
`label`	`(str, Optional)`	label name	`None`

Returns:

Name	Type	Description
`ax`	`Axes`	output figure

Source code in src/pydts/fitters.py

def plot_event_alpha(self, event: Union[str, int], ax: plt.Axes = None, scatter_kwargs: dict = {},
                     show=True, title=None, xlabel='t', ylabel=r'$\alpha_{jt}$', fontsize=18,
                     color: str = None, label: str = None, ticklabelsize: int = 15) -> plt.Axes:
    r"""
    This function plots a scatter plot of the $ \alpha_{jt} $ coefficients of a specific event.

    Args:
        event (Union[str, int]): event name
        ax (matplotlib.pyplot.Axes, Optional): ax to use
        scatter_kwargs (dict, Optional): keywords to pass to the scatter function
        show (bool, Optional): if to use plt.show()
        title (str, Optional): axes title
        xlabel (str, Optional): axes xlabel
        ylabel (str, Optional): axes ylabel
        fontsize (int, Optional): axes title, xlabel, ylabel fontsize
        color (str, Optional): color name to use
        label (str, Optional): label name
        ticklabelsize (int, Optional): axes xticklabels, yticklabels fontsize

    Returns:
        ax (matplotlib.pyplot.Axes): output figure

    Raises:
        AssertionError: if the event was not included during .fit()
    """

    assert event in self.events, f"Cannot plot event {event} alpha - it was not included during .fit()"

    if ax is None:
        _, ax = plt.subplots(1, 1)
    for tick_kind in ('major', 'minor'):
        ax.tick_params(axis='both', which=tick_kind, labelsize=ticklabelsize)

    # Fill in the defaults that depend on the event.
    if title is None:
        title = r'$\alpha_{jt}$' + f' for event {event}'
    if label is None:
        label = f'{event}'
    if color is None:
        color = 'tab:blue'

    event_alpha = self.event_models[event][1]
    times = event_alpha[self.duration_col].values
    alphas = event_alpha['alpha_jt'].values
    ax.scatter(times, alphas, label=label, color=color, **scatter_kwargs)

    ax.set_title(title, fontsize=fontsize)
    ax.set_xlabel(xlabel, fontsize=fontsize)
    ax.set_ylabel(ylabel, fontsize=fontsize)
    if show:
        plt.show()
    return ax

`predict(df, **kwargs)` ¤

Source code in src/pydts/base_fitters.py

def predict(self, df: pd.DataFrame, **kwargs) -> pd.DataFrame:
    """Not implemented here: calling this method always raises NotImplementedError."""
    raise NotImplementedError

`predict_cumulative_incident_function(df)` ¤

This function adds columns of the predicted hazard function, overall survival, probabilities of event occurrence and cumulative incident function (CIF) to the given dataframe.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	dataframe with covariates columns included	required

Returns:

Name	Type	Description
`df`	`DataFrame`	dataframe with additional prediction columns

Source code in src/pydts/base_fitters.py

def predict_cumulative_incident_function(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    This function adds columns of the predicted hazard function, overall survival, probabilities of event occurrence
    and cumulative incident function (CIF) to the given dataframe.

    An event is skipped when its CIF column for the second-to-last fitted time is already in df.

    Args:
        df (pandas.DataFrame): dataframe with covariates columns included

    Returns:
        df (pandas.DataFrame): dataframe with additional prediction columns

    """
    self._validate_covariates_in_df(df.head())

    for event in self.events:
        already_predicted = f'cif_j{event}_at_t{self.times[-2]}' in df.columns
        if already_predicted:
            continue
        df = self.predict_event_cumulative_incident_function(df=df, event=event)
    return df

`predict_event_cumulative_incident_function(df, event)` ¤

This function adds a specific event columns of the predicted hazard function, overall survival, probabilities of event occurrence and cumulative incident function (CIF) to the given dataframe.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	dataframe with covariates columns included	required
`event`	`Union[str, int]`	event name	required

Returns:

Name	Type	Description
`df`	`DataFrame`	dataframe with additional prediction columns

Source code in src/pydts/base_fitters.py

def predict_event_cumulative_incident_function(self, df: pd.DataFrame, event: Union[str, int]) -> pd.DataFrame:
    """
    This function adds a specific event columns of the predicted hazard function, overall survival, probabilities
    of event occurrence and cumulative incident function (CIF) to the given dataframe.

    The CIF columns are the running sum, over the fitted times except the last one, of the
    event probability columns `prob_j{event}_at_t{t}`.

    Args:
        df (pandas.DataFrame): dataframe with covariates columns included
        event (Union[str, int]): event name

    Returns:
        df (pandas.DataFrame): dataframe with additional prediction columns

    """
    assert event in self.events, \
        f"Cannot predict for event {event} - it was not included during .fit()"
    self._validate_covariates_in_df(df.head())

    # Compute the event probabilities first when they are not in df yet.
    if f'prob_j{event}_at_t{self.times[-2]}' not in df.columns:
        df = self.predict_prob_events(df=df)

    prediction_times = self.times[:-1]
    prob_cols = [f'prob_j{event}_at_t{t}' for t in prediction_times]
    cif_cols = [f'cif_j{event}_at_t{t}' for t in prediction_times]

    cumulative = df[prob_cols].cumsum(axis=1)
    cumulative.columns = cif_cols
    return pd.concat([df, cumulative], axis=1)

`predict_hazard_all(df)` ¤

This function calculates the hazard for all the events at all time values included in the training set for each event.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	samples to predict for	required

Returns:

Name	Type	Description
`df`	`DataFrame`	samples with the prediction columns

Source code in src/pydts/base_fitters.py

def predict_hazard_all(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    This function calculates the hazard for all the events at all time values included in the training set for each
    event.

    The last fitted time is excluded: predict_hazard_t is called with self.times[:-1].

    Args:
        df (pd.DataFrame): samples to predict for

    Returns:
        df (pd.DataFrame): samples with the prediction columns

    """
    self._validate_covariates_in_df(df.head())
    prediction_times = self.times[:-1]
    return self.predict_hazard_t(df, t=prediction_times)

`predict_hazard_jt(df, event, t)` ¤

This method calculates the hazard for the given event at the given time values if they were included in the training set of the event.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	samples to predict for	required
`event`	`Union[str, int]`	event name	required
`t`	`Union[Iterable, int]`	times to calculate the hazard for	required

Returns:

Name	Type	Description
`df`	`DataFrame`	samples with the prediction columns

Source code in src/pydts/fitters.py

def predict_hazard_jt(self,
                      df: pd.DataFrame,
                      event: Union[str, int],
                      t: Union[Iterable, int]) -> pd.DataFrame:
    """
    This method calculates the hazard for the given event at the given time values if they were included in the
    training set of the event.

    For every requested time ``c`` whose column ``hazard_j{event}_t{c}`` is not already present in ``df``, the
    hazard is obtained by applying ``self._hazard_inverse_transformation`` to ``alpha_jt(c) + beta_j * x``.
    The ``alpha_jt`` values are read from the second item of ``self.event_models[event]`` and the coefficients
    from the attribute named ``self.beta_models_params_attr`` of its first item.

    Args:
        df (pd.DataFrame): samples to predict for
        event (Union[str, int]): event name
        t (Union[Iterable, int]): times to calculate the hazard for

    Returns:
        df (pd.DataFrame): samples with the prediction columns. When every requested hazard column already
            exists, the input dataframe itself is returned; otherwise a copy with the new columns is returned.

    Raises:
        AssertionError: if ``event`` is not in ``self.events``.
        ValueError: if ``self.covariates`` is neither a list nor a dict.
    """
    self._validate_covariates_in_df(df.head())
    t = self._validate_t(t, return_iter=True)
    assert event in self.events, \
        f"Cannot predict for event {event} - it was not included during .fit()"

    model = self.event_models[event]
    alpha_df = model[1].set_index(self.duration_col)['alpha_jt'].copy()

    # Keep only the times whose hazard column still has to be computed.
    _t = np.array([t_i for t_i in t if (f'hazard_j{event}_t{t_i}' not in df.columns)])
    if len(_t) == 0:
        return df

    # Covariates are either shared by all events (list) or given per event (dict).
    if isinstance(self.covariates, list):
        event_covariates = self.covariates
    elif isinstance(self.covariates, dict):
        event_covariates = self.covariates[event]
    else:
        # Fail explicitly (as _alpha_jt does) instead of hitting an unbound local variable below.
        raise ValueError(
            f"covariates must be a list or a dict, got {type(self.covariates).__name__}")

    temp_df = df.copy()
    beta_j = getattr(model[0], self.beta_models_params_attr)
    beta_j_x = temp_df[event_covariates].dot(beta_j)
    temp_df[[f'hazard_j{event}_t{c}' for c in _t]] = pd.concat(
        [self._hazard_inverse_transformation(alpha_df[c] + beta_j_x) for c in _t], axis=1).values
    return temp_df

`predict_hazard_t(df, t)` ¤

This function calculates the hazard for all the events at the requested time values if they were included in the training set of each event.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	samples to predict for	required
`t`	`(int, array)`	times to calculate the hazard for	required

Returns:

Name	Type	Description
`df`	`DataFrame`	samples with the prediction columns

Source code in src/pydts/base_fitters.py

def predict_hazard_t(self, df: pd.DataFrame, t: Union[int, np.array]) -> pd.DataFrame:
    """
    Predict the hazard of every fitted event at the requested time values.

    The requested times are passed through ``self._validate_t`` and then ``predict_hazard_jt`` is applied
    once per key of ``self.event_models``, each call receiving the dataframe returned by the previous one.

    Args:
        df (pd.DataFrame): samples to predict for
        t (int, np.array): times to calculate the hazard for

    Returns:
        df (pd.DataFrame): samples with the prediction columns
    """
    requested_times = self._validate_t(t)
    self._validate_covariates_in_df(df.head())

    result = df
    # Only the event names are needed here; the fitted models are looked up by predict_hazard_jt.
    for event_name in self.event_models.keys():
        result = self.predict_hazard_jt(df=result, event=event_name, t=requested_times)
    return result

`predict_marginal_prob_all_events(df)` ¤

This function calculates the marginal probability per event given the covariates for all the events.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	dataframe with covariates columns included	required

Returns:

Name	Type	Description
`df`	`DataFrame`	dataframe with additional prediction columns

Source code in src/pydts/base_fitters.py

def predict_marginal_prob_all_events(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Add the marginal probability of each event in ``self.events``, given the covariates.

    ``predict_marginal_prob_event_j`` is applied once per event, in the order of ``self.events``; each call
    receives the dataframe returned by the previous one.

    Args:
        df (pandas.DataFrame): dataframe with covariates columns included

    Returns:
        df (pandas.DataFrame): dataframe with additional prediction columns
    """
    self._validate_covariates_in_df(df.head())
    result = df
    for event_name in self.events:
        result = self.predict_marginal_prob_event_j(df=result, event=event_name)
    return result

`predict_marginal_prob_event_j(df, event)` ¤

This function calculates the marginal probability of an event given the covariates.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	dataframe with covariates columns included	required
`event`	`Union[str, int]`	event name	required

Returns:

Name	Type	Description
`df`	`DataFrame`	dataframe with additional prediction columns

Source code in src/pydts/base_fitters.py

def predict_marginal_prob_event_j(self, df: pd.DataFrame, event: Union[str, int]) -> pd.DataFrame:
    """
    Add the marginal probability of a single event, given the covariates.

    The marginal probability is the row-wise sum of the ``prob_j{event}_at_t{t}`` columns over
    ``self.times[:-1]``. When the column for ``self.times[-2]`` is missing from ``df``, the probability
    columns are first produced with ``predict_prob_event_j_all``.

    Args:
        df (pandas.DataFrame): dataframe with covariates columns included
        event (Union[str, int]): event name

    Returns:
        df (pandas.DataFrame): dataframe with additional prediction columns
    """

    assert event in self.events, \
        f"Cannot predict for event {event} - it was not included during .fit()"
    self._validate_covariates_in_df(df.head())

    # The column of the last prediction time serves as the marker that probabilities were already computed.
    last_prob_col = f'prob_j{event}_at_t{self.times[-2]}'
    if last_prob_col not in df.columns:
        df = self.predict_prob_event_j_all(df=df, event=event)

    prob_cols = [f'prob_j{event}_at_t{time_value}' for time_value in self.times[:-1]]
    total = df[prob_cols].sum(axis=1).rename(f'marginal_prob_j{event}')
    return pd.concat([df, total], axis=1)

`predict_overall_survival(df, t=None, return_hazards=False)` ¤

This function adds columns of the overall survival until time t.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	dataframe with covariates columns	required
`t`	`int`	time	`None`
`return_hazards`	`bool`	whether to keep the hazard columns	`False`

Returns:

Name	Type	Description
`df`	`DataFrame`	dataframe with the additional overall survival columns

Source code in src/pydts/base_fitters.py

def predict_overall_survival(self,
                             df: pd.DataFrame,
                             t: int = None,
                             return_hazards: bool = False) -> pd.DataFrame:
    """
    This function adds columns of the overall survival until time t.

    For each time ``t_i`` the probability of no event at ``t_i`` is one minus the sum of the
    ``hazard_j{e}_t{t_i}`` columns over ``self.events``; the ``overall_survival_t{t_i}`` column is the
    cumulative product of these values over the times up to ``t_i``.

    Args:
        df (pandas.DataFrame): dataframe with covariates columns
        t (int): time. When None, all of ``self.times[:-1]`` are used; otherwise only the times that are
            smaller than or equal to ``t``.
        return_hazards (bool): if to keep the hazard columns

    Returns:
        df (pandas.DataFrame): dataframe with the additional overall survival columns

    """
    if t is not None:
        self._validate_t(t, return_iter=False)
    self._validate_covariates_in_df(df.head())

    all_hazards = self.predict_hazard_all(df)
    _times = self.times[:-1] if t is None else [_t for _t in self.times[:-1] if _t <= t]

    # Collect the per-time "no event" probabilities and concatenate them once, instead of growing a
    # dataframe with a concat call on every iteration.
    no_event_probs = []
    for t_i in _times:
        cols = [f'hazard_j{e}_t{t_i}' for e in self.events]
        t_i_no_event = 1 - all_hazards[cols].sum(axis=1)
        t_i_no_event.name = f'overall_survival_t{t_i}'
        no_event_probs.append(t_i_no_event)

    if no_event_probs:
        survival = pd.concat(no_event_probs, axis=1).cumprod(axis=1)
    else:
        # No times to predict for: nothing is added to df (pd.concat rejects an empty list).
        survival = pd.DataFrame(index=df.index)
    overall = pd.concat([df, survival], axis=1)

    if return_hazards:
        # Add only the hazard columns that are not already part of the result.
        cols = all_hazards.columns[all_hazards.columns.str.startswith("hazard_")]
        cols = cols.difference(overall.columns)
        if len(cols) > 0:
            overall = pd.concat([overall, all_hazards[cols]], axis=1)
    return overall

`predict_prob_event_j_all(df, event)` ¤

This function adds columns of a specific event occurrence probabilities.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	dataframe with covariates columns	required
`event`	`Union[str, int]`	event name	required

Returns:

Name	Type	Description
`df`	`DataFrame`	dataframe with probabilities columns

Source code in src/pydts/base_fitters.py

def predict_prob_event_j_all(self, df: pd.DataFrame, event: Union[str, int]) -> pd.DataFrame:
    """
    Add the occurrence probability columns of a single event for every time in ``self.times[:-1]``.

    When the ``overall_survival_t{self.times[-2]}`` column is missing, the overall survival (together with
    the hazards) is computed first. Probability columns that already exist in ``df`` are left untouched.

    Args:
        df (pandas.DataFrame): dataframe with covariates columns
        event (Union[str, int]): event name

    Returns:
        df (pandas.DataFrame): dataframe with probabilities columns

    """
    assert event in self.events, \
        f"Cannot predict for event {event} - it was not included during .fit()"
    self._validate_covariates_in_df(df.head())

    last_survival_col = f'overall_survival_t{self.times[-2]}'
    if last_survival_col not in df.columns:
        df = self.predict_overall_survival(df, return_hazards=True)

    for time_value in self.times[:-1]:
        if f'prob_j{event}_at_t{time_value}' in df.columns:
            # Already available - nothing to compute for this time.
            continue
        df = self.predict_prob_event_j_at_t(df=df, event=event, t=time_value)
    return df

`predict_prob_event_j_at_t(df, event, t)` ¤

This function adds a column with the probability of occurrence of a specific event at a specific time.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	dataframe with covariates columns	required
`event`	`Union[str, int]`	event name	required
`t`	`int`	time	required

Returns:

Name	Type	Description
`df`	`DataFrame`	dataframe with an additional probability column

Source code in src/pydts/base_fitters.py

def predict_prob_event_j_at_t(self, df: pd.DataFrame, event: Union[str, int], t: int) -> pd.DataFrame:
    """
    Add a column with the probability of occurrence of a specific event at a specific time.

    For ``t == 1`` the probability equals the hazard of the event at ``t``. For any other ``t`` it is the
    product of the ``overall_survival_t{t - 1}`` column and the hazard of the event at ``t``. Columns that
    are missing from ``df`` are computed on demand. If the probability column already exists, ``df`` is
    returned unchanged.

    Note that the new column is assigned directly on the dataframe held at that point, which is the
    caller's ``df`` whenever none of the prerequisite columns had to be computed.

    Args:
        df (pandas.DataFrame): dataframe with covariates columns
        event (Union[str, int]): event name
        t (int): time

    Returns:
        df (pandas.DataFrame): dataframe with an additional probability column

    """
    assert event in self.events, \
        f"Cannot predict for event {event} - it was not included during .fit()"
    self._validate_t(t, return_iter=False)
    self._validate_covariates_in_df(df.head())

    prob_col = f'prob_j{event}_at_t{t}'
    if prob_col in df.columns:
        return df

    hazard_col = f'hazard_j{event}_t{t}'
    if t == 1:
        # At t == 1 the probability is the hazard itself - no survival column is involved.
        if hazard_col not in df.columns:
            df = self.predict_hazard_jt(df=df, event=event, t=t)
        df[prob_col] = df[hazard_col]
        return df

    survival_col = f'overall_survival_t{t - 1}'
    if survival_col not in df.columns:
        # Requested with return_hazards=True so that the hazard columns are added as well.
        df = self.predict_overall_survival(df, t=t, return_hazards=True)
    elif hazard_col not in df.columns:
        times_up_to_t = np.array([_t for _t in self.times[:-1] if _t <= t])
        df = self.predict_hazard_t(df, t=times_up_to_t)
    df[prob_col] = df[survival_col] * df[hazard_col]
    return df

`predict_prob_events(df)` ¤

This function adds columns of the occurrence probabilities of all the events.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	dataframe with covariates columns	required

Returns:

Name	Type	Description
`df`	`DataFrame`	dataframe with probabilities columns

Source code in src/pydts/base_fitters.py

def predict_prob_events(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Add the occurrence probability columns of every event in ``self.events``.

    ``predict_prob_event_j_all`` is applied once per event, in the order of ``self.events``; each call
    receives the dataframe returned by the previous one.

    Args:
        df (pandas.DataFrame): dataframe with covariates columns

    Returns:
        df (pandas.DataFrame): dataframe with probabilities columns

    """
    self._validate_covariates_in_df(df.head())

    result = df
    for event_name in self.events:
        result = self.predict_prob_event_j_all(df=result, event=event_name)
    return result

`print_summary(summary_func='print_summary', summary_kwargs={})` ¤

This method prints the summary of the fitted models for all the events.

Parameters:

Name	Type	Description	Default
`summary_func`	`(str, Optional)`	print summary method of the fitted model type ("summary", "print_summary").	`'print_summary'`
`summary_kwargs`	`(dict, Optional)`	Keyword arguments to pass to the model summary function.	`{}`

Returns:

Type	Description
`None`	None

Source code in src/pydts/fitters.py

def print_summary(self,
                  summary_func: str = "print_summary",
                  summary_kwargs: dict = {}) -> None:
    """
    Display the summary of the fitted models for all the events.

    The table returned by ``get_beta_SE()`` is displayed first, followed, for each event, by a header line
    and the second item of its entry in ``self.event_models`` indexed by the event type and duration columns.

    Args:
        summary_func (str, Optional): print summary method of the fitted model type ("summary", "print_summary").
            Not referenced by this implementation.
        summary_kwargs (dict, Optional): Keyword arguments to pass to the model summary function.
            Not referenced by this implementation.

    Returns:
        None
    """
    from IPython.display import display

    beta_table = self.get_beta_SE()
    display(beta_table)

    index_cols = [self.event_type_col, self.duration_col]
    for event_name, fitted in self.event_models.items():
        print(f'\n\nModel summary for event: {event_name}')
        event_table = fitted[1].set_index(index_cols)
        display(event_table)

The Two Stages Procedure of Meir and Gorfine (2023) - Efron

pydts.fitters.TwoStagesFitter() ¤

alpha_df = pd.DataFrame() instance-attribute ¤

beta_models = {} instance-attribute ¤

beta_models_params_attr = 'params_' instance-attribute ¤

covariates = None instance-attribute ¤

duration_col = None instance-attribute ¤

event_models = {} instance-attribute ¤

event_type_col = None instance-attribute ¤

events = None instance-attribute ¤

expanded_df = pd.DataFrame() instance-attribute ¤

formula = None instance-attribute ¤

pid_col = None instance-attribute ¤

times = None instance-attribute ¤

_alpha_jt(x, df, y_t, beta_j, n_jt, t, event) ¤

_expand_data(df, event_type_col, duration_col, pid_col) ¤

_fit_beta(expanded_df, events, model=CoxPHFitter, model_kwargs={}, model_fit_kwargs={}) ¤

_fit_event_beta(expanded_df, event, model=CoxPHFitter, model_kwargs={}, model_fit_kwargs={}) ¤

_hazard_inverse_transformation(a) ¤

_hazard_transformation(a) ¤

_validate_cols(df, event_type_col, duration_col, pid_col) ¤

_validate_covariates_in_df(df) ¤

_validate_t(t, return_iter=True) ¤

evaluate(test_df, oracle_col='T', **kwargs) ¤

fit(df, covariates=None, event_type_col='J', duration_col='X', pid_col='pid', skip_expansion=False, x0=0, fit_beta_kwargs={}, verbose=2, nb_workers=WORKERS) ¤

get_alpha_df() ¤

get_beta_SE() ¤

plot_all_events_alpha(ax=None, scatter_kwargs={}, colors=COLORS, show=True, title=None, xlabel='t', ylabel='$\\alpha_{jt}$', fontsize=18, ticklabelsize=15) ¤

plot_all_events_beta(ax=None, colors=COLORS, show=True, title=None, xlabel='Value', ylabel='$\\beta_{j}$', fontsize=18, ticklabelsize=15) ¤

plot_event_alpha(event, ax=None, scatter_kwargs={}, show=True, title=None, xlabel='t', ylabel='$\\alpha_{jt}$', fontsize=18, color=None, label=None, ticklabelsize=15) ¤

predict(df, **kwargs) ¤

predict_cumulative_incident_function(df) ¤

predict_event_cumulative_incident_function(df, event) ¤

predict_hazard_all(df) ¤

predict_hazard_jt(df, event, t) ¤

predict_hazard_t(df, t) ¤

predict_marginal_prob_all_events(df) ¤

predict_marginal_prob_event_j(df, event) ¤

predict_overall_survival(df, t=None, return_hazards=False) ¤

predict_prob_event_j_all(df, event) ¤

predict_prob_event_j_at_t(df, event, t) ¤

predict_prob_events(df) ¤

print_summary(summary_func='print_summary', summary_kwargs={}) ¤

`pydts.fitters.TwoStagesFitter()` ¤

`alpha_df = pd.DataFrame()` `instance-attribute` ¤

`beta_models = {}` `instance-attribute` ¤

`beta_models_params_attr = 'params_'` `instance-attribute` ¤

`covariates = None` `instance-attribute` ¤

`duration_col = None` `instance-attribute` ¤

`event_models = {}` `instance-attribute` ¤

`event_type_col = None` `instance-attribute` ¤

`events = None` `instance-attribute` ¤

`expanded_df = pd.DataFrame()` `instance-attribute` ¤

`formula = None` `instance-attribute` ¤

`pid_col = None` `instance-attribute` ¤

`times = None` `instance-attribute` ¤

`_alpha_jt(x, df, y_t, beta_j, n_jt, t, event)` ¤

`_expand_data(df, event_type_col, duration_col, pid_col)` ¤

`_fit_beta(expanded_df, events, model=CoxPHFitter, model_kwargs={}, model_fit_kwargs={})` ¤

`_fit_event_beta(expanded_df, event, model=CoxPHFitter, model_kwargs={}, model_fit_kwargs={})` ¤

`_hazard_inverse_transformation(a)` ¤

`_hazard_transformation(a)` ¤

`_validate_cols(df, event_type_col, duration_col, pid_col)` ¤

`_validate_covariates_in_df(df)` ¤

`_validate_t(t, return_iter=True)` ¤

`evaluate(test_df, oracle_col='T', **kwargs)` ¤

`fit(df, covariates=None, event_type_col='J', duration_col='X', pid_col='pid', skip_expansion=False, x0=0, fit_beta_kwargs={}, verbose=2, nb_workers=WORKERS)` ¤

`get_alpha_df()` ¤

`get_beta_SE()` ¤

`plot_all_events_alpha(ax=None, scatter_kwargs={}, colors=COLORS, show=True, title=None, xlabel='t', ylabel='$\\alpha_{jt}$', fontsize=18, ticklabelsize=15)` ¤

`plot_all_events_beta(ax=None, colors=COLORS, show=True, title=None, xlabel='Value', ylabel='$\\beta_{j}$', fontsize=18, ticklabelsize=15)` ¤

`plot_event_alpha(event, ax=None, scatter_kwargs={}, show=True, title=None, xlabel='t', ylabel='$\\alpha_{jt}$', fontsize=18, color=None, label=None, ticklabelsize=15)` ¤

`predict(df, **kwargs)` ¤

`predict_cumulative_incident_function(df)` ¤

`predict_event_cumulative_incident_function(df, event)` ¤

`predict_hazard_all(df)` ¤

`predict_hazard_jt(df, event, t)` ¤

`predict_hazard_t(df, t)` ¤

`predict_marginal_prob_all_events(df)` ¤

`predict_marginal_prob_event_j(df, event)` ¤

`predict_overall_survival(df, t=None, return_hazards=False)` ¤

`predict_prob_event_j_all(df, event)` ¤

`predict_prob_event_j_at_t(df, event, t)` ¤

`predict_prob_events(df)` ¤

`print_summary(summary_func='print_summary', summary_kwargs={})` ¤