Skip to content

Data Expansion Procedure of Lee et al. (2018)

This class implements the estimation procedure of Lee et al. (2018) [1]. See also the Example section.

Examples:

1
2
3
4
    from pydts.fitters import DataExpansionFitter
    fitter = DataExpansionFitter()
    fitter.fit(df=train_df, event_type_col='J', duration_col='X')
    fitter.print_summary()

References

[1] Lee, Minjung and Feuer, Eric J. and Fine, Jason P., "On the analysis of discrete time competing risks data", Biometrics (2018) doi: 10.1111/biom.12881

Source code in pydts/fitters.py
class DataExpansionFitter(ExpansionBasedFitter):
    """
    This class implements the estimation procedure of Lee et al. (2018) [1].
    See also the Example section.

    One model is fitted per event type on the expanded data set (a binomial GLM unless
    other ``models_kwargs`` are supplied to fit()). The fitted models are stored in
    ``self.event_models``, keyed by event.

    Example:
        ```py linenums="1"
            from pydts.fitters import DataExpansionFitter
            fitter = DataExpansionFitter()
            fitter.fit(df=train_df, event_type_col='J', duration_col='X')
            fitter.print_summary()
        ```

    References:
        [1] Lee, Minjung and Feuer, Eric J. and Fine, Jason P., "On the analysis of discrete time competing risks data", Biometrics (2018) doi: 10.1111/biom.12881
    """

    def __init__(self):
        super().__init__()
        # Default keyword arguments for the GLM constructor; fit() replaces them when
        # the caller passes models_kwargs.
        self.models_kwargs = dict(family=sm.families.Binomial())

    def _fit_event(self, model_fit_kwargs=None):
        """
        This method fits a GLM model for a specific event, using the formula currently stored
        in self.formula and the data in self.expanded_df.

        Args:
            model_fit_kwargs (dict, Optional): Keyword arguments to pass to model.fit() method.
                                               None is treated as no extra arguments.

        Returns:
            fitted GLM model
        """
        model_fit_kwargs = {} if model_fit_kwargs is None else model_fit_kwargs
        model = sm.GLM.from_formula(formula=self.formula, data=self.expanded_df, **self.models_kwargs)
        return model.fit(**model_fit_kwargs)

    def fit(self,
            df: pd.DataFrame,
            event_type_col: str = 'J',
            duration_col: str = 'X',
            pid_col: str = 'pid',
            skip_expansion: bool = False,
            covariates: Optional[list] = None,
            formula: Optional[str] = None,
            models_kwargs: Optional[dict] = None,
            model_fit_kwargs: Optional[dict] = None) -> dict:
        """
        This method fits a model to the discrete data.

        Args:
            df (pd.DataFrame): training data for fitting the model
            event_type_col (str): The event type column name (must be a column in df),
                                  Right censored sample (i) is indicated by event value 0, df.loc[i, event_type_col] = 0.
            duration_col (str): Last follow up time column name (must be a column in df).
            pid_col (str): Sample ID column name (must be a column in df).
            skip_expansion (boolean): Skips the dataframe expansion step. Use this option only if the provided dataframe (df) is already correctly expanded (see [1]).
                                      When set to True, the df is expected to be in the format produced by the pydts.utils.get_expanded_df() method, as if it were applied to the unexpanded data.
            covariates (list, Optional): A list of covariates, all must be columns in df.
                                         Defaults to all the columns of df except event_type_col, duration_col, and pid_col.
            formula (str, Optional): Model formula to be fitted. Patsy format string.
                                     Only the right-hand side is expected; 'j_{event} ~ ' is prepended for each event.
            models_kwargs (dict, Optional): Keyword arguments to pass to model instance initiation.
                                            When given, it replaces the instance defaults.
            model_fit_kwargs (dict, Optional): Keyword arguments to pass to model.fit() method.
                                               None (the default) means no extra arguments.

        Returns:
            event_models (dict): Fitted models dictionary. Keys - event names, Values - fitted models for the event.

        Raises:
            ValueError: If df has a column named 'C', or if a requested covariate is not a column of df.

        References:
            [1] Meir, Tomer and Gorfine, Malka, "Discrete-time Competing-Risks Regression with or without Penalization", https://arxiv.org/abs/2303.01186
        """

        if models_kwargs is not None:
            self.models_kwargs = models_kwargs
        # A fresh dict per call instead of a shared mutable default; also accepts an explicit None.
        model_fit_kwargs = {} if model_fit_kwargs is None else model_fit_kwargs

        if 'C' in df.columns:
            raise ValueError('C is an invalid column name, to avoid errors with categorical symbol C() in formula')
        self._validate_cols(df, event_type_col, duration_col, pid_col)
        if covariates is not None:
            cov_not_in_df = [cov for cov in covariates if cov not in df.columns]
            if len(cov_not_in_df) > 0:
                raise ValueError(f"Error during fit - missing covariates from df: {cov_not_in_df}")

        # Remember the duration column name: the default formula refers to it by name, so
        # predict_hazard_jt() must build its design data with the same column name.
        self.duration_col = duration_col
        # Event value 0 marks right censoring and gets no model of its own.
        self.events = [c for c in sorted(df[event_type_col].unique()) if c != 0]
        self.covariates = [col for col in df if col not in [event_type_col, duration_col, pid_col]] \
                          if covariates is None else covariates
        self.times = sorted(df[duration_col].unique())

        if not skip_expansion:
            self.expanded_df = self._expand_data(df=df, event_type_col=event_type_col, duration_col=duration_col,
                                                 pid_col=pid_col)
        else:
            print('Skipping data expansion step, only use this option if the provided dataframe (df) is already correctly expanded.')
            self.expanded_df = df

        cov = ' + '.join(self.covariates)
        for event in self.events:
            # Default right-hand side: all covariates plus one categorical level per time, no intercept.
            _formula = f'j_{event} ~ {formula}' if formula is not None else \
                f'j_{event} ~ {cov} + C({duration_col}) -1 '
            # _fit_event() reads the formula from self.formula, so it is overwritten per event.
            self.formula = _formula
            self.event_models[event] = self._fit_event(model_fit_kwargs=model_fit_kwargs)
        return self.event_models

    def print_summary(self,
                      summary_func: str = "summary",
                      summary_kwargs: Optional[dict] = None) -> None:
        """
        This method prints the summary of the fitted models for all the events.

        Args:
            summary_func (str, Optional): print summary method of the fitted model type ("summary", "print_summary").
            summary_kwargs (dict, Optional): Keyword arguments to pass to the model summary function.
                                             None (the default) means no extra arguments.

        Returns:
            None
        """
        summary_kwargs = {} if summary_kwargs is None else summary_kwargs
        for event, model in self.event_models.items():
            _summary_func = getattr(model, summary_func, None)
            if _summary_func is not None:
                print(f'\n\nModel summary for event: {event}')
                print(_summary_func(**summary_kwargs))
            else:
                print(f'Not {summary_func} function in event {event} model')

    def predict_hazard_jt(self,
                          df: pd.DataFrame,
                          event: Union[str, int],
                          t: Union[Iterable, int],
                          n_jobs: int = -1) -> pd.DataFrame:
        """
        This method calculates the hazard for the given event at the given time values if they were included in
        the training set of the event.

        Only the times whose 'hazard_j{event}_t{time}' column is not already in df are predicted;
        existing hazard columns are left untouched.

        Args:
            df (pd.DataFrame): samples to predict for
            event (Union[str, int]): event name
            t (Union[Iterable, int]): times to calculate the hazard for
            n_jobs: number of CPUs to use, defaults to every available CPU
        Returns:
            df (pd.DataFrame): samples with the prediction columns. The input df itself is returned
                               when every requested hazard column already exists, otherwise a copy.
        """
        t = self._validate_t(t, return_iter=True)
        # Kept as an assert so that callers relying on AssertionError keep working.
        assert event in self.events, \
            f"Cannot predict for event {event} - it was not included during .fit()"
        self._validate_covariates_in_df(df.head())

        # A plain list keeps the element types of t, so the column names are formatted as before.
        missing_t = [t_i for t_i in t if (f'hazard_j{event}_t{t_i}' not in df.columns)]
        if len(missing_t) == 0:
            return df

        # Use the duration column name recorded by fit(); fall back to the default name 'X'
        # for instances on which it was never recorded.
        duration_col = getattr(self, 'duration_col', None) or 'X'

        temp_df = df.copy()
        model = self.event_models[event]
        covariates_df = df[self.covariates]
        res = Parallel(n_jobs=n_jobs)(
            delayed(model.predict)(covariates_df.assign(**{duration_col: c})) for c in missing_t)
        temp_hazard_df = pd.concat(res, axis=1)
        temp_df[[f'hazard_j{event}_t{c_}' for c_ in missing_t]] = temp_hazard_df.values
        return temp_df

    def _get_params_table(self, event):
        """
        This function parses the coefficients table of the fitted model of a single event.

        The table is taken from the CSV rendering of summary().tables[1]: the first CSV row becomes
        the column names and the first CSV column becomes the index. All the values are strings.

        Args:
            event: event name (a key of self.event_models)

        Returns:
            summary_df (pandas.DataFrame): coefficients table with (event, column name) MultiIndex columns
        """
        summary = self.event_models[event].summary()
        summary_df = pd.DataFrame([x.split(',') for x in summary.tables[1].as_csv().split('\n')])
        summary_df.columns = summary_df.iloc[0]
        summary_df = summary_df.iloc[1:].set_index(summary_df.columns[0])
        summary_df.columns = pd.MultiIndex.from_product([[event], summary_df.columns])
        return summary_df

    def get_beta_SE(self):
        """
        This function returns the Beta coefficients and their Standard Errors for all the events.

        The last len(self.covariates) rows of each coefficients table are taken as the Beta rows.

        Returns:
            se_df (pandas.DataFrame): Beta coefficients and Standard Errors Dataframe

        """

        full_table = pd.DataFrame()
        for event in self.events:
            summary_df = self._get_params_table(event)
            # An explicit start position: iloc[-0:] would return every row when there are no covariates.
            first_beta_row = max(len(summary_df) - len(self.covariates), 0)
            full_table = pd.concat([full_table, summary_df.iloc[first_beta_row:]], axis=1)
        return full_table

    def get_alpha_df(self):
        """
        This function returns the Alpha coefficients and their Standard Errors for all the events.

        Returns:
            se_df (pandas.DataFrame): Alpha coefficients and Standard Errors Dataframe

        """

        full_table = pd.DataFrame()
        for event in self.events:
            summary_df = self._get_params_table(event)
            # NOTE(review): this drops len(self.covariates) + 1 trailing rows, i.e. one row more
            # than get_beta_SE() keeps - confirm that the extra row is intended.
            full_table = pd.concat([full_table, summary_df.iloc[:-len(self.covariates)-1]], axis=1)
        return full_table

fit(self, df, event_type_col='J', duration_col='X', pid_col='pid', skip_expansion=False, covariates=None, formula=None, models_kwargs=None, model_fit_kwargs={}) ¤

This method fits a model to the discrete data.

Parameters:

Name Type Description Default
df pd.DataFrame

training data for fitting the model

required
event_type_col str

The event type column name (must be a column in df), Right censored sample (i) is indicated by event value 0, df.loc[i, event_type_col] = 0.

'J'
duration_col str

Last follow up time column name (must be a column in df).

'X'
pid_col str

Sample ID column name (must be a column in df).

'pid'
skip_expansion boolean

Skips the dataframe expansion step. Use this option only if the provided dataframe (df) is already correctly expanded (see [1]). When set to True, the df is expected to be in the format produced by the pydts.utils.get_expanded_df() method, as if it were applied to the unexpanded data.

False
covariates list, Optional

A list of covariates, all must be columns in df. Defaults to all the columns of df except event_type_col, duration_col, and pid_col.

None
formula str, Optional

Model formula to be fitted. Patsy format string.

None
models_kwargs dict, Optional

Keyword arguments to pass to model instance initiation.

None
model_fit_kwargs dict, Optional

Keyword arguments to pass to model.fit() method.

{}

Returns:

Type Description
event_models (dict)

Fitted models dictionary. Keys - event names, Values - fitted models for the event.

References

[1] Meir, Tomer and Gorfine, Malka, "Discrete-time Competing-Risks Regression with or without Penalization", https://arxiv.org/abs/2303.01186

Source code in pydts/fitters.py
def fit(self,
        df: pd.DataFrame,
        event_type_col: str = 'J',
        duration_col: str = 'X',
        pid_col: str = 'pid',
        skip_expansion: bool = False,
        covariates: Optional[list] = None,
        formula: Optional[str] = None,
        models_kwargs: Optional[dict] = None,
        model_fit_kwargs: Optional[dict] = None) -> dict:
    """
    This method fits a model to the discrete data.

    Args:
        df (pd.DataFrame): training data for fitting the model
        event_type_col (str): The event type column name (must be a column in df),
                              Right censored sample (i) is indicated by event value 0, df.loc[i, event_type_col] = 0.
        duration_col (str): Last follow up time column name (must be a column in df).
        pid_col (str): Sample ID column name (must be a column in df).
        skip_expansion (boolean): Skips the dataframe expansion step. Use this option only if the provided dataframe (df) is already correctly expanded (see [1]).
                                  When set to True, the df is expected to be in the format produced by the pydts.utils.get_expanded_df() method, as if it were applied to the unexpanded data.
        covariates (list, Optional): A list of covariates, all must be columns in df.
                                     Defaults to all the columns of df except event_type_col, duration_col, and pid_col.
        formula (str, Optional): Model formula to be fitted. Patsy format string.
                                 Only the right-hand side is expected; 'j_{event} ~ ' is prepended for each event.
        models_kwargs (dict, Optional): Keyword arguments to pass to model instance initiation.
                                        When given, it replaces the instance defaults.
        model_fit_kwargs (dict, Optional): Keyword arguments to pass to model.fit() method.
                                           None (the default) means no extra arguments.

    Returns:
        event_models (dict): Fitted models dictionary. Keys - event names, Values - fitted models for the event.

    Raises:
        ValueError: If df has a column named 'C', or if a requested covariate is not a column of df.

    References:
        [1] Meir, Tomer and Gorfine, Malka, "Discrete-time Competing-Risks Regression with or without Penalization", https://arxiv.org/abs/2303.01186
    """

    if models_kwargs is not None:
        self.models_kwargs = models_kwargs
    # A fresh dict per call instead of a shared mutable default; also accepts an explicit None.
    fit_kwargs = {} if model_fit_kwargs is None else model_fit_kwargs

    if 'C' in df.columns:
        raise ValueError('C is an invalid column name, to avoid errors with categorical symbol C() in formula')
    self._validate_cols(df, event_type_col, duration_col, pid_col)
    if covariates is not None:
        cov_not_in_df = [cov for cov in covariates if cov not in df.columns]
        if len(cov_not_in_df) > 0:
            raise ValueError(f"Error during fit - missing covariates from df: {cov_not_in_df}")

    # Keep the duration column name used for this fit on the instance; the default
    # formula below refers to the column by this name.
    self.duration_col = duration_col
    # Event value 0 marks right censoring and gets no model of its own.
    self.events = [c for c in sorted(df[event_type_col].unique()) if c != 0]
    self.covariates = [col for col in df if col not in [event_type_col, duration_col, pid_col]] \
                      if covariates is None else covariates
    self.times = sorted(df[duration_col].unique())

    if not skip_expansion:
        self.expanded_df = self._expand_data(df=df, event_type_col=event_type_col, duration_col=duration_col,
                                             pid_col=pid_col)
    else:
        print('Skipping data expansion step, only use this option if the provided dataframe (df) is already correctly expanded.')
        self.expanded_df = df

    cov = ' + '.join(self.covariates)
    for event in self.events:
        # Default right-hand side: all covariates plus one categorical level per time, no intercept.
        _formula = f'j_{event} ~ {formula}' if formula is not None else \
            f'j_{event} ~ {cov} + C({duration_col}) -1 '
        # _fit_event() takes no formula argument, so the formula is handed over via self.formula.
        self.formula = _formula
        self.event_models[event] = self._fit_event(model_fit_kwargs=fit_kwargs)
    return self.event_models

get_alpha_df(self) ¤

This function returns the Alpha coefficients and their Standard Errors for all the events.

Returns:

Type Description
se_df (pandas.DataFrame)

Alpha coefficients and Standard Errors Dataframe

Source code in pydts/fitters.py
def get_alpha_df(self):
    """
    This function returns the Alpha coefficients and their Standard Errors for all the events.

    For each event, the coefficients table of the fitted model (summary().tables[1]) is parsed
    from its CSV rendering, and all the rows except the last len(self.covariates) + 1 are kept.
    The per-event tables are joined side by side under (event, column name) columns.

    Returns:
        se_df (pandas.DataFrame): Alpha coefficients and Standard Errors Dataframe

    """

    alpha_table = pd.DataFrame()
    for event in self.events:
        csv_text = self.event_models[event].summary().tables[1].as_csv()
        cells = [csv_line.split(',') for csv_line in csv_text.split('\n')]
        params_df = pd.DataFrame(cells)
        # The first CSV row holds the column names, the first CSV column the coefficient names.
        params_df.columns = params_df.iloc[0]
        name_col = params_df.columns[0]
        params_df = params_df.iloc[1:].set_index(name_col)
        params_df.columns = pd.MultiIndex.from_product([[event], params_df.columns])
        # NOTE(review): this drops len(self.covariates) + 1 trailing rows - confirm that
        # the extra row is intended.
        alpha_rows = params_df.iloc[:-len(self.covariates)-1]
        alpha_table = pd.concat([alpha_table, alpha_rows], axis=1)
    return alpha_table

get_beta_SE(self) ¤

This function returns the Beta coefficients and their Standard Errors for all the events.

Returns:

Type Description
se_df (pandas.DataFrame)

Beta coefficients and Standard Errors Dataframe

Source code in pydts/fitters.py
def get_beta_SE(self):
    """
    This function returns the Beta coefficients and their Standard Errors for all the events.

    For each event, the coefficients table of the fitted model (summary().tables[1]) is parsed
    from its CSV rendering, and its last len(self.covariates) rows are kept. The per-event
    tables are joined side by side under (event, column name) columns. All the values are strings.

    Returns:
        se_df (pandas.DataFrame): Beta coefficients and Standard Errors Dataframe

    """

    full_table = pd.DataFrame()
    n_covariates = len(self.covariates)
    for event in self.events:
        summary = self.event_models[event].summary()
        summary_df = pd.DataFrame([x.split(',') for x in summary.tables[1].as_csv().split('\n')])
        # The first CSV row holds the column names, the first CSV column the coefficient names.
        summary_df.columns = summary_df.iloc[0]
        summary_df = summary_df.iloc[1:].set_index(summary_df.columns[0])
        summary_df.columns = pd.MultiIndex.from_product([[event], summary_df.columns])
        # An explicit start position: iloc[-0:] would return every row when there are no covariates.
        first_beta_row = max(len(summary_df) - n_covariates, 0)
        full_table = pd.concat([full_table, summary_df.iloc[first_beta_row:]], axis=1)
    return full_table

predict_hazard_jt(self, df, event, t, n_jobs=-1) ¤

This method calculates the hazard for the given event at the given time values if they were included in the training set of the event.

Parameters:

Name Type Description Default
df pd.DataFrame

samples to predict for

required
event Union[str, int]

event name

required
t np.array

times to calculate the hazard for

required
n_jobs int

number of CPUs to use, defaults to every available CPU

-1

Returns:

Type Description
df (pd.DataFrame)

samples with the prediction columns

Source code in pydts/fitters.py
def predict_hazard_jt(self,
                      df: pd.DataFrame,
                      event: Union[str, int],
                      t: Union[Iterable, int],
                      n_jobs: int = -1) -> pd.DataFrame:
    """
    This method calculates the hazard for the given event at the given time values if they were included in
    the training set of the event.

    Only the times whose 'hazard_j{event}_t{time}' column is not already in df are predicted;
    existing hazard columns are left untouched.

    Args:
        df (pd.DataFrame): samples to predict for
        event (Union[str, int]): event name
        t (Union[Iterable, int]): times to calculate the hazard for
        n_jobs: number of CPUs to use, defaults to every available CPU
    Returns:
        df (pd.DataFrame): samples with the prediction columns. The input df itself is returned
                           when every requested hazard column already exists, otherwise a copy.
    """
    t = self._validate_t(t, return_iter=True)
    # Kept as an assert so that callers relying on AssertionError keep working.
    assert event in self.events, \
        f"Cannot predict for event {event} - it was not included during .fit()"
    self._validate_covariates_in_df(df.head())

    # A plain list keeps the element types of t, so the column names are formatted as before.
    missing_t = [t_i for t_i in t if (f'hazard_j{event}_t{t_i}' not in df.columns)]
    if len(missing_t) == 0:
        return df

    # Use the instance's duration_col attribute when it has one; otherwise fall back to the
    # default duration column name 'X'.
    duration_col = getattr(self, 'duration_col', None) or 'X'

    temp_df = df.copy()
    model = self.event_models[event]
    covariates_df = df[self.covariates]
    res = Parallel(n_jobs=n_jobs)(
        delayed(model.predict)(covariates_df.assign(**{duration_col: c})) for c in missing_t)
    temp_hazard_df = pd.concat(res, axis=1)
    temp_df[[f'hazard_j{event}_t{c_}' for c_ in missing_t]] = temp_hazard_df.values
    return temp_df

print_summary(self, summary_func='summary', summary_kwargs={}) ¤

This method prints the summary of the fitted models for all the events.

Parameters:

Name Type Description Default
summary_func str, Optional

print summary method of the fitted model type ("summary", "print_summary").

'summary'
summary_kwargs dict, Optional

Keyword arguments to pass to the model summary function.

{}

Returns:

Type Description
None

None

Source code in pydts/fitters.py
def print_summary(self,
                  summary_func: str = "summary",
                  summary_kwargs: Optional[dict] = None) -> None:
    """
    This method prints the summary of the fitted models for all the events.

    For every model in self.event_models, the method named by summary_func is looked up on the
    model, called with summary_kwargs, and its result is printed. A model without such a method
    gets a one-line notice instead.

    Args:
        summary_func (str, Optional): print summary method of the fitted model type ("summary", "print_summary").
        summary_kwargs (dict, Optional): Keyword arguments to pass to the model summary function.
                                         None (the default) means no extra arguments.

    Returns:
        None
    """
    # A fresh dict per call instead of a shared mutable default; also accepts an explicit None.
    kwargs = {} if summary_kwargs is None else summary_kwargs
    for event, model in self.event_models.items():
        _summary_func = getattr(model, summary_func, None)
        if _summary_func is None:
            print(f'Not {summary_func} function in event {event} model')
            continue
        print(f'\n\nModel summary for event: {event}')
        print(_summary_func(**kwargs))