Data Expansion Procedure of Lee et al. (2018)
This class implements the estimation procedure of Lee et al. (2018) [1]. See also the Example section.
Examples:
References
[1] Lee, Minjung and Feuer, Eric J. and Fine, Jason P., "On the analysis of discrete time competing risks data", Biometrics (2018) doi: 10.1111/biom.12881
Source code in pydts/fitters.py
class DataExpansionFitter(ExpansionBasedFitter):
    """
    This class implements the estimation procedure of Lee et al. (2018) [1].
    See also the Example section.

    Example:
        ```py linenums="1"
            from pydts.fitters import DataExpansionFitter
            fitter = DataExpansionFitter()
            fitter.fit(df=train_df, event_type_col='J', duration_col='X')
            fitter.print_summary()
        ```

    References:
        [1] Lee, Minjung and Feuer, Eric J. and Fine, Jason P., "On the analysis of discrete time competing risks data", Biometrics (2018) doi: 10.1111/biom.12881
    """

    def __init__(self):
        super().__init__()
        # Default keyword arguments for model construction: a binomial-family GLM.
        self.models_kwargs = dict(family=sm.families.Binomial())

    def _fit_event(self, model_fit_kwargs={}):
        """
        This method fits a GLM model for a specific event, using the formula stored in
        self.formula and the data stored in self.expanded_df.

        Args:
            model_fit_kwargs (dict, Optional): Keyword arguments to pass to model.fit() method.
                None is treated as "no extra arguments".

        Returns:
            fitted GLM model
        """
        model = sm.GLM.from_formula(formula=self.formula, data=self.expanded_df, **self.models_kwargs)
        # `or {}` keeps an explicit None from reaching the ** unpacking.
        return model.fit(**(model_fit_kwargs or {}))

    def fit(self,
            df: pd.DataFrame,
            event_type_col: str = 'J',
            duration_col: str = 'X',
            pid_col: str = 'pid',
            skip_expansion: bool = False,
            covariates: Optional[list] = None,
            formula: Optional[str] = None,
            models_kwargs: Optional[dict] = None,
            model_fit_kwargs: Optional[dict] = {}) -> dict:
        """
        This method fits a model to the discrete data.

        Args:
            df (pd.DataFrame): training data for fitting the model
            event_type_col (str): The event type column name (must be a column in df).
                                  Right censored sample (i) is indicated by event value 0, df.loc[i, event_type_col] = 0.
            duration_col (str): Last follow up time column name (must be a column in df).
            pid_col (str): Sample ID column name (must be a column in df).
            skip_expansion (boolean): Skips the dataframe expansion step. Use this option only if the provided dataframe (df) is already correctly expanded (see [1]).
                                      When set to True, the df is expected to be in the format produced by the pydts.utils.get_expanded_df() method, as if it were applied to the unexpanded data.
            covariates (list, Optional): A list of covariates, all must be columns in df.
                                         Defaults to all the columns of df except event_type_col, duration_col, and pid_col.
            formula (str, Optional): Model formula to be fitted. Patsy format string.
            models_kwargs (dict, Optional): Keyword arguments to pass to model instance initiation.
            model_fit_kwargs (dict, Optional): Keyword arguments to pass to model.fit() method.
                                               None is treated as an empty dict.

        Returns:
            event_models (dict): Fitted models dictionary. Keys - event names, Values - fitted models for the event.

        Raises:
            ValueError: if df has a column named 'C', or if a requested covariate is not a column of df.

        References:
            [1] Meir, Tomer and Gorfine, Malka, "Discrete-time Competing-Risks Regression with or without Penalization", https://arxiv.org/abs/2303.01186
        """
        if models_kwargs is not None:
            self.models_kwargs = models_kwargs
        # The signature advertises Optional[dict]; an explicit None must not reach ** unpacking.
        if model_fit_kwargs is None:
            model_fit_kwargs = {}

        if 'C' in df.columns:
            raise ValueError('C is an invalid column name, to avoid errors with categorical symbol C() in formula')
        self._validate_cols(df, event_type_col, duration_col, pid_col)
        if covariates is not None:
            cov_not_in_df = [cov for cov in covariates if cov not in df.columns]
            if cov_not_in_df:
                raise ValueError(f"Error during fit - missing covariates from df: {cov_not_in_df}")

        # Remember the duration column name: the default formula uses it as the time
        # regressor, so prediction has to rebuild a column with the same name.
        self.duration_col = duration_col
        # Event value 0 marks right censoring and is not modelled.
        self.events = [c for c in sorted(df[event_type_col].unique()) if c != 0]
        self.covariates = [col for col in df if col not in [event_type_col, duration_col, pid_col]] \
            if covariates is None else covariates
        self.times = sorted(df[duration_col].unique())

        if not skip_expansion:
            self.expanded_df = self._expand_data(df=df, event_type_col=event_type_col, duration_col=duration_col,
                                                 pid_col=pid_col)
        else:
            print('Skipping data expansion step, only use this option if the provided dataframe (df) is already correctly expanded.')
            self.expanded_df = df

        # The covariates part of the default formula is the same for every event.
        cov = ' + '.join(self.covariates)
        for event in self.events:
            _formula = f'j_{event} ~ {formula}' if formula is not None else \
                f'j_{event} ~ {cov} + C({duration_col}) -1 '
            # _fit_event reads the formula from self.formula.
            self.formula = _formula
            self.event_models[event] = self._fit_event(model_fit_kwargs=model_fit_kwargs)
        return self.event_models

    def print_summary(self,
                      summary_func: str = "summary",
                      summary_kwargs: dict = {}) -> None:
        """
        This method prints the summary of the fitted models for all the events.

        Args:
            summary_func (str, Optional): Name of the fitted model's summary method to call ("summary", "print_summary").
            summary_kwargs (dict, Optional): Keyword arguments to pass to the model summary function.

        Returns:
            None
        """
        for event_name, fitted_model in self.event_models.items():
            summarize = getattr(fitted_model, summary_func, None)
            if summarize is None:
                print(f'Not {summary_func} function in event {event_name} model')
                continue
            print(f'\n\nModel summary for event: {event_name}')
            print(summarize(**summary_kwargs))

    def predict_hazard_jt(self,
                          df: pd.DataFrame,
                          event: Union[str, int],
                          t: Union[Iterable, int],
                          n_jobs: int = -1) -> pd.DataFrame:
        """
        This method calculates the hazard for the given event at the given time values if they were included in
        the training set of the event.

        Hazard columns are named 'hazard_j{event}_t{time}'. Times whose column already exists in df
        are not predicted again; if all of them exist, df itself is returned.

        Args:
            df (pd.DataFrame): samples to predict for
            event (Union[str, int]): event name
            t (Union[Iterable, int]): times to calculate the hazard for
            n_jobs (int): number of CPUs to use, defaults to every available CPU

        Returns:
            df (pd.DataFrame): samples with the prediction columns
        """
        t = self._validate_t(t, return_iter=True)
        assert event in self.events, \
            f"Cannot predict for event {event} - it was not included during .fit()"
        self._validate_covariates_in_df(df.head())

        # Only the times that do not have a hazard column yet need a prediction.
        missing_t = [t_i for t_i in t if f'hazard_j{event}_t{t_i}' not in df.columns]
        if not missing_t:
            return df

        # The time regressor must carry the name used at fit time; fall back to 'X'
        # (the default duration column name of fit) when none is recorded on the instance.
        duration_col = getattr(self, 'duration_col', None) or 'X'
        temp_df = df.copy()
        model = self.event_models[event]
        covariates_df = df[self.covariates]
        res = Parallel(n_jobs=n_jobs)(
            delayed(model.predict)(covariates_df.assign(**{duration_col: c})) for c in missing_t)
        temp_hazard_df = pd.concat(res, axis=1)
        temp_df[[f'hazard_j{event}_t{c_}' for c_ in missing_t]] = temp_hazard_df.values
        return temp_df

    def get_beta_SE(self):
        """
        This function returns the Beta coefficients and their Standard Errors for all the events.

        For each event, the coefficient table of the fitted model summary (summary.tables[1]) is
        parsed from its CSV form, and its last len(self.covariates) rows are kept.
        The cells are kept as the strings produced by the CSV export.

        Returns:
            se_df (pandas.DataFrame): Beta coefficients and Standard Errors Dataframe
        """
        full_table = pd.DataFrame()
        for event in self.events:
            coef_csv = self.event_models[event].summary().tables[1].as_csv()
            event_table = pd.DataFrame([line.split(',') for line in coef_csv.split('\n')])
            # First CSV row holds the column titles, first CSV column holds the parameter names.
            event_table.columns = event_table.iloc[0]
            event_table = event_table.iloc[1:].set_index(event_table.columns[0])
            event_table.columns = pd.MultiIndex.from_product([[event], event_table.columns])
            # Assumes the covariate rows come last in the summary table - TODO confirm.
            beta_rows = event_table.iloc[-len(self.covariates):]
            full_table = pd.concat([full_table, beta_rows], axis=1)
        return full_table

    def get_alpha_df(self):
        """
        This function returns the Alpha coefficients and their Standard Errors for all the events.

        For each event, the coefficient table of the fitted model summary (summary.tables[1]) is
        parsed from its CSV form, and all rows except the last len(self.covariates) + 1 are kept.
        The cells are kept as the strings produced by the CSV export.

        Returns:
            se_df (pandas.DataFrame): Alpha coefficients and Standard Errors Dataframe
        """
        full_table = pd.DataFrame()
        for event in self.events:
            coef_csv = self.event_models[event].summary().tables[1].as_csv()
            event_table = pd.DataFrame([line.split(',') for line in coef_csv.split('\n')])
            # First CSV row holds the column titles, first CSV column holds the parameter names.
            event_table.columns = event_table.iloc[0]
            event_table = event_table.iloc[1:].set_index(event_table.columns[0])
            event_table.columns = pd.MultiIndex.from_product([[event], event_table.columns])
            # NOTE(review): this drops one row more than get_beta_SE keeps - confirm the
            # extra dropped row is intended.
            alpha_rows = event_table.iloc[:-len(self.covariates)-1]
            full_table = pd.concat([full_table, alpha_rows], axis=1)
        return full_table
fit(self, df, event_type_col='J', duration_col='X', pid_col='pid', skip_expansion=False, covariates=None, formula=None, models_kwargs=None, model_fit_kwargs={})
¤
This method fits a model to the discrete data.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
df |
pd.DataFrame |
training data for fitting the model |
required |
event_type_col |
str |
The event type column name (must be a column in df), Right censored sample (i) is indicated by event value 0, df.loc[i, event_type_col] = 0. |
'J' |
duration_col |
str |
Last follow up time column name (must be a column in df). |
'X' |
pid_col |
str |
Sample ID column name (must be a column in df). |
'pid' |
skip_expansion |
boolean |
Skips the dataframe expansion step. Use this option only if the provided dataframe (df) is already correctly expanded (see [1]). When set to True, the df is expected to be in the format produced by the pydts.utils.get_expanded_df() method, as if it were applied to the unexpanded data. |
False |
covariates |
list, Optional |
A list of covariates, all must be columns in df. Defaults to all the columns of df except event_type_col, duration_col, and pid_col. |
None |
formula |
str, Optional |
Model formula to be fitted. Patsy format string. |
None |
models_kwargs |
dict, Optional |
Keyword arguments to pass to model instance initiation. |
None |
model_fit_kwargs |
dict, Optional |
Keyword arguments to pass to model.fit() method. |
{} |
Returns:
Type | Description |
---|---|
event_models (dict) |
Fitted models dictionary. Keys - event names, Values - fitted models for the event. |
References
[1] Meir, Tomer and Gorfine, Malka, "Discrete-time Competing-Risks Regression with or without Penalization", https://arxiv.org/abs/2303.01186
Source code in pydts/fitters.py
def fit(self,
        df: pd.DataFrame,
        event_type_col: str = 'J',
        duration_col: str = 'X',
        pid_col: str = 'pid',
        skip_expansion: bool = False,
        covariates: Optional[list] = None,
        formula: Optional[str] = None,
        models_kwargs: Optional[dict] = None,
        model_fit_kwargs: Optional[dict] = {}) -> dict:
    """
    This method fits a model to the discrete data.

    Args:
        df (pd.DataFrame): training data for fitting the model
        event_type_col (str): The event type column name (must be a column in df).
                              Right censored sample (i) is indicated by event value 0, df.loc[i, event_type_col] = 0.
        duration_col (str): Last follow up time column name (must be a column in df).
        pid_col (str): Sample ID column name (must be a column in df).
        skip_expansion (boolean): Skips the dataframe expansion step. Use this option only if the provided dataframe (df) is already correctly expanded (see [1]).
                                  When set to True, the df is expected to be in the format produced by the pydts.utils.get_expanded_df() method, as if it were applied to the unexpanded data.
        covariates (list, Optional): A list of covariates, all must be columns in df.
                                     Defaults to all the columns of df except event_type_col, duration_col, and pid_col.
        formula (str, Optional): Model formula to be fitted. Patsy format string.
        models_kwargs (dict, Optional): Keyword arguments to pass to model instance initiation.
        model_fit_kwargs (dict, Optional): Keyword arguments to pass to model.fit() method.
                                           None is treated as an empty dict.

    Returns:
        event_models (dict): Fitted models dictionary. Keys - event names, Values - fitted models for the event.

    Raises:
        ValueError: if df has a column named 'C', or if a requested covariate is not a column of df.

    References:
        [1] Meir, Tomer and Gorfine, Malka, "Discrete-time Competing-Risks Regression with or without Penalization", https://arxiv.org/abs/2303.01186
    """
    if models_kwargs is not None:
        self.models_kwargs = models_kwargs
    # The signature advertises Optional[dict]; an explicit None must not reach ** unpacking.
    if model_fit_kwargs is None:
        model_fit_kwargs = {}

    if 'C' in df.columns:
        raise ValueError('C is an invalid column name, to avoid errors with categorical symbol C() in formula')
    self._validate_cols(df, event_type_col, duration_col, pid_col)
    if covariates is not None:
        cov_not_in_df = [cov for cov in covariates if cov not in df.columns]
        if cov_not_in_df:
            raise ValueError(f"Error during fit - missing covariates from df: {cov_not_in_df}")

    # Remember the duration column name: the default formula uses it as the time
    # regressor, so it is recorded on the instance for later use.
    self.duration_col = duration_col
    # Event value 0 marks right censoring and is not modelled.
    self.events = [c for c in sorted(df[event_type_col].unique()) if c != 0]
    self.covariates = [col for col in df if col not in [event_type_col, duration_col, pid_col]] \
        if covariates is None else covariates
    self.times = sorted(df[duration_col].unique())

    if not skip_expansion:
        self.expanded_df = self._expand_data(df=df, event_type_col=event_type_col, duration_col=duration_col,
                                             pid_col=pid_col)
    else:
        print('Skipping data expansion step, only use this option if the provided dataframe (df) is already correctly expanded.')
        self.expanded_df = df

    # The covariates part of the default formula is the same for every event.
    cov = ' + '.join(self.covariates)
    for event in self.events:
        _formula = f'j_{event} ~ {formula}' if formula is not None else \
            f'j_{event} ~ {cov} + C({duration_col}) -1 '
        # The per-event fitting step reads the formula from self.formula.
        self.formula = _formula
        self.event_models[event] = self._fit_event(model_fit_kwargs=model_fit_kwargs)
    return self.event_models
get_alpha_df(self)
¤
This function returns the Alpha coefficients and their Standard Errors for all the events.
Returns:
Type | Description |
---|---|
se_df (pandas.DataFrame) |
Alpha coefficients and Standard Errors Dataframe |
Source code in pydts/fitters.py
def get_alpha_df(self):
    """
    This function returns the Alpha coefficients and their Standard Errors for all the events.

    For each event, the coefficient table of the fitted model summary (summary.tables[1]) is
    parsed from its CSV form, and all rows except the last len(self.covariates) + 1 are kept.
    The cells are kept as the strings produced by the CSV export.

    Returns:
        se_df (pandas.DataFrame): Alpha coefficients and Standard Errors Dataframe
    """
    full_table = pd.DataFrame()
    for event in self.events:
        coef_csv = self.event_models[event].summary().tables[1].as_csv()
        event_table = pd.DataFrame([line.split(',') for line in coef_csv.split('\n')])
        # First CSV row holds the column titles, first CSV column holds the parameter names.
        event_table.columns = event_table.iloc[0]
        event_table = event_table.iloc[1:].set_index(event_table.columns[0])
        event_table.columns = pd.MultiIndex.from_product([[event], event_table.columns])
        # NOTE(review): this drops one row more than get_beta_SE keeps - confirm the
        # extra dropped row is intended.
        alpha_rows = event_table.iloc[:-len(self.covariates)-1]
        full_table = pd.concat([full_table, alpha_rows], axis=1)
    return full_table
get_beta_SE(self)
¤
This function returns the Beta coefficients and their Standard Errors for all the events.
Returns:
Type | Description |
---|---|
se_df (pandas.DataFrame) |
Beta coefficients and Standard Errors Dataframe |
Source code in pydts/fitters.py
def get_beta_SE(self):
    """
    This function returns the Beta coefficients and their Standard Errors for all the events.

    For each event, the coefficient table of the fitted model summary (summary.tables[1]) is
    parsed from its CSV form, and its last len(self.covariates) rows are kept.
    The cells are kept as the strings produced by the CSV export.

    Returns:
        se_df (pandas.DataFrame): Beta coefficients and Standard Errors Dataframe
    """
    full_table = pd.DataFrame()
    for event in self.events:
        coef_csv = self.event_models[event].summary().tables[1].as_csv()
        event_table = pd.DataFrame([line.split(',') for line in coef_csv.split('\n')])
        # First CSV row holds the column titles, first CSV column holds the parameter names.
        event_table.columns = event_table.iloc[0]
        event_table = event_table.iloc[1:].set_index(event_table.columns[0])
        event_table.columns = pd.MultiIndex.from_product([[event], event_table.columns])
        # Assumes the covariate rows come last in the summary table - TODO confirm.
        beta_rows = event_table.iloc[-len(self.covariates):]
        full_table = pd.concat([full_table, beta_rows], axis=1)
    return full_table
predict_hazard_jt(self, df, event, t, n_jobs=-1)
¤
This method calculates the hazard for the given event at the given time values if they were included in the training set of the event.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
df |
pd.DataFrame |
samples to predict for |
required |
event |
Union[str, int] |
event name |
required |
t |
np.array |
times to calculate the hazard for |
required |
n_jobs |
int |
number of CPUs to use; defaults to every available CPU |
-1 |
Returns:
Type | Description |
---|---|
df (pd.DataFrame) |
samples with the prediction columns |
Source code in pydts/fitters.py
def predict_hazard_jt(self,
                      df: pd.DataFrame,
                      event: Union[str, int],
                      t: Union[Iterable, int],
                      n_jobs: int = -1) -> pd.DataFrame:
    """
    This method calculates the hazard for the given event at the given time values if they were included in
    the training set of the event.

    Hazard columns are named 'hazard_j{event}_t{time}'. Times whose column already exists in df
    are not predicted again; if all of them exist, df itself is returned.

    Args:
        df (pd.DataFrame): samples to predict for
        event (Union[str, int]): event name
        t (Union[Iterable, int]): times to calculate the hazard for
        n_jobs (int): number of CPUs to use, defaults to every available CPU

    Returns:
        df (pd.DataFrame): samples with the prediction columns
    """
    t = self._validate_t(t, return_iter=True)
    assert event in self.events, \
        f"Cannot predict for event {event} - it was not included during .fit()"
    self._validate_covariates_in_df(df.head())

    # Only the times that do not have a hazard column yet need a prediction.
    missing_t = [t_i for t_i in t if f'hazard_j{event}_t{t_i}' not in df.columns]
    if not missing_t:
        return df

    # The time regressor must carry the name used at fit time; fall back to 'X'
    # (the default duration column name of fit) when none is recorded on the instance.
    duration_col = getattr(self, 'duration_col', None) or 'X'
    temp_df = df.copy()
    model = self.event_models[event]
    covariates_df = df[self.covariates]
    res = Parallel(n_jobs=n_jobs)(
        delayed(model.predict)(covariates_df.assign(**{duration_col: c})) for c in missing_t)
    temp_hazard_df = pd.concat(res, axis=1)
    temp_df[[f'hazard_j{event}_t{c_}' for c_ in missing_t]] = temp_hazard_df.values
    return temp_df
print_summary(self, summary_func='summary', summary_kwargs={})
¤
This method prints the summary of the fitted models for all the events.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
summary_func |
str, Optional |
Name of the fitted model's summary method to call ("summary" or "print_summary"). |
'summary' |
summary_kwargs |
dict, Optional |
Keyword arguments to pass to the model summary function. |
{} |
Returns:
Type | Description |
---|---|
None |
None |
Source code in pydts/fitters.py
def print_summary(self,
                  summary_func: str = "summary",
                  summary_kwargs: dict = {}) -> None:
    """
    This method prints the summary of the fitted models for all the events.

    For every (event, model) pair in self.event_models, the attribute named by summary_func is
    looked up on the model; when it exists, its result is printed under a per-event header,
    otherwise a notice is printed for that event.

    Args:
        summary_func (str, Optional): Name of the fitted model's summary method to call ("summary", "print_summary").
        summary_kwargs (dict, Optional): Keyword arguments to pass to the model summary function.

    Returns:
        None
    """
    for event_name, fitted_model in self.event_models.items():
        summarize = getattr(fitted_model, summary_func, None)
        if summarize is None:
            # The model type does not expose the requested summary method.
            print(f'Not {summary_func} function in event {event_name} model')
            continue
        print(f'\n\nModel summary for event: {event_name}')
        print(summarize(**summary_kwargs))