Cross Validation
BasePenaltyGridSearchCV
¤
This class implements K-fold cross-validation of the PenaltyGridSearch
Source code in pydts/cross_validation.py
class BasePenaltyGridSearchCV(object):
"""
This class implements K-fold cross-validation of the PenaltyGridSearch
"""
def __init__(self):
self.folds_grids = {}
self.test_pids = {}
self.global_auc = {}
self.integrated_auc = {}
self.global_bs = {}
self.integrated_bs = {}
self.TwoStagesFitter_type = 'CoxPHFitter'
def cross_validate(self,
full_df: pd.DataFrame,
l1_ratio: float,
penalizers: list,
n_splits: int = 5,
shuffle: bool = True,
seed: Union[int, None] = None,
event_type_col: str = 'J',
duration_col: str = 'X',
pid_col: str = 'pid',
twostages_fit_kwargs: dict = {'nb_workers': WORKERS},
metrics=['IBS', 'GBS', 'IAUC', 'GAUC']) -> pd.DataFrame:
"""
This method implements K-fold cross-validation using PenaltyGridSearch and full_df data.
Args:
full_df (pd.DataFrame): Data to cross validate.
l1_ratio (float): regularization ratio for the CoxPHFitter (see lifelines.fitters.coxph_fitter.CoxPHFitter documentation).
penalizers (list): penalizer options for each event (see lifelines.fitters.coxph_fitter.CoxPHFitter documentation).
n_splits (int): Number of folds, defaults to 5.
shuffle (boolean): Shuffle samples before splitting to folds. Defaults to True.
seed: Pseudo-random seed to KFold instance. Defaults to None.
event_type_col (str): The event type column name (must be a column in df),
Right-censored sample (i) is indicated by event value 0, df.loc[i, event_type_col] = 0.
duration_col (str): Last follow up time column name (must be a column in full_df).
pid_col (str): Sample ID column name (must be a column in full_df).
twostages_fit_kwargs (dict): keyword arguments to pass to each TwoStagesFitter.
metrics (str, list): Evaluation metrics. Available metrics:
'IAUC': Integrated AUC (will be in PenaltyGridSearchCV.integrated_auc),
'GAUC': Global AUC (will be in PenaltyGridSearchCV.global_auc).
'IBS': Integrated Brier Score (will be in PenaltyGridSearchCV.integrated_bs),
'GBS': Global Brier Score (will be in PenaltyGridSearchCV.global_bs).
Returns:
gauc_output_df (pd.DataFrame): Global AUC k-fold mean and standard error for all possible combination of the penalizers.
"""
if isinstance(metrics, str):
metrics = [metrics]
self.folds_grids = {}
self.kfold_cv = KFold(n_splits=n_splits, shuffle=shuffle, random_state=seed)
if 'C' in full_df.columns:
full_df = full_df.drop(['C'], axis=1)
if 'T' in full_df.columns:
full_df = full_df.drop(['T'], axis=1)
for i_fold, (train_index, test_index) in enumerate(self.kfold_cv.split(full_df)):
print(f'Starting fold {i_fold+1}/{n_splits}')
start = time()
self.test_pids[i_fold] = full_df.iloc[test_index][pid_col].values
train_df, test_df = full_df.iloc[train_index], full_df.iloc[test_index]
if self.TwoStagesFitter_type == 'Exact':
fold_pgs = PenaltyGridSearchExact()
else:
fold_pgs = PenaltyGridSearch()
fold_pgs.evaluate(train_df=train_df,
test_df=test_df,
l1_ratio=l1_ratio,
penalizers=penalizers,
metrics=metrics,
seed=seed,
event_type_col=event_type_col,
duration_col=duration_col,
pid_col=pid_col,
twostages_fit_kwargs=twostages_fit_kwargs)
self.folds_grids[i_fold] = fold_pgs
for metric in metrics:
if metric == 'GAUC':
self.global_auc[i_fold] = fold_pgs.convert_results_dict_to_df(fold_pgs.global_auc)
elif metric == 'IAUC':
self.integrated_auc[i_fold] = fold_pgs.convert_results_dict_to_df(fold_pgs.integrated_auc)
elif metric == 'GBS':
self.global_bs[i_fold] = fold_pgs.convert_results_dict_to_df(fold_pgs.global_bs)
elif metric == 'IBS':
self.integrated_bs[i_fold] = fold_pgs.convert_results_dict_to_df(fold_pgs.integrated_bs)
end = time()
print(f'Finished fold {i_fold+1}/{n_splits}, {int(end-start)} seconds')
if 'GAUC' in metrics:
res = [v for k, v in self.global_auc.items()]
gauc_output_df = pd.concat([pd.concat(res, axis=1).mean(axis=1),
pd.concat(res, axis=1).std(axis=1)],
keys=['Mean', 'SE'], axis=1)
else:
gauc_output_df = pd.DataFrame()
return gauc_output_df
cross_validate(self, full_df, l1_ratio, penalizers, n_splits=5, shuffle=True, seed=None, event_type_col='J', duration_col='X', pid_col='pid', twostages_fit_kwargs={'nb_workers': 2}, metrics=['IBS', 'GBS', 'IAUC', 'GAUC'])
¤
This method implements K-fold cross-validation using PenaltyGridSearch and full_df data.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
full_df |
pd.DataFrame |
Data to cross validate. |
required |
l1_ratio |
float |
regularization ratio for the CoxPHFitter (see lifelines.fitters.coxph_fitter.CoxPHFitter documentation). |
required |
penalizers |
list |
penalizer options for each event (see lifelines.fitters.coxph_fitter.CoxPHFitter documentation). |
required |
n_splits |
int |
Number of folds, defaults to 5. |
5 |
shuffle |
boolean |
Shuffle samples before splitting to folds. Defaults to True. |
True |
seed |
Optional[int] |
Pseudo-random seed to KFold instance. Defaults to None. |
None |
event_type_col |
str |
The event type column name (must be a column in df), Right-censored sample (i) is indicated by event value 0, df.loc[i, event_type_col] = 0. |
'J' |
duration_col |
str |
Last follow up time column name (must be a column in full_df). |
'X' |
pid_col |
str |
Sample ID column name (must be a column in full_df). |
'pid' |
twostages_fit_kwargs |
dict |
keyword arguments to pass to each TwoStagesFitter. |
{'nb_workers': 2} |
metrics |
str, list |
Evaluation metrics. Available metrics: 'IAUC': Integrated AUC (will be in PenaltyGridSearchCV.integrated_auc), 'GAUC': Global AUC (will be in PenaltyGridSearchCV.global_auc). 'IBS': Integrated Brier Score (will be in PenaltyGridSearchCV.integrated_bs), 'GBS': Global Brier Score (will be in PenaltyGridSearchCV.global_bs). |
['IBS', 'GBS', 'IAUC', 'GAUC'] |
Returns:
Type | Description |
---|---|
gauc_output_df (pd.DataFrame) |
Global AUC k-fold mean and standard error for all possible combination of the penalizers. |
Source code in pydts/cross_validation.py
def cross_validate(self,
full_df: pd.DataFrame,
l1_ratio: float,
penalizers: list,
n_splits: int = 5,
shuffle: bool = True,
seed: Union[int, None] = None,
event_type_col: str = 'J',
duration_col: str = 'X',
pid_col: str = 'pid',
twostages_fit_kwargs: dict = {'nb_workers': WORKERS},
metrics=['IBS', 'GBS', 'IAUC', 'GAUC']) -> pd.DataFrame:
"""
This method implements K-fold cross-validation using PenaltyGridSearch and full_df data.
Args:
full_df (pd.DataFrame): Data to cross validate.
l1_ratio (float): regularization ratio for the CoxPHFitter (see lifelines.fitters.coxph_fitter.CoxPHFitter documentation).
penalizers (list): penalizer options for each event (see lifelines.fitters.coxph_fitter.CoxPHFitter documentation).
n_splits (int): Number of folds, defaults to 5.
shuffle (boolean): Shuffle samples before splitting to folds. Defaults to True.
seed: Pseudo-random seed to KFold instance. Defaults to None.
event_type_col (str): The event type column name (must be a column in df),
Right-censored sample (i) is indicated by event value 0, df.loc[i, event_type_col] = 0.
duration_col (str): Last follow up time column name (must be a column in full_df).
pid_col (str): Sample ID column name (must be a column in full_df).
twostages_fit_kwargs (dict): keyword arguments to pass to each TwoStagesFitter.
metrics (str, list): Evaluation metrics. Available metrics:
'IAUC': Integrated AUC (will be in PenaltyGridSearchCV.integrated_auc),
'GAUC': Global AUC (will be in PenaltyGridSearchCV.global_auc).
'IBS': Integrated Brier Score (will be in PenaltyGridSearchCV.integrated_bs),
'GBS': Global Brier Score (will be in PenaltyGridSearchCV.global_bs).
Returns:
gauc_output_df (pd.DataFrame): Global AUC k-fold mean and standard error for all possible combination of the penalizers.
"""
if isinstance(metrics, str):
metrics = [metrics]
self.folds_grids = {}
self.kfold_cv = KFold(n_splits=n_splits, shuffle=shuffle, random_state=seed)
if 'C' in full_df.columns:
full_df = full_df.drop(['C'], axis=1)
if 'T' in full_df.columns:
full_df = full_df.drop(['T'], axis=1)
for i_fold, (train_index, test_index) in enumerate(self.kfold_cv.split(full_df)):
print(f'Starting fold {i_fold+1}/{n_splits}')
start = time()
self.test_pids[i_fold] = full_df.iloc[test_index][pid_col].values
train_df, test_df = full_df.iloc[train_index], full_df.iloc[test_index]
if self.TwoStagesFitter_type == 'Exact':
fold_pgs = PenaltyGridSearchExact()
else:
fold_pgs = PenaltyGridSearch()
fold_pgs.evaluate(train_df=train_df,
test_df=test_df,
l1_ratio=l1_ratio,
penalizers=penalizers,
metrics=metrics,
seed=seed,
event_type_col=event_type_col,
duration_col=duration_col,
pid_col=pid_col,
twostages_fit_kwargs=twostages_fit_kwargs)
self.folds_grids[i_fold] = fold_pgs
for metric in metrics:
if metric == 'GAUC':
self.global_auc[i_fold] = fold_pgs.convert_results_dict_to_df(fold_pgs.global_auc)
elif metric == 'IAUC':
self.integrated_auc[i_fold] = fold_pgs.convert_results_dict_to_df(fold_pgs.integrated_auc)
elif metric == 'GBS':
self.global_bs[i_fold] = fold_pgs.convert_results_dict_to_df(fold_pgs.global_bs)
elif metric == 'IBS':
self.integrated_bs[i_fold] = fold_pgs.convert_results_dict_to_df(fold_pgs.integrated_bs)
end = time()
print(f'Finished fold {i_fold+1}/{n_splits}, {int(end-start)} seconds')
if 'GAUC' in metrics:
res = [v for k, v in self.global_auc.items()]
gauc_output_df = pd.concat([pd.concat(res, axis=1).mean(axis=1),
pd.concat(res, axis=1).std(axis=1)],
keys=['Mean', 'SE'], axis=1)
else:
gauc_output_df = pd.DataFrame()
return gauc_output_df
BaseTwoStagesCV
¤
This class implements K-fold cross-validation using TwoStagesFitters and TwoStagesFittersExact
Source code in pydts/cross_validation.py
class BaseTwoStagesCV(object):
"""
This class implements K-fold cross-validation using TwoStagesFitters and TwoStagesFittersExact
"""
def __init__(self):
self.models = {}
self.test_pids = {}
self.results = pd.DataFrame()
self.global_auc = {}
self.integrated_auc = {}
self.global_bs = {}
self.integrated_bs = {}
self.TwoStagesFitter_type = 'CoxPHFitter'
def cross_validate(self,
full_df: pd.DataFrame,
n_splits: int = 5,
shuffle: bool = True,
seed: Union[int, None] = None,
fit_beta_kwargs: dict = {},
covariates=None,
event_type_col: str = 'J',
duration_col: str = 'X',
pid_col: str = 'pid',
x0: Union[np.array, int] = 0,
verbose: int = 2,
nb_workers: int = WORKERS,
metrics=['BS', 'IBS', 'GBS', 'AUC', 'IAUC', 'GAUC']):
"""
This method implements K-fold cross-validation using TwoStagesFitters and full_df data.
Args:
full_df (pd.DataFrame): Data to cross validate.
n_splits (int): Number of folds, defaults to 5.
shuffle (boolean): Shuffle samples before splitting to folds. Defaults to True.
seed: Pseudo-random seed to KFold instance. Defaults to None.
fit_beta_kwargs (dict, Optional): Keyword arguments to pass on to the estimation procedure.
If different model for beta is desired, it can be defined here.
covariates (list): list of covariates to be used in estimating the regression coefficients.
event_type_col (str): The event type column name (must be a column in df),
Right-censored sample (i) is indicated by event value 0, df.loc[i, event_type_col] = 0.
duration_col (str): Last follow up time column name (must be a column in full_df).
pid_col (str): Sample ID column name (must be a column in full_df).
x0 (Union[numpy.array, int], Optional): initial guess to pass to scipy.optimize.minimize function
verbose (int, Optional): The verbosity level of pandaallel
nb_workers (int, Optional): The number of workers to pandaallel. If not sepcified, defaults to the number of workers available.
metrics (str, list): Evaluation metrics. Available metrics:
'AUC': AUC at t (will be added to TwoStagesCV.results),
'IAUC': Integrated AUC (will be in TwoStagesCV.integrated_auc),
'GAUC': Global AUC (will be in TwoStagesCV.global_auc).
'BS': Brier score at t (will be added to TwoStagesCV.results),
'IBS': Integrated Brier Score (will be in TwoStagesCV.integrated_bs),
'GBS': Global Brier Score (will be in TwoStagesCV.global_bs).
Returns:
Results (pd.DataFrame): Cross validation metrics results
"""
if isinstance(metrics, str):
metrics = [metrics]
self.models = {}
self.kfold_cv = KFold(n_splits=n_splits, shuffle=shuffle, random_state=seed)
if 'C' in full_df.columns:
full_df = full_df.drop(['C'], axis=1)
if 'T' in full_df.columns:
full_df = full_df.drop(['T'], axis=1)
for i_fold, (train_index, test_index) in enumerate(self.kfold_cv.split(full_df)):
self.test_pids[i_fold] = full_df.iloc[test_index][pid_col].values
train_df, test_df = full_df.iloc[train_index], full_df.iloc[test_index]
if self.TwoStagesFitter_type == 'Exact':
fold_fitter = TwoStagesFitterExact()
else:
fold_fitter = TwoStagesFitter()
print(f'Fitting fold {i_fold+1}/{n_splits}')
fold_fitter.fit(df=train_df,
covariates=covariates,
event_type_col=event_type_col,
duration_col=duration_col,
pid_col=pid_col,
x0=x0,
fit_beta_kwargs=fit_beta_kwargs,
verbose=verbose,
nb_workers=nb_workers)
#self.models[i_fold] = deepcopy(fold_fitter)
self.models[i_fold] = fold_fitter
pred_df = self.models[i_fold].predict_prob_events(test_df)
for metric in metrics:
if metric == 'IAUC':
self.integrated_auc[i_fold] = events_integrated_auc(pred_df, event_type_col=event_type_col,
duration_col=duration_col)
elif metric == 'GAUC':
self.global_auc[i_fold] = global_auc(pred_df, event_type_col=event_type_col,
duration_col=duration_col)
elif metric == 'IBS':
self.integrated_bs[i_fold] = events_integrated_brier_score(pred_df, event_type_col=event_type_col,
duration_col=duration_col)
elif metric == 'GBS':
self.global_bs[i_fold] = global_brier_score(pred_df, event_type_col=event_type_col,
duration_col=duration_col)
elif metric == 'AUC':
tmp_res = events_auc_at_t(pred_df, event_type_col=event_type_col,
duration_col=duration_col)
tmp_res = pd.concat([tmp_res], keys=[i_fold], names=['fold'])
tmp_res = pd.concat([tmp_res], keys=[metric], names=['metric'])
self.results = pd.concat([self.results, tmp_res], axis=0)
elif metric == 'BS':
tmp_res = events_brier_score_at_t(pred_df, event_type_col=event_type_col,
duration_col=duration_col)
tmp_res = pd.concat([tmp_res], keys=[i_fold], names=['fold'])
tmp_res = pd.concat([tmp_res], keys=[metric], names=['metric'])
self.results = pd.concat([self.results, tmp_res], axis=0)
return self.results
cross_validate(self, full_df, n_splits=5, shuffle=True, seed=None, fit_beta_kwargs={}, covariates=None, event_type_col='J', duration_col='X', pid_col='pid', x0=0, verbose=2, nb_workers=2, metrics=['BS', 'IBS', 'GBS', 'AUC', 'IAUC', 'GAUC'])
¤
This method implements K-fold cross-validation using TwoStagesFitters and full_df data.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
full_df |
pd.DataFrame |
Data to cross validate. |
required |
n_splits |
int |
Number of folds, defaults to 5. |
5 |
shuffle |
boolean |
Shuffle samples before splitting to folds. Defaults to True. |
True |
seed |
Optional[int] |
Pseudo-random seed to KFold instance. Defaults to None. |
None |
fit_beta_kwargs |
dict, Optional |
Keyword arguments to pass on to the estimation procedure. If different model for beta is desired, it can be defined here. |
{} |
covariates |
list |
list of covariates to be used in estimating the regression coefficients. |
None |
event_type_col |
str |
The event type column name (must be a column in df), Right-censored sample (i) is indicated by event value 0, df.loc[i, event_type_col] = 0. |
'J' |
duration_col |
str |
Last follow up time column name (must be a column in full_df). |
'X' |
pid_col |
str |
Sample ID column name (must be a column in full_df). |
'pid' |
x0 |
Union[numpy.array, int], Optional |
initial guess to pass to scipy.optimize.minimize function |
0 |
verbose |
int, Optional |
The verbosity level of pandaallel |
2 |
nb_workers |
int, Optional |
The number of workers to pandaallel. If not sepcified, defaults to the number of workers available. |
2 |
metrics |
str, list |
Evaluation metrics. Available metrics: 'AUC': AUC at t (will be added to TwoStagesCV.results), 'IAUC': Integrated AUC (will be in TwoStagesCV.integrated_auc), 'GAUC': Global AUC (will be in TwoStagesCV.global_auc). 'BS': Brier score at t (will be added to TwoStagesCV.results), 'IBS': Integrated Brier Score (will be in TwoStagesCV.integrated_bs), 'GBS': Global Brier Score (will be in TwoStagesCV.global_bs). |
['BS', 'IBS', 'GBS', 'AUC', 'IAUC', 'GAUC'] |
Returns:
Type | Description |
---|---|
Results (pd.DataFrame) |
Cross validation metrics results |
Source code in pydts/cross_validation.py
def cross_validate(self,
full_df: pd.DataFrame,
n_splits: int = 5,
shuffle: bool = True,
seed: Union[int, None] = None,
fit_beta_kwargs: dict = {},
covariates=None,
event_type_col: str = 'J',
duration_col: str = 'X',
pid_col: str = 'pid',
x0: Union[np.array, int] = 0,
verbose: int = 2,
nb_workers: int = WORKERS,
metrics=['BS', 'IBS', 'GBS', 'AUC', 'IAUC', 'GAUC']):
"""
This method implements K-fold cross-validation using TwoStagesFitters and full_df data.
Args:
full_df (pd.DataFrame): Data to cross validate.
n_splits (int): Number of folds, defaults to 5.
shuffle (boolean): Shuffle samples before splitting to folds. Defaults to True.
seed: Pseudo-random seed to KFold instance. Defaults to None.
fit_beta_kwargs (dict, Optional): Keyword arguments to pass on to the estimation procedure.
If different model for beta is desired, it can be defined here.
covariates (list): list of covariates to be used in estimating the regression coefficients.
event_type_col (str): The event type column name (must be a column in df),
Right-censored sample (i) is indicated by event value 0, df.loc[i, event_type_col] = 0.
duration_col (str): Last follow up time column name (must be a column in full_df).
pid_col (str): Sample ID column name (must be a column in full_df).
x0 (Union[numpy.array, int], Optional): initial guess to pass to scipy.optimize.minimize function
verbose (int, Optional): The verbosity level of pandaallel
nb_workers (int, Optional): The number of workers to pandaallel. If not sepcified, defaults to the number of workers available.
metrics (str, list): Evaluation metrics. Available metrics:
'AUC': AUC at t (will be added to TwoStagesCV.results),
'IAUC': Integrated AUC (will be in TwoStagesCV.integrated_auc),
'GAUC': Global AUC (will be in TwoStagesCV.global_auc).
'BS': Brier score at t (will be added to TwoStagesCV.results),
'IBS': Integrated Brier Score (will be in TwoStagesCV.integrated_bs),
'GBS': Global Brier Score (will be in TwoStagesCV.global_bs).
Returns:
Results (pd.DataFrame): Cross validation metrics results
"""
if isinstance(metrics, str):
metrics = [metrics]
self.models = {}
self.kfold_cv = KFold(n_splits=n_splits, shuffle=shuffle, random_state=seed)
if 'C' in full_df.columns:
full_df = full_df.drop(['C'], axis=1)
if 'T' in full_df.columns:
full_df = full_df.drop(['T'], axis=1)
for i_fold, (train_index, test_index) in enumerate(self.kfold_cv.split(full_df)):
self.test_pids[i_fold] = full_df.iloc[test_index][pid_col].values
train_df, test_df = full_df.iloc[train_index], full_df.iloc[test_index]
if self.TwoStagesFitter_type == 'Exact':
fold_fitter = TwoStagesFitterExact()
else:
fold_fitter = TwoStagesFitter()
print(f'Fitting fold {i_fold+1}/{n_splits}')
fold_fitter.fit(df=train_df,
covariates=covariates,
event_type_col=event_type_col,
duration_col=duration_col,
pid_col=pid_col,
x0=x0,
fit_beta_kwargs=fit_beta_kwargs,
verbose=verbose,
nb_workers=nb_workers)
#self.models[i_fold] = deepcopy(fold_fitter)
self.models[i_fold] = fold_fitter
pred_df = self.models[i_fold].predict_prob_events(test_df)
for metric in metrics:
if metric == 'IAUC':
self.integrated_auc[i_fold] = events_integrated_auc(pred_df, event_type_col=event_type_col,
duration_col=duration_col)
elif metric == 'GAUC':
self.global_auc[i_fold] = global_auc(pred_df, event_type_col=event_type_col,
duration_col=duration_col)
elif metric == 'IBS':
self.integrated_bs[i_fold] = events_integrated_brier_score(pred_df, event_type_col=event_type_col,
duration_col=duration_col)
elif metric == 'GBS':
self.global_bs[i_fold] = global_brier_score(pred_df, event_type_col=event_type_col,
duration_col=duration_col)
elif metric == 'AUC':
tmp_res = events_auc_at_t(pred_df, event_type_col=event_type_col,
duration_col=duration_col)
tmp_res = pd.concat([tmp_res], keys=[i_fold], names=['fold'])
tmp_res = pd.concat([tmp_res], keys=[metric], names=['metric'])
self.results = pd.concat([self.results, tmp_res], axis=0)
elif metric == 'BS':
tmp_res = events_brier_score_at_t(pred_df, event_type_col=event_type_col,
duration_col=duration_col)
tmp_res = pd.concat([tmp_res], keys=[i_fold], names=['fold'])
tmp_res = pd.concat([tmp_res], keys=[metric], names=['metric'])
self.results = pd.concat([self.results, tmp_res], axis=0)
return self.results