Skip to content

Model Selection

BasePenaltyGridSearch ¤

This class implements the penalty parameter grid search.

Source code in pydts/model_selection.py
class BasePenaltyGridSearch(object):

    """
    This class implements the penalty parameter grid search.

    One model is fitted per penalizer value; the per-event pieces of those models are then
    recombined for every penalizer combination across the events, and each combination is
    scored on the test data with the requested metrics.
    """

    def __init__(self):
        # Settings of the most recent evaluate() call
        self.l1_ratio = None
        self.penalizers = []
        self.seed = None
        # penalizer value -> fitter estimated with that penalizer
        self.meta_models = {}
        # NOTE(review): train_df / test_df are initialised here but not assigned anywhere in this class
        self.train_df = None
        self.test_df = None
        # penalizers combination (tuple) -> metric result
        self.global_auc = {}
        self.integrated_auc = {}
        self.global_bs = {}
        self.integrated_bs = {}
        # 'Exact' selects TwoStagesFitterExact, any other value selects TwoStagesFitter
        self.TwoStagesFitter_type = 'CoxPHFitter'

    def evaluate(self,
                 train_df: pd.DataFrame,
                 test_df: pd.DataFrame,
                 l1_ratio: float,
                 penalizers: list,
                 metrics: Union[list, str] = ['IBS', 'GBS', 'IAUC', 'GAUC'],
                 seed: Union[None, int] = None,
                 event_type_col: str = 'J',
                 duration_col: str = 'X',
                 pid_col: str = 'pid',
                 twostages_fit_kwargs: dict = {}) -> tuple:

        """
        This function implements model estimation using train_df and evaluation of the metrics using test_df for all the possible combinations of penalizers.

        Args:
            train_df (pd.DataFrame): training data for fitting the model.
            test_df (pd.DataFrame): testing data for evaluating the estimated model's performance.
            l1_ratio (float): regularization ratio for the CoxPHFitter (see lifelines.fitters.coxph_fitter.CoxPHFitter documentation).
            penalizers (list): penalizer options for each event (see lifelines.fitters.coxph_fitter.CoxPHFitter documentation).
            metrics (str, list): Evaluation metrics, either a list of names or a single name given as a string. Available metrics:
                                                    'IAUC': Integrated AUC (will be in PenaltyGridSearch.integrated_auc),
                                                    'GAUC': Global AUC (will be in PenaltyGridSearch.global_auc).
                                                    'IBS': Integrated Brier Score (will be in PenaltyGridSearch.integrated_bs),
                                                    'GBS': Global Brier Score (will be in PenaltyGridSearch.global_bs).
                                 Names that are not listed above are ignored.
            seed (int): pseudo random seed number for numpy.random.seed()
            event_type_col (str): The event type column name (must be a column in df),
                                  Right-censored sample (i) is indicated by event value 0, df.loc[i, event_type_col] = 0.
            duration_col (str): Last follow up time column name (must be a column in df).
            pid_col (str): Sample ID column name (must be a column in df).
            twostages_fit_kwargs (dict): keyword arguments to pass to the TwoStagesFitter.

        Returns:
            output (Tuple): Penalizers with best performance in terms of Global-AUC, if 'GAUC' is in metrics.
                            An empty list is returned when 'GAUC' was not requested or when no Global-AUC result is available.

        """

        # A single metric name may be passed as a plain string. Without wrapping it, the loop
        # below would iterate over its characters and silently compute nothing.
        if isinstance(metrics, str):
            metrics = [metrics]

        self.l1_ratio = l1_ratio
        self.penalizers = penalizers
        self.seed = seed
        np.random.seed(seed)

        # Stage 1: estimate one model per penalizer value
        for idp, penalizer in enumerate(penalizers):

            fit_beta_kwargs = self._get_model_fit_kwargs(penalizer, l1_ratio)

            if self.TwoStagesFitter_type == 'Exact':
                self.meta_models[penalizer] = TwoStagesFitterExact()
            else:
                self.meta_models[penalizer] = TwoStagesFitter()
            print(f"Started estimating the coefficients for penalizer {penalizer} ({idp+1}/{len(penalizers)})")
            start = time()
            self.meta_models[penalizer].fit(df=train_df, fit_beta_kwargs=fit_beta_kwargs,
                                            pid_col=pid_col, event_type_col=event_type_col, duration_col=duration_col,
                                            **twostages_fit_kwargs)
            end = time()
            print(f"Finished estimating the coefficients for penalizer {penalizer} ({idp+1}/{len(penalizers)}), {int(end - start)} seconds")

        # Stage 2: every event may take any of the penalizers, so the grid is the cartesian
        # product of the penalizers list with itself, once per event (0 marks censoring).
        events = [j for j in sorted(train_df[event_type_col].unique()) if j != 0]
        grid = [penalizers for _ in events]
        penalizers_combinations = list(product(*grid))

        for combination in penalizers_combinations:
            mixed_two_stages = self.get_mixed_two_stages_fitter(combination)

            pred_df = mixed_two_stages.predict_prob_events(test_df)

            for metric in metrics:
                if metric == 'IAUC':
                    self.integrated_auc[combination] = events_integrated_auc(pred_df, event_type_col=event_type_col,
                                                                             duration_col=duration_col)
                elif metric == 'GAUC':
                    self.global_auc[combination] = global_auc(pred_df, event_type_col=event_type_col,
                                                              duration_col=duration_col)
                elif metric == 'IBS':
                    self.integrated_bs[combination] = events_integrated_brier_score(pred_df,
                                                                                    event_type_col=event_type_col,
                                                                                    duration_col=duration_col)
                elif metric == 'GBS':
                    self.global_bs[combination] = global_brier_score(pred_df, event_type_col=event_type_col,
                                                                     duration_col=duration_col)

        # Nothing to rank when GAUC was not requested or no combination was scored
        # (e.g. an empty penalizers list); building the results frame would fail on an empty dict.
        if 'GAUC' not in metrics or not self.global_auc:
            return []
        output = self.convert_results_dict_to_df(self.global_auc).idxmax().values[0]
        return output

    def convert_results_dict_to_df(self, results_dict):
        """
        This function converts a results dictionary to a pd.DataFrame format.
        Args:
            results_dict: one of the class attributes: global_auc, integrated_auc, global_bs, integrated_bs.
                          Its keys (penalizers combinations) become the rows' MultiIndex.

        Returns:
            df (pd.DataFrame): Results in a pd.DataFrame format.
        """
        df = pd.DataFrame(results_dict.values(), index=pd.MultiIndex.from_tuples(results_dict.keys()))
        return df

    def get_mixed_two_stages_fitter(self, penalizers_combination: list) -> TwoStagesFitter:
        """
        This function creates a mixed TwoStagesFitter from the estimated meta models for a specific penalizers combination.

        Args:
            penalizers_combination (list): List whose length equals the number of competing events, giving the penalizer value for each event
                                           (position i is used for the i-th event in sorted order).
                                           Each of the values must be one of the values that was previously passed to the evaluate() method.

        Returns:
            mixed_two_stages (pydts.fitters.TwoStagesFitter): TwoStagesFitter for the required penalty combination.

        Raises:
            AssertionError: if any value in penalizers_combination has no estimated meta model.
        """
        _validate_estimated_value = [p for p in penalizers_combination if p not in list(self.meta_models.keys())]
        assert len(_validate_estimated_value) == 0, \
               f"Values {_validate_estimated_value} were note estimated. All the penalizers in penalizers_combination must be estimated using evaluate() before a mixed combination can be generated."

        events = self.meta_models[penalizers_combination[0]].events
        event_type_col = self.meta_models[penalizers_combination[0]].event_type_col
        if self.TwoStagesFitter_type == 'Exact':
            mixed_two_stages = TwoStagesFitterExact()
        else:
            mixed_two_stages = TwoStagesFitter()

        for ide, event in enumerate(sorted(events)):
            # Meta model estimated with the penalizer chosen for this event
            event_meta_model = self.meta_models[penalizers_combination[ide]]

            if ide == 0:
                # Shared metadata is copied once, from the first event's meta model
                mixed_two_stages.covariates = event_meta_model.covariates
                mixed_two_stages.duration_col = event_meta_model.duration_col
                mixed_two_stages.event_type_col = event_meta_model.event_type_col
                mixed_two_stages.events = event_meta_model.events
                mixed_two_stages.pid_col = event_meta_model.pid_col
                mixed_two_stages.times = event_meta_model.times

            mixed_two_stages.beta_models[event] = event_meta_model.beta_models[event]
            mixed_two_stages.event_models[event] = []
            mixed_two_stages.event_models[event].append(event_meta_model.beta_models[event])

            # Keep only this event's rows of the meta model's alpha_df
            event_alpha = event_meta_model.alpha_df.copy()
            event_alpha = event_alpha[event_alpha[event_type_col] == event]
            mixed_two_stages.alpha_df = pd.concat([mixed_two_stages.alpha_df, event_alpha])
            mixed_two_stages.event_models[event].append(event_alpha)

        return mixed_two_stages

    def _get_model_fit_kwargs(self, penalizer, l1_ratio):
        """
        Build the fit_beta_kwargs dictionary passed to the fitter's fit() method.

        Args:
            penalizer: penalizer value for the model being estimated.
            l1_ratio: regularization ratio.

        Returns:
            fit_beta_kwargs (dict): for the 'Exact' fitter type, {'model_fit_kwargs': {'alpha': ..., 'L1_wt': ...}};
                                    otherwise {'model_kwargs': {'penalizer': ..., 'l1_ratio': ...}}.
        """
        if self.TwoStagesFitter_type == 'Exact':
            # NOTE(review): 'alpha' / 'L1_wt' are presumably the exact fitter's regularized-fit argument names - confirm
            fit_beta_kwargs = {
                'model_fit_kwargs': {
                    'alpha': penalizer,
                    'L1_wt': l1_ratio
                }
            }
        else:
            fit_beta_kwargs = {
                'model_kwargs': {
                    'penalizer': penalizer,
                    'l1_ratio': l1_ratio
                },
            }
        return fit_beta_kwargs

convert_results_dict_to_df(self, results_dict) ¤

This function converts a results dictionary to a pd.DataFrame format.

Parameters:

Name Type Description Default
results_dict

one of the class attributes: global_auc, integrated_auc, global_bs, integrated_bs.

required

Returns:

Type Description
df (pd.DataFrame)

Results in a pd.DataFrame format.

Source code in pydts/model_selection.py
def convert_results_dict_to_df(self, results_dict):
    """
    Turn a metric results dictionary into a pd.DataFrame.

    Args:
        results_dict: one of the class attributes: global_auc, integrated_auc, global_bs, integrated_bs.
                      The keys are used as MultiIndex tuples and the values as the corresponding rows.

    Returns:
        df (pd.DataFrame): Results in a pd.DataFrame format, one row per key of results_dict.
    """
    # One index entry per dictionary key (each key is a tuple)
    row_index = pd.MultiIndex.from_tuples(results_dict.keys())
    row_values = results_dict.values()
    return pd.DataFrame(row_values, index=row_index)

evaluate(self, train_df, test_df, l1_ratio, penalizers, metrics=['IBS', 'GBS', 'IAUC', 'GAUC'], seed=None, event_type_col='J', duration_col='X', pid_col='pid', twostages_fit_kwargs={}) ¤

This function estimates a model on train_df for each penalizer and evaluates the requested metrics on test_df for all possible combinations of penalizers across the events.

Parameters:

Name Type Description Default
train_df pd.DataFrame

training data for fitting the model.

required
test_df pd.DataFrame

testing data for evaluating the estimated model's performance.

required
l1_ratio float

regularization ratio for the CoxPHFitter (see lifelines.fitters.coxph_fitter.CoxPHFitter documentation).

required
penalizers list

penalizer options for each event (see lifelines.fitters.coxph_fitter.CoxPHFitter documentation).

required
metrics str, list

Evaluation metrics. Available metrics: 'IAUC': Integrated AUC (will be in PenaltyGridSearch.integrated_auc), 'GAUC': Global AUC (will be in PenaltyGridSearch.global_auc). 'IBS': Integrated Brier Score (will be in PenaltyGridSearch.integrated_bs), 'GBS': Global Brier Score (will be in PenaltyGridSearch.global_bs).

['IBS', 'GBS', 'IAUC', 'GAUC']
seed int

pseudo random seed number for numpy.random.seed()

None
event_type_col str

The event type column name (must be a column in df), Right-censored sample (i) is indicated by event value 0, df.loc[i, event_type_col] = 0.

'J'
duration_col str

Last follow up time column name (must be a column in df).

'X'
pid_col str

Sample ID column name (must be a column in df).

'pid'
twostages_fit_kwargs dict

keyword arguments to pass to the TwoStagesFitter.

{}

Returns:

Type Description
output (Tuple)

Penalizers with best performance in terms of Global-AUC, if 'GAUC' is in metrics.

Source code in pydts/model_selection.py
def evaluate(self,
             train_df: pd.DataFrame,
             test_df: pd.DataFrame,
             l1_ratio: float,
             penalizers: list,
             metrics: Union[list, str] = ['IBS', 'GBS', 'IAUC', 'GAUC'],
             seed: Union[None, int] = None,
             event_type_col: str = 'J',
             duration_col: str = 'X',
             pid_col: str = 'pid',
             twostages_fit_kwargs: dict = {}) -> tuple:

    """
    This function implements model estimation using train_df and evaluation of the metrics using test_df for all the possible combinations of penalizers.

    Args:
        train_df (pd.DataFrame): training data for fitting the model.
        test_df (pd.DataFrame): testing data for evaluating the estimated model's performance.
        l1_ratio (float): regularization ratio for the CoxPHFitter (see lifelines.fitters.coxph_fitter.CoxPHFitter documentation).
        penalizers (list): penalizer options for each event (see lifelines.fitters.coxph_fitter.CoxPHFitter documentation).
        metrics (str, list): Evaluation metrics, either a list of names or a single name given as a string. Available metrics:
                                                'IAUC': Integrated AUC (will be in PenaltyGridSearch.integrated_auc),
                                                'GAUC': Global AUC (will be in PenaltyGridSearch.global_auc).
                                                'IBS': Integrated Brier Score (will be in PenaltyGridSearch.integrated_bs),
                                                'GBS': Global Brier Score (will be in PenaltyGridSearch.global_bs).
                             Names that are not listed above are ignored.
        seed (int): pseudo random seed number for numpy.random.seed()
        event_type_col (str): The event type column name (must be a column in df),
                              Right-censored sample (i) is indicated by event value 0, df.loc[i, event_type_col] = 0.
        duration_col (str): Last follow up time column name (must be a column in df).
        pid_col (str): Sample ID column name (must be a column in df).
        twostages_fit_kwargs (dict): keyword arguments to pass to the TwoStagesFitter.

    Returns:
        output (Tuple): Penalizers with best performance in terms of Global-AUC, if 'GAUC' is in metrics.
                        An empty list is returned when 'GAUC' was not requested or when no Global-AUC result is available.

    """

    # A single metric name may be passed as a plain string. Without wrapping it, the loop
    # below would iterate over its characters and silently compute nothing.
    if isinstance(metrics, str):
        metrics = [metrics]

    self.l1_ratio = l1_ratio
    self.penalizers = penalizers
    self.seed = seed
    np.random.seed(seed)

    # Stage 1: estimate one model per penalizer value
    for idp, penalizer in enumerate(penalizers):

        fit_beta_kwargs = self._get_model_fit_kwargs(penalizer, l1_ratio)

        if self.TwoStagesFitter_type == 'Exact':
            self.meta_models[penalizer] = TwoStagesFitterExact()
        else:
            self.meta_models[penalizer] = TwoStagesFitter()
        print(f"Started estimating the coefficients for penalizer {penalizer} ({idp+1}/{len(penalizers)})")
        start = time()
        self.meta_models[penalizer].fit(df=train_df, fit_beta_kwargs=fit_beta_kwargs,
                                        pid_col=pid_col, event_type_col=event_type_col, duration_col=duration_col,
                                        **twostages_fit_kwargs)
        end = time()
        print(f"Finished estimating the coefficients for penalizer {penalizer} ({idp+1}/{len(penalizers)}), {int(end - start)} seconds")

    # Stage 2: every event may take any of the penalizers, so the grid is the cartesian
    # product of the penalizers list with itself, once per event (0 marks censoring).
    events = [j for j in sorted(train_df[event_type_col].unique()) if j != 0]
    grid = [penalizers for _ in events]
    penalizers_combinations = list(product(*grid))

    for combination in penalizers_combinations:
        mixed_two_stages = self.get_mixed_two_stages_fitter(combination)

        pred_df = mixed_two_stages.predict_prob_events(test_df)

        for metric in metrics:
            if metric == 'IAUC':
                self.integrated_auc[combination] = events_integrated_auc(pred_df, event_type_col=event_type_col,
                                                                         duration_col=duration_col)
            elif metric == 'GAUC':
                self.global_auc[combination] = global_auc(pred_df, event_type_col=event_type_col,
                                                          duration_col=duration_col)
            elif metric == 'IBS':
                self.integrated_bs[combination] = events_integrated_brier_score(pred_df,
                                                                                event_type_col=event_type_col,
                                                                                duration_col=duration_col)
            elif metric == 'GBS':
                self.global_bs[combination] = global_brier_score(pred_df, event_type_col=event_type_col,
                                                                 duration_col=duration_col)

    # Nothing to rank when GAUC was not requested or no combination was scored
    # (e.g. an empty penalizers list); building the results frame would fail on an empty dict.
    if 'GAUC' not in metrics or not self.global_auc:
        return []
    output = self.convert_results_dict_to_df(self.global_auc).idxmax().values[0]
    return output

get_mixed_two_stages_fitter(self, penalizers_combination) ¤

This function creates a mixed TwoStagesFitter from the estimated meta models for a specific penalizers combination.

Parameters:

Name Type Description Default
penalizers_combination list

List whose length equals the number of competing events, giving the penalizer value for each event. Each value must be one of the values previously passed to the evaluate() method.

required

Returns:

Type Description
mixed_two_stages (pydts.fitters.TwoStagesFitter)

TwoStagesFitter for the required penalty combination.

Source code in pydts/model_selection.py
def get_mixed_two_stages_fitter(self, penalizers_combination: list) -> TwoStagesFitter:
    """
    Assemble one fitter whose per-event models are taken from the already estimated meta models.

    Args:
        penalizers_combination (list): One penalizer value per competing event; position i is used for the i-th event in sorted order.
                                       Every value must be a key of self.meta_models, i.e. a value previously passed to the evaluate() method.

    Returns:
        mixed_two_stages (pydts.fitters.TwoStagesFitter): TwoStagesFitter for the required penalty combination.

    Raises:
        AssertionError: if any value in penalizers_combination has no estimated meta model.
    """
    estimated_penalizers = list(self.meta_models.keys())
    missing = [p for p in penalizers_combination if p not in estimated_penalizers]
    assert len(missing) == 0, \
           f"Values {missing} were note estimated. All the penalizers in penalizers_combination must be estimated using evaluate() before a mixed combination can be generated."

    # Events and the event column name are read from the first penalizer's meta model
    first_meta_model = self.meta_models[penalizers_combination[0]]
    events = first_meta_model.events
    event_type_col = first_meta_model.event_type_col

    fitter_cls = TwoStagesFitterExact if self.TwoStagesFitter_type == 'Exact' else TwoStagesFitter
    mixed_two_stages = fitter_cls()

    shared_attributes = ('covariates', 'duration_col', 'event_type_col', 'events', 'pid_col', 'times')

    for ide, event in enumerate(sorted(events)):
        # Meta model estimated with the penalizer chosen for this event
        source_model = self.meta_models[penalizers_combination[ide]]

        if ide == 0:
            # Shared metadata is copied once, on the first iteration
            for attribute in shared_attributes:
                setattr(mixed_two_stages, attribute, getattr(source_model, attribute))

        beta_model = source_model.beta_models[event]
        mixed_two_stages.beta_models[event] = beta_model

        # Keep only this event's rows of the source model's alpha_df
        alpha_copy = source_model.alpha_df.copy()
        event_alpha = alpha_copy[alpha_copy[event_type_col] == event]
        mixed_two_stages.alpha_df = pd.concat([mixed_two_stages.alpha_df, event_alpha])

        # event_models[event] holds the beta model followed by the event's alpha rows
        mixed_two_stages.event_models[event] = [beta_model, event_alpha]

    return mixed_two_stages