Comparing the Estimation Methods
We conducted a simulation study demonstrating the performance of the estimation method of Meir et al. (2022) [1] and comparing it with that of Lee et al. (2018) [2]. The PyDTS implementation was used for both methods.
The data were generated using the EventTimesSampler
with \(M=2\) competing events, \(n=20,000\) observations, a covariate vector \(Z\) of 10 covariates, and right censoring.
Failure times were generated based on the discrete-time cause-specific hazard functions
\[
\lambda_j(t \mid Z) = \frac{\exp(\alpha_{jt} + Z^{T}\beta_j)}{1 + \sum_{k=1}^{2} \exp(\alpha_{kt} + Z^{T}\beta_k)}, \qquad j = 1, 2,
\]
with
\(\alpha_{1t} = -2.5 - 0.3 \log(t)\),
\(\alpha_{2t} = -2.8 - 0.3 \log(t)\), \(t=1,\ldots,d\),
\(\beta_1 = -0.5\,(\log 0.8, \log 3, \log 3, \log 2.5, \log 4, \log 1, \log 3, \log 2, \log 2, \log 3)\),
\(\beta_2 = -0.5\,(\log 1, \log 3, \log 2, \log 1, \log 4, \log 3, \log 4, \log 3, \log 3, \log 4)\).
Censoring time for each observation was sampled from a discrete distribution with \(P(C_i = t) = \frac{0.1}{d}\) for \(t=1,\ldots,d\), and \(P(C_i = d+1) = 0.9\). As before, \(X=\min(C,T)\) and \(J\) is the event type with \(J_i=0\) (censoring) if and only if \(C_i < T_i\).
We repeated this procedure for \(d \in \{ 10, 20, 30, 40, 50 \}\). For each value of \(d\), the results are based on 10 replications.
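As a small illustration (not part of the simulation code below), the censoring distribution described above can be written down directly for any value of \(d\):

import numpy as np

# Sketch of the censoring distribution: P(C = t) = 0.1/d for t = 1,...,d,
# with the remaining mass of 0.9 placed at t = d + 1 (beyond the last observed time).
d = 10  # any value from {10, 20, 30, 40, 50}
cens_probs = (0.1 / d) * np.ones(d)
assert np.isclose(cens_probs.sum(), 0.1)  # overall censoring probability before t = d + 1 is 10%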
We show that the computational running time of our approach is shorter than that of Lee et al. (2018) for every value of \(d\), and that the improvement factor increases with \(d\).
import pandas as pd
import numpy as np
import os
from time import time
from pydts.fitters import TwoStagesFitter, DataExpansionFitter
from pydts.data_generation import EventTimesSampler
from matplotlib import pyplot as plt
pd.set_option("display.max_rows", 500)
%matplotlib inline
slicer = pd.IndexSlice
real_coef_dict = {
"alpha": {
1: lambda t: -2.5 - 0.3 * np.log(t),
2: lambda t: -2.8 - 0.3 * np.log(t)
},
"beta": {
1: -0.5*np.log([0.8, 3, 3, 2.5, 4, 1, 3, 2, 2, 3]),
2: -0.5*np.log([1, 3, 2, 1, 4, 3, 4, 3, 3, 2])
}
}
n_patients = 20000
n_cov = 10
j_events = 2
d_times = 50
d_times_list = [10, 20, 30, 40, 50]  # values of d used in the timing comparison below
k_runs = 10  # number of replications for each value of d
ets = EventTimesSampler(d_times=d_times, j_event_types=j_events)
seed = 0
covariates = [f'Z{i}' for i in range(n_cov)]
COEF_COL = ' coef '
STDERR_COL = ' std err '
patients_df = pd.DataFrame(data=np.random.uniform(0, 1, size=[n_patients, n_cov]),
                           columns=covariates)
patients_df = ets.sample_event_times(patients_df, hazard_coefs=real_coef_dict, seed=seed)
patients_df = ets.sample_independent_lof_censoring(patients_df, prob_lof_at_t=(0.1 / d_times) * np.ones_like(ets.times[:-1]))
patients_df = ets.update_event_or_lof(patients_df)
patients_df.index.name='pid'
patients_df = patients_df.reset_index()
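# Optional sanity check (illustration, not part of the original analysis):
# inspect the first sampled observations; EventTimesSampler adds the event time T,
# censoring time C, observed time X = min(C, T), and event type J alongside Z0,...,Z9.
print(patients_df.head())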
from pydts.examples_utils.plots import plot_events_occurrence
plot_events_occurrence(patients_df[patients_df['X'] != (d_times+1)])
#plot_events_occurrence(patients_df)
final_two_step = {}
final_lee = {}
for idp, d_times in enumerate(d_times_list):
    case = f'timing_d{d_times}_final_'
    two_step_timing = []
    lee_timing = []
    print(f"Starting d={d_times} times")
    for k in range(k_runs):
        try:
            # Sampling based on a different seed each time
            loop_seed = 1000*idp + k + seed
            print(f'Sampling Patients, loop seed: {loop_seed}')
            np.random.seed(loop_seed)
            ets = EventTimesSampler(d_times=d_times, j_event_types=j_events)
            patients_df = pd.DataFrame(data=np.random.uniform(0, 1, size=[n_patients, n_cov]),
                                       columns=covariates)
            patients_df = ets.sample_event_times(patients_df, hazard_coefs=real_coef_dict, seed=loop_seed)
            patients_df = ets.sample_independent_lof_censoring(patients_df, prob_lof_at_t=(0.1 / d_times) * np.ones_like(ets.times[:-1]))
            patients_df = ets.update_event_or_lof(patients_df)
            patients_df.index.name = 'pid'
            patients_df = patients_df.reset_index()

            # Two-step fitter
            new_fitter = TwoStagesFitter()
            print(f'Starting two-step: {k+1}/{k_runs}')
            two_step_start = time()
            new_fitter.fit(df=patients_df.drop(['C', 'T'], axis=1), nb_workers=1)
            two_step_end = time()
            print(f'Finished two-step: {k+1}/{k_runs}, {two_step_end-two_step_start}sec')
            two_step_timing.append(two_step_end - two_step_start)

            # Lee et al. fitter
            lee_fitter = DataExpansionFitter()
            print(f'Starting Lee: {k+1}/{k_runs}')
            lee_start = time()
            lee_fitter.fit(df=patients_df.drop(['C', 'T'], axis=1))
            lee_end = time()
            print(f'Finished Lee: {k+1}/{k_runs}, {lee_end-lee_start}sec')
            lee_timing.append(lee_end - lee_start)

            lee_alpha_ser = lee_fitter.get_alpha_df().loc[:, slicer[:, [COEF_COL, STDERR_COL]]].unstack().sort_index()
            lee_beta_ser = lee_fitter.get_beta_SE().loc[:, slicer[:, [COEF_COL, STDERR_COL]]].unstack().sort_index()

            if k == 0:
                two_step_alpha_k_results = new_fitter.alpha_df[['J', 'X', 'alpha_jt']]
                two_step_beta_k_results = new_fitter.get_beta_SE().unstack().to_frame()
                lee_alpha_k_results = lee_alpha_ser.to_frame()
                lee_beta_k_results = lee_beta_ser.to_frame()
            else:
                two_step_alpha_k_results = pd.concat([two_step_alpha_k_results, new_fitter.alpha_df['alpha_jt']], axis=1)
                two_step_beta_k_results = pd.concat([two_step_beta_k_results, new_fitter.get_beta_SE().unstack()], axis=1)
                lee_alpha_k_results = pd.concat([lee_alpha_k_results, lee_alpha_ser], axis=1)
                lee_beta_k_results = pd.concat([lee_beta_k_results, lee_beta_ser], axis=1)
        except Exception as e:
            print(f'Failed during trial {k}')
            print(e)

    two_step_alpha_k_results = two_step_alpha_k_results.set_index(['J', 'X'])
    two_step_alpha_k_results.columns = list(range(1, 1+k_runs))
    two_step_beta_k_results.columns = list(range(1, 1+k_runs))
    lee_alpha_k_results.columns = list(range(1, 1+k_runs))
    lee_beta_k_results.columns = list(range(1, 1+k_runs))
    final_two_step[d_times] = two_step_timing
    final_lee[d_times] = lee_timing
summary_df = pd.DataFrame(index=d_times_list, columns=['Lee mean', 'Lee std',
'two-step mean', 'two-step std'])
lee_results_df = pd.DataFrame(columns=d_times_list, index=range(1,k_runs+1))
two_step_results_df = pd.DataFrame(columns=d_times_list, index=range(1,k_runs+1))
for k in d_times_list:
    summary_df.loc[k, 'Lee mean'] = np.mean(final_lee[k])
    summary_df.loc[k, 'Lee std'] = np.std(final_lee[k])
    summary_df.loc[k, 'two-step mean'] = np.mean(final_two_step[k])
    summary_df.loc[k, 'two-step std'] = np.std(final_two_step[k])
    lee_results_df.loc[:, k] = final_lee[k]
    two_step_results_df.loc[:, k] = final_two_step[k]
summary_df['ratio'] = summary_df['Lee mean'] / summary_df['two-step mean']
summary_df
OUTPUT_DIR = ''  # assumed output directory for the saved figure; adjust as needed
filename = 'fitting_time_comparison.png'
fig, ax = plt.subplots(1, 1, figsize=(6,4))
ax.set_title('n=20000, p=10', fontsize=16)
flierprops = dict(marker='.', markersize=4)
lee_boxprops = dict(color='darkgreen')
lee_medianprops = dict(color='darkgreen')
two_step_boxprops = dict(color='navy')
two_step_medianprops = dict(color='navy')
ax.boxplot(lee_results_df, vert=True, positions=lee_results_df.columns, whis=1.5, flierprops=flierprops,
widths=8, boxprops=lee_boxprops, medianprops=lee_medianprops)
ax.boxplot(two_step_results_df, vert=True, positions=two_step_results_df.columns, whis=1.5, flierprops=flierprops,
widths=8, boxprops=two_step_boxprops, medianprops=two_step_medianprops)
ax.set_xlabel('Number of Discrete Times', fontsize=16)
ax.set_ylabel('Fitting Time [seconds]', fontsize=16)
ax.set_xticks(d_times_list)
ax.set_xticklabels(d_times_list)
ax.tick_params(axis='both', which='major', labelsize=14)
ax.tick_params(axis='both', which='minor', labelsize=14)
ax.set_xlim([5,55])
ax.set_ylim([0, 360])
leg = ax.legend(['Lee et al.', 'two-step'], handlelength=0, handletextpad=0)
color_l = ['darkgreen', 'navy']
for n, text in enumerate(leg.texts):
    text.set_color(color_l[n])
ax.grid(alpha=0.5)
fig.tight_layout()
if filename is not None:
    fig.savefig(os.path.join(OUTPUT_DIR, filename), dpi=300)
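Although only the timings are summarized here, the loop above also accumulates the coefficient estimates from both fitters. A minimal sketch of how these could be summarized across replications (illustrative only; it uses the two_step_beta_k_results and lee_beta_k_results DataFrames left over from the last value of \(d\)):

# Summarize the estimated regression coefficients across the k_runs replications
# (sketch; the DataFrames below hold the results for the last value of d in d_times_list).
two_step_beta_summary = pd.concat([
    two_step_beta_k_results.astype(float).mean(axis=1).rename('two-step mean'),
    two_step_beta_k_results.astype(float).std(axis=1).rename('two-step std'),
], axis=1)
lee_beta_summary = pd.concat([
    lee_beta_k_results.astype(float).mean(axis=1).rename('Lee mean'),
    lee_beta_k_results.astype(float).std(axis=1).rename('Lee std'),
], axis=1)
print(two_step_beta_summary)
print(lee_beta_summary)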
References
[1] Meir, Tomer, Gutman, Rom, and Gorfine, Malka. "PyDTS: A Python Package for Discrete-Time Survival Analysis with Competing Risks" (2022).
[2] Lee, Minjung, Feuer, Eric J., and Fine, Jason P. "On the analysis of discrete time competing risks data." Biometrics (2018). doi:10.1111/biom.12881